diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dfe1e3848ee5a..4e1b2ecf3671a 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,7 +32,7 @@ repos: description: Format files with ClangFormat. entry: bash ./tools/codestyle/clang_format.hook -i language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps)$ exclude: | (?x)^( paddle/fluid/distributed/ps/thirdparty/round_robin.h @@ -81,64 +81,3 @@ repos: - id: cmakelint args: [--config=./tools/codestyle/.cmakelintrc] # exclude files which need to be fixed - exclude: | - (?x)^( - CMakeLists.txt| - python/paddle/fluid/tests/unittests/CMakeLists.txt| - paddle/fluid/inference/tests/infer_ut/CMakeLists.txt| - cmake/configure.cmake| - paddle/fluid/inference/api/demo_ci/CMakeLists.txt| - cmake/flags.cmake| - cmake/inference_lib.cmake| - cmake/external/protobuf.cmake| - paddle/fluid/framework/fleet/CMakeLists.txt| - paddle/fluid/inference/CMakeLists.txt| - paddle/fluid/inference/tests/api/CMakeLists.txt| - paddle/fluid/operators/CMakeLists.txt| - cmake/external/lite.cmake| - cmake/external/poplar.cmake| - cmake/python_module.cmake| - python/paddle/fluid/tests/unittests/asp/CMakeLists.txt| - cmake/cuda.cmake| - cmake/FindNumPy.cmake| - cmake/coveralls.cmake| - cmake/external/glog.cmake| - cmake/external/onnxruntime.cmake| - cmake/external/openblas.cmake| - cmake/external/xpu.cmake| - cmake/hip.cmake| - paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt| - paddle/fluid/inference/api/CMakeLists.txt| - paddle/fluid/operators/controlflow/CMakeLists.txt| - python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt| - cmake/operators.cmake| - cmake/tensorrt.cmake| - paddle/fluid/inference/api/details/CMakeLists.txt| - python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt| - cmake/external/arm_brpc.cmake| - cmake/external/concurrentqueue.cmake| - cmake/external/eigen.cmake| - cmake/external/mklml.cmake| - cmake/external/paddle2onnx.cmake| - cmake/miopen.cmake| - cmake/nccl.cmake| - cmake/simd.cmake| - paddle/fluid/inference/analysis/CMakeLists.txt| - paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake| - paddle/fluid/memory/allocation/CMakeLists.txt| - paddle/fluid/memory/CMakeLists.txt| - paddle/fluid/operators/cinn/CMakeLists.txt| - paddle/infrt/external_kernels/CMakeLists.txt| - paddle/infrt/kernel/phi/CMakeLists.txt| - python/paddle/fluid/contrib/slim/tests/CMakeLists.txt| - python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt| - python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt| - python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt| - python/paddle/fluid/tests/unittests/fft/CMakeLists.txt| - python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt| - python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt| - python/paddle/fluid/tests/unittests/npu/CMakeLists.txt| - python/paddle/fluid/tests/unittests/ps/CMakeLists.txt| - python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt| - python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt - )$ diff --git a/CMakeLists.txt b/CMakeLists.txt index a3e0b64e97b25..ea4bc8a2d6c3e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,10 +16,10 @@ if(APPLE AND WITH_ARM) # cmake 3.19.2 version starts to support M1 cmake_minimum_required(VERSION 3.19.2) cmake_policy(VERSION 3.19.2) -else(APPLE AND WITH_ARM) +else() cmake_minimum_required(VERSION 3.15) cmake_policy(VERSION 3.10) -endif(APPLE AND WITH_ARM) +endif() # use to get_property location of 
static lib # https://cmake.org/cmake/help/v3.0/policy/CMP0026.html?highlight=cmp0026 cmake_policy(SET CMP0026 OLD) @@ -152,7 +152,7 @@ if(WIN32) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") endif() - endforeach(flag_var) + endforeach() endif() # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally @@ -179,10 +179,10 @@ if(WIN32) math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endif() - endforeach(flag_var) + endforeach() foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") - endforeach(flag_var) + endforeach() # Windows Remove /Zi, /ZI for Release, MinSizeRel builds foreach(flag_var @@ -191,7 +191,7 @@ if(WIN32) if(${flag_var} MATCHES "/Z[iI]") string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") endif() - endforeach(flag_var) + endforeach() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838" @@ -207,7 +207,7 @@ if(WIN32) if(MSVC_STATIC_CRT) set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") endif() - endforeach(flag_var) + endforeach() if(WITH_WIN_DUMP_DBG) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") @@ -216,16 +216,16 @@ if(WIN32) foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF") - endforeach(flag_var) + endforeach() add_definitions("-DWITH_WIN_DUMP_DBG") endif() -else(WIN32) +else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations" ) -endif(WIN32) +endif() find_package(Git REQUIRED) @@ -430,7 +430,7 @@ endif() if(WITH_ROCM) include(hip) include(miopen) # set miopen libraries, must before configure -endif(WITH_ROCM) +endif() if(WITH_XPU_KP) include(xpu_kp) diff --git a/README.md b/README.md index 048a273a7d78b..e44378310f726 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ English | [简体中文](./README_cn.md) Welcome to the PaddlePaddle GitHub. PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. -PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers, 157,000 companies and generating 476,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. +PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4.7 million developers, 180,000 companies and generating 560,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. ## Installation @@ -85,7 +85,7 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc. 
- QQ discussion group: 441226485 (PaddlePaddle). -- [Forums](https://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc. +- [Forums](https://aistudio.baidu.com/paddle/forum): discuss implementations, research, etc. ## Courses diff --git a/README_cn.md b/README_cn.md index 3834ee148f940..8f0caab285e07 100644 --- a/README_cn.md +++ b/README_cn.md @@ -15,11 +15,11 @@ 欢迎来到 PaddlePaddle GitHub -飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者406万,服务企业15.7万家,基于飞桨开源深度学习平台产生了47.6万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 +飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者477万,服务企业18万家,基于飞桨开源深度学习平台产生了56万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 ## 安装 -### PaddlePaddle最新版本: [v2.2](https://github.com/PaddlePaddle/Paddle/tree/release/2.2) +### PaddlePaddle最新版本: [v2.3](https://github.com/PaddlePaddle/Paddle/tree/release/2.3) 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) @@ -83,7 +83,7 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议 - QQ群: 441226485 (PaddlePaddle) -- [论坛](https://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 +- [论坛](https://aistudio.baidu.com/paddle/forum): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 ## 课程 diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake index fc7cdb8c1923c..b2d36f0cb01ba 100644 --- a/cmake/FindNumPy.cmake +++ b/cmake/FindNumPy.cmake @@ -26,7 +26,7 @@ if(PYTHON_EXECUTABLE) OUTPUT_VARIABLE NUMPY_PATH) elseif(_numpy_out) message(STATUS "Python executable not found.") -endif(PYTHON_EXECUTABLE) +endif() find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}") @@ -35,7 +35,7 @@ if(PYTHON_NUMPY_INCLUDE_DIR) set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found") -endif(PYTHON_NUMPY_INCLUDE_DIR) +endif() include(FindPackageHandleStandardArgs) find_package_handle_standard_args(NumPy DEFAULT_MSG PYTHON_NUMPY_INCLUDE_DIR) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 91464b84ef029..f84bb15d5922b 100755 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -14,19 +14,19 @@ if(NOT WITH_PYTHON) add_definitions(-DPADDLE_NO_PYTHON) -endif(NOT WITH_PYTHON) +endif() if(WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING) -endif(WITH_TESTING) +endif() if(WITH_INFERENCE_API_TEST) add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST) -endif(WITH_INFERENCE_API_TEST) +endif() if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) -endif(NOT WITH_PROFILER) +endif() if(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) @@ -60,8 +60,8 @@ if(WIN32) FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA." 
) - endif(NOT MSVC) -endif(WIN32) + endif() +endif() if(WITH_MUSL) add_definitions(-DPADDLE_WITH_MUSL) @@ -195,9 +195,9 @@ if(WITH_MKLML AND MKLML_IOMP_LIB) if(WIN32) # openmp not support well for now on windows set(OPENMP_FLAGS "") - else(WIN32) + else() set(OPENMP_FLAGS "-fopenmp") - endif(WIN32) + endif() set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") @@ -221,15 +221,15 @@ endif() if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) -endif(WITH_BRPC_RDMA) +endif() if(ON_INFER) add_definitions(-DPADDLE_ON_INFERENCE) -endif(ON_INFER) +endif() if(WITH_CRYPTO) add_definitions(-DPADDLE_WITH_CRYPTO) -endif(WITH_CRYPTO) +endif() if(WITH_CUSTOM_DEVICE AND NOT WIN32) add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake index 02c1a136280f7..9c28903498729 100644 --- a/cmake/coveralls.cmake +++ b/cmake/coveralls.cmake @@ -96,7 +96,7 @@ if(WITH_COVERAGE) if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1) list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH}) endif() - endforeach(TMP_PATH) + endforeach() endforeach() # convert to absolute path diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index aa958786cb8f4..87b943abd0106 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -274,7 +274,7 @@ set(CMAKE_CUDA_STANDARD 14) # So replace /W[1-4] with /W0 if(WIN32) string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") -endif(WIN32) +endif() # in cuda9, suppress cuda warning on eigen set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w") # Set :expt-relaxed-constexpr to suppress Eigen warnings @@ -293,7 +293,7 @@ if(WIN32) if(${flag_var} MATCHES "-MD") string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}") endif() - endforeach(flag_var) + endforeach() endif() endif() diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 443b7aa7d56b7..31280a768b3a8 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -25,7 +25,8 @@ if(WIN32) elseif(LINUX) if(WITH_ROCM) # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ + # which will cause compiler error of using __host__ funciont + # in __host__ __device__ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index a9942a6bca67b..df1b827ed1824 100755 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -28,12 +28,12 @@ if(WIN32) CACHE FILEPATH "glog library." FORCE) set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") add_definitions("/DGOOGLE_GLOG_DLL_DECL=") -else(WIN32) +else() set(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) set(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -endif(WIN32) +endif() include_directories(${GLOG_INCLUDE_DIR}) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 90d61f47a52e8..14a8298790799 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -32,7 +32,8 @@ if(WIN32) set(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) else() #TODO(intel-huying): - # Now enable csrmm function in mklml library temporarily, it will be updated as offical version later. 
+ # Now enable csrmm function in mklml library temporarily, + # it will be updated as offical version later. set(MKLML_VER "csrmm_mklml_lnx_2019.0.5" CACHE STRING "" FORCE) @@ -51,8 +52,9 @@ message(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") set(MKLML_PREFIX_DIR ${THIRD_PARTY_PATH}/mklml) set(MKLML_SOURCE_DIR ${THIRD_PARTY_PATH}/mklml/src/extern_mklml) -# Ninja Generator can not establish the correct dependency relationship between the imported library with target, -# the product file in the ExternalProject need to be specified manually, please refer to +# Ninja Generator can not establish the correct dependency relationship +# between the imported library with target, the product file +# in the ExternalProject need to be specified manually, please refer to # https://stackoverflow.com/questions/54866067/cmake-and-ninja-missing-and-no-known-rule-to-make-it # It is the same to all other ExternalProject. ExternalProject_Add( diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 1cccfb86f4208..a93121e95c4e7 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -58,7 +58,7 @@ if(NOT WIN32) UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_BYPRODUCTS ${CBLAS_LIBRARIES}) -else(NOT WIN32) +else() set(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) @@ -92,4 +92,4 @@ else(NOT WIN32) BUILD_BYPRODUCTS ${CBLAS_LIBRARIES}) set(OPENBLAS_SHARED_LIB ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX}) -endif(NOT WIN32) +endif() diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 75e2c42cb5a29..96f24bfc8a5bb 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -69,7 +69,7 @@ else() set(PADDLE2ONNX_COMPILE_LIB "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so" CACHE FILEPATH "paddle2onnx compile library." FORCE) -endif(WIN32) +endif() if(WIN32) set(PADDLE2ONNX_URL diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7c5de92362db4..6f9078c8eeecd 100755 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -16,7 +16,7 @@ include(ExternalProject) # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp if(NOT WIN32) find_package(Protobuf QUIET) -endif(NOT WIN32) +endif() unset_var(PROTOBUF_INCLUDE_DIR) unset_var(PROTOBUF_FOUND) @@ -147,7 +147,7 @@ set(PROTOBUF_ROOT CACHE PATH "Folder contains protobuf") if(WIN32) set(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) -endif(WIN32) +endif() if(NOT "${PROTOBUF_ROOT}" STREQUAL "") find_path( @@ -349,4 +349,4 @@ if(NOT PROTOBUF_FOUND) # `protoc.exe` existed before calling it. 
set(EXTERN_PROTOBUF_DEPEND extern_protobuf) prompt_protobuf_lib(extern_protobuf) -endif(NOT PROTOBUF_FOUND) +endif() diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index af27500398f57..7d1cca4feb6a6 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -134,9 +134,9 @@ if(WITH_XPU_BKCL) set(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") include_directories(${XPU_BKCL_INC_DIR}) target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}) -else(WITH_XPU_BKCL) +else() target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) -endif(WITH_XPU_BKCL) +endif() add_dependencies(xpulib ${XPU_PROJECT}) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e3c5545df8b27..41a7b4a9d1cce 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -113,10 +113,10 @@ check_type_size(pthread_spinlock_t SPINLOCK_FOUND) check_type_size(pthread_barrier_t BARRIER_FOUND) if(SPINLOCK_FOUND) add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK) -endif(SPINLOCK_FOUND) +endif() if(BARRIER_FOUND) add_definitions(-DPADDLE_USE_PTHREAD_BARRIER) -endif(BARRIER_FOUND) +endif() set(CMAKE_EXTRA_INCLUDE_FILES "") # Only one sanitizer is allowed in compile time @@ -180,7 +180,7 @@ if(NOT WIN32) -Wno-parentheses # Warning in Eigen gcc 8.3 ) endif() - endif(NOT APPLE) + endif() set(GPU_COMMON_FLAGS -fPIC @@ -200,21 +200,21 @@ if(NOT WIN32) AND NOT WITH_MIPS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() -endif(NOT WIN32) +endif() if(APPLE) if(WITH_ARM) set(CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Build architectures for OSX" FORCE) - else(WITH_ARM) + else() set(CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) - endif(WITH_ARM) + endif() # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 set(COMMON_FLAGS -Wno-deprecated-register) -endif(APPLE) +endif() if(WITH_HETERPS AND WITH_PSLIB) set(COMMON_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0 ${COMMON_FLAGS}) @@ -224,7 +224,7 @@ endif() if(LINUX) set(GPU_COMMON_FLAGS -Wall -Wextra -Werror ${GPU_COMMON_FLAGS}) -endif(LINUX) +endif() foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 3514882c944de..44e9e2ee8ccaf 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -112,7 +112,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) list(APPEND HIP_CXX_FLAGS -g2) list(APPEND HIP_CXX_FLAGS -O0) list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) -endif(CMAKE_BUILD_TYPE MATCHES Debug) +endif() set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS}) set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index a8e3696418bd4..56345373dbe8c 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -59,14 +59,14 @@ function(copy TARGET) POST_BUILD COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst}) - else(WIN32) #not windows + else() #not windows add_custom_command( TARGET ${TARGET} POST_BUILD COMMAND mkdir -p "${dst}" COMMAND cp -r "${src}" "${dst}" COMMENT "copying ${src} -> ${dst}") - endif(WIN32) # not windows + endif() # not windows endforeach() endfunction() @@ -265,7 +265,7 @@ if(WIN32) DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) -else(WIN32) +else() set(paddle_inference_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_inference.*) copy( @@ -273,7 +273,7 @@ else(WIN32) SRCS 
${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib} DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) -endif(WIN32) +endif() copy( inference_lib_dist @@ -350,11 +350,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) -else(WIN32) +else() set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.* ) -endif(WIN32) +endif() copy( inference_lib_dist @@ -436,7 +436,7 @@ set(module "platform") set(platform_lib_deps profiler_proto errors) if(WITH_GPU) set(platform_lib_deps ${platform_lib_deps} external_error_proto) -endif(WITH_GPU) +endif() add_dependencies(fluid_lib_dist ${platform_lib_deps}) copy( diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake index 392ff0401eaef..13d9563e3abdd 100644 --- a/cmake/miopen.cmake +++ b/cmake/miopen.cmake @@ -65,10 +65,9 @@ macro(find_miopen_version miopen_header_file) math(EXPR MIOPEN_VERSION "${MIOPEN_MAJOR_VERSION} * 1000 + ${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}") message( - STATUS - "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " - "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. " - ) + STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " + "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.\ + ${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. ") endif() endmacro() diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index 8ce3cd91ac82a..eaa7bd23fd9b2 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -50,10 +50,8 @@ if(WITH_NCCL) endif() add_definitions("-DNCCL_VERSION_CODE=$NCCL_VERSION") - message( - STATUS - "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. " - "Current NCCL version is v${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}.${NCCL_PATCH_VERSION} " - ) + message(STATUS "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. " + "Current NCCL version is \ + v${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}.${NCCL_PATCH_VERSION} ") endif() endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 4e0cc1027eff0..e8d7ba1401ebe 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -217,7 +217,7 @@ function(op_library TARGET) return() endif() endforeach() - endif(WIN32) + endif() # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. 
if(WITH_UNITY_BUILD AND op_library_UNITY) diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake index 9367435b61b55..47e449c9dadb1 100644 --- a/cmake/python_module.cmake +++ b/cmake/python_module.cmake @@ -22,8 +22,8 @@ function(find_python_module module) set(PY_${module_upper} ${_${module}_location} CACHE STRING "Location of Python module ${module}") - endif(NOT _${module}_status) - endif(NOT PY_${module_upper}) + endif() + endif() find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper}) if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED) message(FATAL_ERROR "python module ${module} is not found") @@ -39,7 +39,7 @@ function(find_python_module module) set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING "Version of Python module ${module}") - endif(NOT _${module}_status) + endif() set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} @@ -47,4 +47,4 @@ function(find_python_module module) set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE) -endfunction(find_python_module) +endfunction() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index ff8b9d6f9a9b4..3d730657062a0 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -81,10 +81,10 @@ check_cxx_source_runs( #include int main() { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; + __m256 a = _mm256_set_ps(-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps(a, b); + return 0; }" AVX_FOUND) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 0cfc82709637f..21da7a0560ee3 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -14,7 +14,7 @@ if(WITH_DISTRIBUTE) DEPS phi_api eager_api gloo_wrapper) endif() -if(WITH_NCCL) +if(WITH_NCCL OR WITH_RCCL) cc_library( processgroup_nccl SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index f38ce8faa7ffb..5f1da003313ad 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -14,7 +14,13 @@ #pragma once +#ifdef PADDLE_WITH_CUDA #include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif + #include #include @@ -23,9 +29,19 @@ #include "paddle/fluid/distributed/collective/Types.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/variable.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" +#endif + #include "paddle/fluid/platform/device_context.h" + +#ifdef PADDLE_WITH_RCCL +#include "paddle/fluid/platform/dynload/rccl.h" +#else #include "paddle/fluid/platform/dynload/nccl.h" +#endif + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -56,7 +72,11 @@ class EventManager { ~EventManager() { if (is_created_) { platform::CUDADeviceGuard guard(device_index_); +#ifdef PADDLE_WITH_HIP + hipEventDestroy(event_); +#else cudaEventDestroy(event_); +#endif } } @@ -94,24 +114,42 @@ class EventManager { device_index, device_index_)); platform::CUDADeviceGuard guard(device_index_); +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, 
ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream())); +#endif } bool Query() const { +#ifdef PADDLE_WITH_HIP + gpuError_t err = hipEventQuery(event_); + if (err == hipSuccess) { + return true; + } + if (err == hipErrorNotReady) { + return false; + } +#else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { return true; - } else if (err == cudaErrorNotReady) { - return false; - } else { - PADDLE_ENFORCE_GPU_SUCCESS(err); + } + if (err == cudaErrorNotReady) { return false; } +#endif + PADDLE_ENFORCE_GPU_SUCCESS(err); + return false; } void Synchronize() const { if (is_created_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); +#else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); +#endif } } @@ -124,12 +162,22 @@ class EventManager { "Event's device %d", device_index, device_index_)); platform::CUDADeviceGuard guard(device_index_); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0)); +#else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0)); +#endif } } private: +#ifdef PADDLE_WITH_HIP + unsigned int flags_ = hipEventDefault; +#else unsigned int flags_ = cudaEventDefault; +#endif + bool is_created_{false}; gpuEvent_t event_{}; int8_t device_index_{0}; @@ -138,7 +186,13 @@ class EventManager { void CreateEvent(int device_index) { device_index_ = device_index; platform::CUDADeviceGuard guard(device_index); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_)); +#else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_)); +#endif + is_created_ = true; } }; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index dc67205c78f56..793f8dacbf8d4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -95,7 +95,11 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // If we use the work to do barrier, we should block cpu for (auto& place : places_) { platform::CUDADeviceGuard gpuGuard(place); +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#endif } } return true; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 2325e645b4c46..c56f75b46518c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -30,8 +30,13 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/NCCLTools.h" +#endif + +#ifdef PADDLE_WITH_RCCL +#include "paddle/fluid/platform/dynload/rccl.h" +#else #include "paddle/fluid/platform/dynload/nccl.h" #endif diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 73baf21015833..e92e1e12b8991 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1152,7 +1152,8 @@ static std::string GenerateGradNodeCreationContent( size_t bwd_in_slot_num = out_vars.size(); size_t bwd_out_slot_num = in_vars.size(); const char* 
GRAD_OP_NODE_TEMPLATE = - " auto grad_node = std::shared_ptr(new GradNode%s(%d, " + " auto grad_node = std::shared_ptr<%sGradNodeCompat>(new " + "%sGradNodeCompat(%d, " "%d));\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += @@ -2080,10 +2081,8 @@ static std::string GenerateSingleOpBase( generated_grad_function_body += " paddle::small_vector, " "egr::kSlotSmallVectorSize> " + - hooked_grads + - " = " - "GradNode" + - fwd_op_type + "::ApplyGradientHooks(grads);\n"; + hooked_grads + " = " + fwd_op_type + + "GradNodeCompat::ApplyGradientHooks(grads);\n"; // [Generation] Get Ins Map std::unordered_set dispensable_input_name_set; @@ -2547,7 +2546,7 @@ static std::string GenerateGradNodeCCContents( */ const char* EAGER_LOG_TEMPLATE = - " VLOG(3) << \"Running Eager Backward Node: GradNode%s\";\n"; + " VLOG(3) << \"Running Eager Backward Node: %sGradNodeCompat\";\n"; std::string generated_grad_function_body = paddle::string::Sprintf(EAGER_LOG_TEMPLATE, fwd_op_type); @@ -2616,7 +2615,7 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "paddle::small_vector, " "egr::kSlotSmallVectorSize> " - "GradNode%s::operator()(" + "%sGradNodeCompat::operator()(" "paddle::small_vector, " "egr::kSlotSmallVectorSize>& grads, bool " "create_graph, bool is_new_grad) {\n" @@ -2645,14 +2644,15 @@ static std::string GenerateGradNodeHeaderContents( VLOG(6) << "Generating Grad Node Header"; const char* GRAD_NODE_TEMPLATE = - "class GradNode%s : public egr::GradNodeBase {\n" + "class %sGradNodeCompat : public egr::GradNodeBase {\n" " public:\n" - " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct " - "GradNode%s \"; }\n" - " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " + " %sGradNodeCompat() : egr::GradNodeBase() { VLOG(7) << \" Construct " + "%sGradNodeCompat \"; }\n" + " %sGradNodeCompat(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" " - "Construct GradNode%s \"; }\n" - " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" + "Construct %sGradNodeCompat \"; }\n" + " ~%sGradNodeCompat() override { VLOG(6) << \" Destruct " + "%sGradNodeCompat \"; }\n" "\n" " virtual " "paddle::small_vector, " @@ -2667,11 +2667,11 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" " SetIsTensorWrappersCleared(true);\n" " }\n" - " std::string name() override { return \"GradNode%sMid\"; } \n " + " std::string name() override { return \"%sGradNodeCompat\"; } \n " "\n" "std::shared_ptr Copy() const override {{\n " - " auto copied_node = std::shared_ptr(new " - "GradNode%s(*this));\n " + " auto copied_node = std::shared_ptr<%sGradNodeCompat>(new " + "%sGradNodeCompat(*this));\n " " return copied_node;\n " "}}\n " "\n" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 87b2ff986dc92..dee3b3d79a2e7 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -147,7 +147,18 @@ def RemoveConstAndReference(string): def GetGradNodeName(string): - return f"GradNode{string}Final" + + def str2Hump(text): + arr = filter(None, text.split('_')) + res = '' + for i in arr: + res = res + i[0].upper() + i[1:] + return res + + string = str2Hump(string) + if string.rfind("Grad") == (len(string) - 4): + string = string[:-4] + return f"{string}GradNodeFinal" def GetDygraphForwardFunctionName(string): @@ -335,6 +346,7 @@ def ParseYamlInplaceInfo(string): ### Generator Base ### ######################## class FunctionGeneratorBase: + def __init__(self, forward_api_contents, namespace): self.forward_api_contents = forward_api_contents self.namespace = namespace @@ -357,7 +369,7 @@ def __init__(self, forward_api_contents, namespace): # Special Op Attributes self.optional_inputs = [] #[name, ...] self.no_need_buffers = [] #[name, ...] - self.intermediate_outputs = [] #[name, ...] + self.intermediate_outputs = [] #[name, ...] self.forward_inplace_map = {} #{name : name, ...} def ParseForwardInplaceInfo(self): @@ -423,8 +435,9 @@ def DetermineForwardPositionMap(self, forward_inputs_list, input_type = forward_input[1] input_pos = forward_input[2] - self.forward_inputs_position_map[ - input_name] = [input_type, input_pos] + self.forward_inputs_position_map[input_name] = [ + input_type, input_pos + ] for i in range(len(forward_returns_list)): forward_return = forward_returns_list[i] @@ -432,11 +445,13 @@ def DetermineForwardPositionMap(self, forward_inputs_list, return_type = forward_return[1] return_pos = forward_return[2] - self.forward_outputs_position_map[ - return_name] = [return_type, return_pos] + self.forward_outputs_position_map[return_name] = [ + return_type, return_pos + ] class GeneratorBase: + def __init__(self, api_yaml_path): self.namespace = "" self.api_yaml_path = api_yaml_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d23d71b07626d..c0feecd2e9e20 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -411,6 +411,7 @@ def GenerateCoreOpInfoDefinition(): ## Generator Class ## ##################### class DygraphFunctionGeneratorBase(FunctionGeneratorBase): + def __init__(self, forward_api_contents, grad_api_contents, namespace): self.forward_api_contents = forward_api_contents # Members from Parent: @@ -532,8 +533,8 @@ def ForwardsValidationCheck(self): max_input_position = max(max_input_position, pos) for _, _, _, pos in forward_attrs_list: - assert pos > max_input_position, AssertMessage(pos, - max_input_position) + assert pos > max_input_position, AssertMessage( + pos, max_input_position) def BackwardValidationCheck(self): backward_forward_inputs_map = self.backward_forward_inputs_map @@ -678,7 +679,7 @@ def GenerateNodeCreationCodes(self): # Node Construction num_backward_inputs = len(forward_outputs_position_map.keys()) num_backward_outputs = len(forward_inputs_position_map.keys()) - grad_node_name = GetGradNodeName(forward_api_name) + grad_node_name = GetGradNodeName(self.backward_api_name) # Helper 
indent = GetIndent(2) @@ -845,6 +846,7 @@ def run(self): class DygraphForwardFunctionGenerator(DygraphFunctionGeneratorBase): + def __init__(self, forward_api_contents, grad_api_contents, namespace): DygraphFunctionGeneratorBase.__init__(self, forward_api_contents, grad_api_contents, namespace) @@ -947,12 +949,12 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): if is_inplaced and len(forward_outputs_position_map) == 1: api_out_type = "auto&" forward_call_str = f"{indent}{api_out_type} api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});" - num_outputs = len(forward_outputs_position_map.keys()) - len( - intermediate_outputs) + num_outputs = len( + forward_outputs_position_map.keys()) - len(intermediate_outputs) # Check Nan and Inf - check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(function_name, - "api_result") + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format( + function_name, "api_result") # Get Outputs get_outputs_str = "" @@ -1007,8 +1009,8 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): if pos == corresponding_pos: has_corresponding_grad_output = True if has_corresponding_grad_output or ( - name in forward_inplace_map and - forward_api_name not in inplace_check_blacklist): + name in forward_inplace_map + and forward_api_name not in inplace_check_blacklist): input_autograd_meta_name = GetAutoGradMetaName(name) if IsPlainTensorType(ttype): input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" @@ -1116,17 +1118,20 @@ def UpdateCoreOpsInformation(self, is_inplaced): forward_outputs_position_map = self.forward_outputs_position_map forward_attrs_list = self.forward_attrs_list - num_args = len(forward_inputs_position_map.keys()) + len( - forward_attrs_list) + num_args = len( + forward_inputs_position_map.keys()) + len(forward_attrs_list) num_returns = len(forward_outputs_position_map.keys()) final_state_fwd_api_name = "final_state_" + forward_api_name - core_ops_returns_info[ - final_state_fwd_api_name] = ["" for i in range(num_returns)] - core_ops_args_info[ - final_state_fwd_api_name] = ["" for i in range(num_args)] - core_ops_args_type_info[ - final_state_fwd_api_name] = ["" for i in range(num_args)] + core_ops_returns_info[final_state_fwd_api_name] = [ + "" for i in range(num_returns) + ] + core_ops_args_info[final_state_fwd_api_name] = [ + "" for i in range(num_args) + ] + core_ops_args_type_info[final_state_fwd_api_name] = [ + "" for i in range(num_args) + ] for name, (ttype, pos) in forward_inputs_position_map.items(): core_ops_args_info[final_state_fwd_api_name][pos] = name @@ -1159,6 +1164,7 @@ def run(self): class DygraphNodeGenerator(DygraphFunctionGeneratorBase): + def __init__(self, forward_api_contents, grad_api_contents, @@ -1167,7 +1173,7 @@ def __init__(self, DygraphFunctionGeneratorBase.__init__(self, forward_api_contents, grad_api_contents, namespace) - # Record name mapping from forward_api_name to grad_api_names + # Record name mapping from forward_var_name to grad_var_names self.to_next_grad_name_mapping = {} # {name : name} # Generated Results @@ -1281,7 +1287,7 @@ def GenerateNodeDeclaration(self): attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( RemoveConstAndReference(atype), saved_attr_name) - grad_node_name = GetGradNodeName(forward_op_name) + grad_node_name = GetGradNodeName(self.backward_api_name) self.node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, 
grad_node_name, grad_node_name, grad_node_name, clear_tensor_wrapper_str, grad_node_name, @@ -1447,8 +1453,8 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, {indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str});""" # Check Nan and Inf - check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(backward_api_name, - "returns") + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format( + backward_api_name, "returns") # Prepare for Node Creation if Necessary inputs_autograd_meta_str = "" @@ -1533,7 +1539,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" - grad_node_name = GetGradNodeName(forward_api_name) + grad_node_name = GetGradNodeName(self.backward_api_name) self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( grad_node_name, fill_zero_str, get_grad_in_args_str, grad_node_name, @@ -1560,6 +1566,7 @@ def run(self): class DygraphForwardAndNodesGenerator(GeneratorBase): + def __init__(self, api_yaml_path, backward_yaml_path): # Parent members: # self.namespace @@ -1617,9 +1624,10 @@ def GenerateCode(self): next_grad_api_contents = self.GetBackwardAPIContents( backward_api_contents) - node_generator = DygraphNodeGenerator( - forward_api_contents, backward_api_contents, namespace, - next_grad_api_contents) + node_generator = DygraphNodeGenerator(forward_api_contents, + backward_api_contents, + namespace, + next_grad_api_contents) node_generator.run() self.node_declaration_str += node_generator.node_declaration_str + "\n" self.node_definition_str += node_generator.node_definition_str + "\n" diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 36cfb4db1137a..09bbc2a0ba40a 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -536,7 +536,7 @@ std::vector RunBackward( const std::vector& inputs = {}, bool allow_unused = false, const std::vector& no_grad_vars = {}) { - VLOG(6) << "Start Backward"; + VLOG(3) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level @@ -634,7 +634,7 @@ std::vector RunBackward( GeneralGrad::Instance().ReconstructBackwardGraph(orig_queue); } - VLOG(6) << "Update In degree Map for backward"; + VLOG(3) << "Update In degree Map for backward"; // 3. Compute in_degree for each node std::unordered_map node_in_degree_map = getInDegreeMap(queue); @@ -654,7 +654,7 @@ std::vector RunBackward( // |- node(grads) // |- Prepare for next node // 3. 
Update queue - VLOG(6) << "Run Backward"; + VLOG(3) << "Run Backward"; while (!queue.empty()) { GradNodeBase* node = queue.front(); VLOG(6) << "Running GradNode:" << node->name(); @@ -739,7 +739,7 @@ std::vector RunBackward( // Since we make edge has as same rank as bwd outputs, we indexing them // with the same rank(i, j) auto next_node_shared = edge.GetMutableGradNode(); - + VLOG(3) << "Found pending node: " << next_node_shared->name(); // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -826,7 +826,7 @@ void Backward( const std::vector& tensors, // outputs const std::vector& grad_tensors, bool retain_graph) { - VLOG(6) << "Run in Backward"; + VLOG(3) << "Run in Backward"; paddle::platform::RecordEvent backward_record_event( "backward", paddle::platform::TracerEventType::Operator, 1); RunBackward(tensors, grad_tensors, retain_graph); @@ -839,7 +839,7 @@ std::vector Grad( const std::vector& grad_tensors, bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, const std::vector& no_grad_vars) { - VLOG(6) << "Run in Grad"; + VLOG(3) << "Run in Grad"; DuplicateCheck(inputs, true /* is_input */); DuplicateCheck(tensors, false /* is_input */); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 3efcf3b21a4e3..beff23d433421 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -263,9 +263,9 @@ RunCustomOpNode::operator()( trace_backward, &(ins_auto_grad_metas[i])); } - if (require_any_grad) { - auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); - const auto& vec_map = meta_info_map.at(op_type_); + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + const auto& vec_map = meta_info_map.at(op_type_); + if (require_any_grad && (vec_map.size() > 2)) { paddle::platform::RecordEvent node_creation_record_event( "Custom Op " + op_type_ + " double_grad node_creation", paddle::platform::TracerEventType::OperatorInner, 1); diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 01b31a2500cb0..1cba94339bfdf 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -230,7 +230,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, fwd_in_meta->SetGradNode( std::make_shared(fwd_in_meta)); } - VLOG(6) << "Add Edges for slot: " << slot_rank << ", the Edge is from " + VLOG(3) << "Add Edges for slot: " << slot_rank << ", the Edge is from " << this->name() << " (addr: " << this << ") " << " to " << fwd_in_meta->GetMutableGradNode()->name() << " (addr: " << fwd_in_meta->GetMutableGradNode().get() << ")"; @@ -286,7 +286,7 @@ void GradNodeBase::SetGradOutMeta( fwd_in_meta->SetGradNode( std::make_shared(fwd_in_meta)); } - VLOG(6) << "Add Edges for slot: " << slot_rank << ", the Edge is from " + VLOG(3) << "Add Edges for slot: " << slot_rank << ", the Edge is from " << this->name() << " (addr: " << this << ") " << " to " << fwd_in_meta->GetMutableGradNode()->name() << " (addr: " << fwd_in_meta->GetMutableGradNode().get() << ")"; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 7f5ae233874b6..ee5dd622412e1 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -77,6 +77,8 @@ void 
GradTensorHolder::CopyValueFromTensor( "Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR " "now.")); } + egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank])) + ->SetStopGradient(false); } } } @@ -84,8 +86,6 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool create_graph) { - // TODO(jiabin): We need to deal with empty input_buffer with slot size not - // empty; PADDLE_ENFORCE(slot_id < buffer_.size(), paddle::platform::errors::Fatal( "Invalid slot_id for GradTensorHolder::add() " diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index d1c5983a3702f..0ed1a198c916d 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -114,6 +114,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTensorAndVector& tensors) { CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); } } // namespace egr diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h index a411504fa4900..815e3bd6cd14f 100644 --- a/paddle/fluid/eager/nan_inf_utils.h +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -31,7 +31,8 @@ using TupleOfFourTensors = std::tuple<Tensor, Tensor, Tensor, Tensor>; using TupleOfFiveTensors = std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>; using TupleOfSixTensors = std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor>; -using TupleOfTensorAndVector = std::tuple<Tensor, std::vector<Tensor>>; +using TupleOfTensorAndVector = + std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>>; void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor); diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 3b22a4b0d5d7a..42235b7c484e3 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -5,7 +5,7 @@ if(WITH_PSLIB) if(NOT WITH_HETERPS) set(BRPC_DEPS brpc) endif() - endif(WITH_PSLIB_BRPC) + endif() cc_library( fleet_wrapper SRCS fleet_wrapper.cc @@ -21,7 +21,7 @@ else() fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) -endif(WITH_PSLIB) +endif() if(WITH_HETERPS) if(WITH_NCCL AND WITH_GPU) @@ -48,7 +48,7 @@ else() ps_gpu_wrapper SRCS ps_gpu_wrapper.cc DEPS gloo_wrapper) -endif(WITH_HETERPS) +endif() if(WITH_NCCL OR WITH_RCCL) cc_library( @@ -74,7 +74,7 @@ else() box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor) -endif(WITH_BOX_PS) +endif() if(WITH_GLOO) cc_library( @@ -94,7 +94,7 @@ else() metrics SRCS metrics.cc DEPS gloo_wrapper) -endif(WITH_GLOO) +endif() if(WITH_PSLIB) set(DISTRIBUTE_COMPILE_FLAGS diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 72b7477f2b870..dbea438b14048 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -27,9 +27,18 @@ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +// The difference between "sequential_run" and "serial_run": +// "sequential_run" dispatches OPs one by one according to the sequence in the +// Program, while "serial_run" ensures that all Ops are scheduled in a single +// thread. In standalone executor, "sequential_run" is also "serial_run", while +// "serial_run" is not necessarily "sequential_run".
+PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run, false, + "Enable sequential execution for standalone " + "executor, only applied to GPU OPs."); + PADDLE_DEFINE_EXPORTED_bool( - new_executor_sequential_run, false, - "Enable sequential execution for standalone executor, used for debug"); + new_executor_serial_run, false, + "Enable serial execution for standalone executor, used for debug."); DECLARE_bool(use_mkldnn); @@ -42,10 +51,8 @@ constexpr size_t kPrepareWorkQueueIdx = 2; void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, std::function fn) { VLOG(4) << "Add task: " << static_cast(op_func_type) << " "; - // NOTE(zhiqiu): use thhe second queue of size of, so only one thread is used. - if (FLAGS_new_executor_sequential_run) { - VLOG(4) << "FLAGS_new_executor_sequential_run:" - << FLAGS_new_executor_sequential_run; + // NOTE(zhiqiu): use the second queue of size of, so only one thread is used. + if (FLAGS_new_executor_serial_run) { queue_group_->AddTask(static_cast(OpFuncType::kQueueAsync), std::move(fn)); } else { @@ -789,12 +796,14 @@ std::map> build_op_downstream_map( std::set remove_duplicate; // remove the duplicate between inputs and outputs + size_t op_num = vec_instruction.size(); + // reserve - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { op2dependences[op_idx] = std::set(); } - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { remove_duplicate.clear(); // step1: update the op2dependences structure for (auto& item : @@ -859,8 +868,7 @@ std::map> build_op_downstream_map( std::map> op_downstream_map = GetDownstreamMap(op2dependences); - ShrinkDownstreamMap(&op_downstream_map, op_happens_before, - vec_instruction.size()); + ShrinkDownstreamMap(&op_downstream_map, op_happens_before, op_num); // add dependences for random op, make sure that the random op is scheduled // sequentially @@ -880,7 +888,7 @@ std::map> build_op_downstream_map( }; int dependence_op_idx = -1; - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { if (random_op_set.count(vec_instruction[op_idx].OpBase()->Type())) { if (dependence_op_idx != -1) { AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, @@ -907,7 +915,7 @@ std::map> build_op_downstream_map( }; dependence_op_idx = -1; - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { if (is_comm_op(vec_instruction[op_idx].OpBase()->Type())) { if (dependence_op_idx != -1) { AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, @@ -931,7 +939,7 @@ std::map> build_op_downstream_map( // c_sync_comm_stream(a) const std::string kSyncComm = "c_sync_comm_stream"; dependence_op_idx = -1; - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { if (vec_instruction[op_idx].OpBase()->Type() == kSyncComm) { dependence_op_idx = op_idx; } else { @@ -947,7 +955,7 @@ std::map> build_op_downstream_map( // add dependency for coalesce_tensor const std::string kCoalesceTensor = "coalesce_tensor"; - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { if (vec_instruction[op_idx].OpBase()->Type() == kCoalesceTensor) { VLOG(4) << "Add depend for " << kCoalesceTensor << " " << op_idx; auto fused_out = 
vec_instruction[op_idx].Outputs().at("FusedOutput")[0]; @@ -977,7 +985,7 @@ std::map> build_op_downstream_map( // find first op that reads fused_out auto first_read_fused_out_op = -1; - for (auto j = op_idx + 1; j < vec_instruction.size(); ++j) { + for (auto j = op_idx + 1; j < op_num; ++j) { if (is_read(vec_instruction[j], fused_out)) { first_read_fused_out_op = j; break; @@ -1017,8 +1025,7 @@ std::map> build_op_downstream_map( // we should take the last one to add depned instead of // 'first_read_fused_out_op' size_t target = first_read_fused_out_op; - for (size_t j = first_read_fused_out_op + 1; j < vec_instruction.size(); - ++j) { + for (size_t j = first_read_fused_out_op + 1; j < op_num; ++j) { if (j == target + 1 && is_comm_op(vec_instruction[target].OpBase()->Type()) && is_comm_op(vec_instruction[j].OpBase()->Type())) { @@ -1032,7 +1039,6 @@ std::map> build_op_downstream_map( for (auto var_id : outputs) { if (is_read(vec_instruction[j], var_id)) { AddDownstreamOp(target, j, &op_downstream_map, *op_happens_before); - op2dependences[j].insert(target); VLOG(4) << target << " -> " << j; VLOG(4) << "Add depend from " << vec_instruction[target].OpBase()->Type() << " to " @@ -1043,6 +1049,24 @@ std::map> build_op_downstream_map( } } + if (FLAGS_new_executor_sequential_run) { + dependence_op_idx = -1; + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { + if (!IsCpuOp(vec_instruction[op_idx])) { + if (dependence_op_idx != -1) { + AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, + *op_happens_before); + VLOG(4) << "Add depend from " + << vec_instruction[dependence_op_idx].OpBase()->Type() << "(" + << dependence_op_idx << ") to " + << vec_instruction[op_idx].OpBase()->Type() << "(" << op_idx + << ")"; + } + dependence_op_idx = op_idx; + } + } + } + VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map); VLOG(8) << "downstream_map: " << std::endl << StringizeDownstreamMap(op_downstream_map); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index 24e65599018fa..e804d153f2814 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -12,17 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" + #include #include #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/phi/core/ddim.h" -// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index 8a6f92a6f45d0..68c701530a12d 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -// clang-format off -#include "gtest/gtest.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" +#include "gtest/gtest.h" #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" -// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc index ae9f51c3f6790..1ee108f566f5f 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc +++ b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off +#include "paddle/fluid/framework/paddle2cinn/transform_desc.h" + #include #include "gtest/gtest.h" -#include "paddle/fluid/framework/paddle2cinn/transform_desc.h" -// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 88738255af78e..f4af3c5eba00e 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -59,9 +59,12 @@ ProgramDesc::ProgramDesc() { ProgramDesc::ProgramDesc(const ProgramDesc &o) { desc_ = o.desc_; + std::vector old_block_desc; for (int i = 0; i < desc_.blocks_size(); ++i) { auto *block = desc_.mutable_blocks(i); blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); + // record all block desc's ptr from origin program + old_block_desc.emplace_back(o.blocks_[i].get()); } for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) { auto all_ops = blocks_[block_id]->AllOps(); @@ -70,9 +73,21 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { for (const std::string &attr_name : op->AttrNames()) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { - int sub_block_id = - o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); - op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + framework::BlockDesc *block_desc = + BOOST_GET_CONST(framework::BlockDesc *, op->GetAttr(attr_name)); + if (std::find(old_block_desc.begin(), old_block_desc.end(), + block_desc) != old_block_desc.end()) { + // The block is owned by the origin program. Just use id to get + // the corresponding block. + int sub_block_id = + o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); + op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + } else { + // The block is not owned by the origin program. Should copy + // the real block desc instead of logical block in the program. 
+ VLOG(3) << "Set op's block attr with the original block"; + op->SetBlockAttr(attr_name, block_desc); + } } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) { std::vector sub_block_ids = o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name); diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index a4baca6f25724..5fd8eae852859 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -1085,7 +1085,7 @@ void PartialGradEngine::Clear() { void PartialGradEngine::Execute() { PADDLE_ENFORCE_NOT_NULL(task_, platform::errors::PermissionDenied( "PartialGradEngine has been destructed")); - VLOG(10) << "Starts to execute PartialGradEngine"; + VLOG(3) << "Starts to execute PartialGradEngine"; results_ = task_->Run(); Clear(); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index a29e530b2b80c..4e991a3013875 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -97,7 +97,7 @@ set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor if(WITH_CRYPTO) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto) -endif(WITH_CRYPTO) +endif() if(WITH_PSCORE) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service @@ -108,7 +108,7 @@ if(WITH_ONNXRUNTIME) set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc) -endif(WITH_ONNXRUNTIME) +endif() # Create shared inference library cc_library( diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index f374c5c7cc20f..4b7bed65bab77 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -87,7 +87,7 @@ function(inference_analysis_test TARGET) inference_base_test_run(${TARGET} COMMAND ${TARGET} ARGS ${analysis_test_ARGS}) endif() -endfunction(inference_analysis_test) +endfunction() if(NOT APPLE AND NOT WIN32) inference_analysis_test( diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e25c5e963982f..cace195640f64 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -15,7 +15,7 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") -endif(APPLE) +endif() add_subdirectory(details) @@ -84,14 +84,14 @@ if(WITH_ONNXRUNTIME) infer_io_utils onnxruntime paddle2onnx) -else(WITH_ONNXRUNTIME) +else() cc_library( analysis_predictor SRCS analysis_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) -endif(WITH_ONNXRUNTIME) +endif() cc_test( test_paddle_inference_api diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 0645af611b9d2..c41b667e18a83 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1960,6 +1960,8 @@ USE_TRT_CONVERTER(strided_slice) USE_TRT_CONVERTER(transformer_input_convert) USE_TRT_CONVERTER(recover_padding) USE_TRT_CONVERTER(remove_padding) +USE_TRT_CONVERTER(top_k) +USE_TRT_CONVERTER(top_k_v2) #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) USE_TRT_CONVERTER(sparse_fc) USE_TRT_CONVERTER(sparse_multihead_matmul) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt 
b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index a76ed63f10646..c58aad36c97d2 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -21,8 +21,8 @@ macro(safe_set_static_flag) CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) + endif() + endforeach() endmacro() if(NOT DEFINED PADDLE_LIB) @@ -105,7 +105,7 @@ if(WITH_GPU) endif() endif() message(STATUS "Current CUDA lib path: ${CUDA_LIB}") - endif(NOT WIN32) + endif() endif() if(USE_TENSORRT AND WITH_GPU) @@ -157,9 +157,9 @@ if(WITH_MKL) include_directories("${MKLDNN_PATH}/include") if(WIN32) set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) - else(WIN32) + else() set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) - endif(WIN32) + endif() endif() else() set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") @@ -232,7 +232,7 @@ else() utf8proc_static ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) -endif(NOT WIN32) +endif() if(WITH_GPU) if(NOT WIN32) diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index c1ff6ea68a2bd..2acd96b3fb97c 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -26,13 +26,13 @@ if(WITH_ONNXRUNTIME) zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc DEPS onnxruntime) -else(WITH_ONNXRUNTIME) +else() cc_library( zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) -endif(WITH_ONNXRUNTIME) +endif() cc_test( zero_copy_tensor_test diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 2c9ba42821535..52a3c1df9a925 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -60,7 +60,8 @@ list( roll_op.cc transformer_input_convert_op.cc remove_padding_op.cc - recover_padding_op.cc) + recover_padding_op.cc + top_k_op.cc) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc) diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 077ba32ba89c1..f6ecf76d01675 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -295,20 +295,215 @@ class OpConverter { engine->ClearWeights(); } + // rank(result) = rank(input) + nvinfer1::ITensor* Gather(nvinfer1::ITensor* input, + const std::vector indices, int axis = 0) { + auto* indices_tensor = Add1DConstantLayer(indices, " "); + auto* result = + TRT_ENGINE_ADD_LAYER(engine_, Gather, *input, *indices_tensor, axis) + ->getOutput(0); + return result; + } + + // paddle allows negative index + // for axis length = 5, paddle allows [-5, 4] + nvinfer1::ITensor* FixNegIndices(nvinfer1::ITensor* input_shape, + nvinfer1::ITensor* indices) { + int rank = input_shape->getDimensions().nbDims; + std::vector zero = std::vector(rank, 0); + std::vector minus_one = std::vector(rank, -1); + nvinfer1::ITensor* zero_tensor = Add1DConstantLayer(zero); + nvinfer1::ITensor* minus_one_tensor = Add1DConstantLayer(minus_one); + // -1, 0 + auto* sign = Max(Min(indices, zero_tensor), 
minus_one_tensor); + return Sub(indices, Prod(sign, input_shape)); + } + + nvinfer1::ITensor* Shape(nvinfer1::ITensor* input) { + return TRT_ENGINE_ADD_LAYER(engine_, Shape, *input)->getOutput(0); + } + + // Concat not make rank changed + nvinfer1::ITensor* Concat(const std::vector& inputs, + int axis = 0) { + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, inputs.data(), + inputs.size()); + if (axis != 0) layer->setAxis(axis); + nvinfer1::ITensor* c = layer->getOutput(0); + return c; + } + + nvinfer1::ITensor* Sum(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Prod(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Min(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kMIN) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Max(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kMAX) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Sub(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kSUB) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Div(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Act(nvinfer1::ITensor* a, + nvinfer1::ActivationType act_type) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, Activation, *a, act_type)->getOutput(0); + return c; + } + + // Get element tensor of 1D shape tensor + nvinfer1::ITensor* GetEleTensorOfShape(nvinfer1::ITensor* shape_tensor, + int index, bool is_scalar = false) { + auto* tensor = + TRT_ENGINE_ADD_LAYER(engine_, Gather, *shape_tensor, + *Add1DConstantLayer(index, " ", is_scalar), 0) + ->getOutput(0); + return tensor; + } + + // Create and add Multi-D constant float layer + nvinfer1::ITensor* AddConstantLayer(const float* data, + const std::vector& weight_dims, + const std::string& weight_name) { + std::unique_ptr tmp_tensor(new framework::Tensor()); + int data_size = std::accumulate(weight_dims.begin(), weight_dims.end(), 1, + std::multiplies()); + tmp_tensor->Resize({data_size}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = data[i]; + } + engine_->SetWeights(weight_name, std::move(tmp_tensor)); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(tmp_data), + static_cast(data_size)}; + nvinfer1::Dims trt_dims; + trt_dims.nbDims = weight_dims.size(); + for (size_t i = 0; i < weight_dims.size(); i++) + trt_dims.d[i] = weight_dims[i]; + auto const_layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims, weight.get()); + return const_layer->getOutput(0); + } + + // Create and add 1D constant float layer + nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, + const std::string& weight_name = "", + bool scalar = false) { + std::unique_ptr tmp_tensor(new 
framework::Tensor()); + int data_size = data.size(); + tmp_tensor->Resize({data_size}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = data[i]; + } + engine_->SetWeights(weight_name, std::move(tmp_tensor)); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(tmp_data), + static_cast(data_size)}; + nvinfer1::Dims input_shape; + input_shape.nbDims = scalar ? 0 : 1; + input_shape.d[0] = data_size; + auto const_layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); + return const_layer->getOutput(0); + } + + // Create and add 1D constant layer + nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, + const std::string& weight_name = "", + bool scalar = false) { + std::unique_ptr tmp_tensor(new framework::Tensor()); + int data_size = data.size(); + tmp_tensor->Resize({data_size}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = data[i]; + } + engine_->SetWeights(weight_name, std::move(tmp_tensor)); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32, + static_cast(tmp_data), + static_cast(data_size)}; + nvinfer1::Dims input_shape; + input_shape.nbDims = scalar ? 0 : 1; + input_shape.d[0] = data_size; + auto const_layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); + return const_layer->getOutput(0); + } + + nvinfer1::ITensor* Add1DConstantLayer(nvinfer1::Dims data, + const std::string& weight_name = "", + bool scalar = false) { + std::vector tmp_data; + for (int i = 0; i < data.nbDims; i++) tmp_data.push_back(data.d[i]); + return Add1DConstantLayer(tmp_data, weight_name, scalar); + } + + nvinfer1::ITensor* Add1DConstantLayer(int32_t data, + const std::string& weight_name = "", + bool scalar = false) { + std::vector tmp_data; + tmp_data.push_back(data); + return Add1DConstantLayer(tmp_data, weight_name, scalar); + } + void RreplenishLayerAndOutput( nvinfer1::ILayer* layer, const std::string& layer_type, const std::vector& output_tensor_names, bool test_mode = false) { size_t num_out = output_tensor_names.size(); + std::string layer_name = layer_type + " (Output: "; for (size_t i = 0; i < num_out; i++) { layer->getOutput(i)->setName(output_tensor_names[i].c_str()); engine_->SetITensor(output_tensor_names[i], layer->getOutput(i)); if (test_mode) { engine_->DeclareOutput(output_tensor_names[i]); } + layer_name += output_tensor_names[i]; + if (i != num_out - 1) layer_name += ", "; } - layer->setName( - (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str()); + layer->setName((layer_name + ")").c_str()); } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 591eb06a36202..1638515ffc47f 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -29,7 +29,6 @@ class SplitOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("X")[0]); auto input_dims = input->getDimensions(); - size_t input_num = op_desc.Input("X").size(); size_t output_num = op_desc.Output("Out").size(); // Get Attrs @@ -41,48 +40,115 @@ class SplitOpConverter : public OpConverter { if (op_desc.HasAttr("num")) { num = BOOST_GET_CONST(int, op_desc.GetAttr("num")); } - + nvinfer1::ITensor* shape_tensor = nullptr; if 
(engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) axis += (axis < 0) ? input_dims.nbDims : 0; -#endif + // only be called in dynamic_shape mode + shape_tensor = Shape(input); } else { axis += (axis < 0) ? input_dims.nbDims : -1; } - if (num > 0) { - int64_t in_axis_dim = input_dims.d[axis]; - size_t out_axis_dim = in_axis_dim / num; - for (int i = 0; i < num; ++i) { - output_lengths.push_back(out_axis_dim); + bool in_axis_dim_dynamic = false; + nvinfer1::ITensor* avg_len_tensor = nullptr; + // need infer output_lengths + if (num > 0 && output_lengths.empty()) { + if (input_dims.d[axis] > 0) { + int64_t in_axis_dim = input_dims.d[axis]; + size_t out_axis_dim = in_axis_dim / num; + for (int i = 0; i < num; ++i) { + output_lengths.push_back(out_axis_dim); + } + } else { + in_axis_dim_dynamic = true; + auto* num_tensor = Add1DConstantLayer(num); + avg_len_tensor = + Div(GetEleTensorOfShape(shape_tensor, axis), num_tensor); } } nvinfer1::ILayer* layer = nullptr; +#if IS_TRT_VERSION_GE(6000) + if (engine_->with_dynamic_shape()) { + nvinfer1::Dims trt_step_dims; + trt_step_dims.nbDims = input->getDimensions().nbDims; + for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1; + + std::vector gather_indices; + gather_indices.resize(trt_step_dims.nbDims); + std::iota(gather_indices.begin(), gather_indices.end(), 0); + gather_indices[axis] = gather_indices.size(); + std::vector zeros(trt_step_dims.nbDims, 0); + auto* zeros_tensor = Add1DConstantLayer(zeros); + // input : [N,C,H,W] + int start_point = 0; + for (size_t i = 0; i < output_num; i++) { + nvinfer1::ITensor* this_len_tensor = nullptr; + nvinfer1::ITensor* start_point_tensor = nullptr; + if (!in_axis_dim_dynamic) { + this_len_tensor = Add1DConstantLayer(output_lengths[i]); + start_point_tensor = Add1DConstantLayer(start_point); + start_point += output_lengths[i]; + } else { + this_len_tensor = avg_len_tensor; + auto* i_tensor = Add1DConstantLayer(i); + start_point_tensor = Prod(i_tensor, avg_len_tensor); + } + + std::vector concat_inputs1 = {zeros_tensor, + start_point_tensor}; + std::vector concat_inputs2 = {shape_tensor, + this_len_tensor}; + auto* start_tensor = Gather(Concat(concat_inputs1), gather_indices); + auto* size_tensor = Gather(Concat(concat_inputs2), gather_indices); + layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, trt_step_dims, + trt_step_dims, trt_step_dims); + layer->setInput(1, *start_tensor); + layer->setInput(2, *size_tensor); + + auto output_name = op_desc.Output("Out")[i]; + RreplenishLayerAndOutput(layer, "split", {output_name}, test_mode); + } + } else { + auto chw_input_dims = input->getDimensions(); + nvinfer1::Dims trt_start_dims; + trt_start_dims.nbDims = chw_input_dims.nbDims; + memset(trt_start_dims.d, 0, sizeof(int32_t) * chw_input_dims.nbDims); + nvinfer1::Dims trt_size_dims = chw_input_dims; + nvinfer1::Dims trt_step_dims; + trt_step_dims.nbDims = chw_input_dims.nbDims; + for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1; + + // input : [C,H,W] + for (size_t i = 0; i < output_num; i++) { + trt_start_dims.d[axis] = std::accumulate(output_lengths.begin(), + output_lengths.begin() + i, 0); + trt_size_dims.d[axis] = output_lengths[i]; + layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, trt_start_dims, + trt_size_dims, trt_step_dims); + auto output_name = op_desc.Output("Out")[i]; + RreplenishLayerAndOutput(layer, "split", {output_name}, test_mode); + } + } +#else if (engine_->with_dynamic_shape()) { bool with_fp16 = engine_->WithFp16() && 
!engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); - layer = engine_->AddDynamicPlugin(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths, with_fp16); - layer = engine_->AddPluginV2Ext(&input, input_num, plugin); + layer = engine_->AddPluginV2Ext(&input, 1, plugin); } - - std::string layer_name = "split (Output: "; + std::vector output_names; for (size_t i = 0; i < output_num; i++) { - auto output_name = op_desc.Output("Out")[i]; - layer->getOutput(i)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(i)); - layer_name += output_name; - if (test_mode) { - engine_->DeclareOutput(output_name); - } + output_names.push_back(op_desc.Output("Out")[i]); } - layer->setName((layer_name + ")").c_str()); + RreplenishLayerAndOutput(layer, "split", output_names, test_mode); +#endif } }; diff --git a/paddle/fluid/inference/tensorrt/convert/top_k_op.cc b/paddle/fluid/inference/tensorrt/convert/top_k_op.cc new file mode 100644 index 0000000000000..1d7f1ca243b2a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/top_k_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TopKOpConverter : public OpConverter { + public: + TopKOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + + auto* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]); + + const int k = op_desc.HasAttr("k") + ? 
BOOST_GET_CONST(int, op_desc.GetAttr("k")) + : 1.0f; + + nvinfer1::Dims input_dims = input_tensor->getDimensions(); + int axis = input_dims.nbDims; + nvinfer1::ITopKLayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, + nvinfer1::TopKOperation::kMAX, k, 1 << (axis - 1)); + + std::vector output_names; + output_names.push_back(op_desc.Output("Out").front()); + output_names.push_back(op_desc.Output("Indices").front()); + + RreplenishLayerAndOutput(layer, "top_k", output_names, test_mode); + } +}; +class TopKv2OpConverter : public OpConverter { + public: + TopKv2OpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + + auto* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]); + + const int k = op_desc.HasAttr("k") + ? BOOST_GET_CONST(int, op_desc.GetAttr("k")) + : 1.0f; + const int axis = op_desc.HasAttr("axis") + ? BOOST_GET_CONST(int, op_desc.GetAttr("axis")) + : 1.0f; + const bool largest = op_desc.HasAttr("largest") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("largest")) + : true; + auto flag = + largest ? nvinfer1::TopKOperation::kMAX : nvinfer1::TopKOperation::kMIN; + nvinfer1::ITopKLayer* layer = nullptr; + if (axis == -1) { + nvinfer1::Dims input_dims = input_tensor->getDimensions(); + layer = TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, flag, k, + 1 << (input_dims.nbDims - 1)); + } else { + if (engine_->with_dynamic_shape()) { + layer = TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, flag, k, + 1 << axis); + } else { + layer = TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, flag, k, + 1 << (axis - 1)); + } + } + std::vector output_names; + output_names.push_back(op_desc.Output("Out").front()); + output_names.push_back(op_desc.Output("Indices").front()); + + RreplenishLayerAndOutput(layer, "top_k_v2", output_names, test_mode); + } +}; +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(top_k, TopKOpConverter); +REGISTER_TRT_OP_CONVERTER(top_k_v2, TopKv2OpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b28fe827156c3..0260c489b5041 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -686,7 +686,7 @@ class TensorRTEngine { // them, and an macro like this is more extensible when underlying TensorRT // library add new layer supports. #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) 
\ - engine__->network()->add##layer__(__VA_ARGS__); + engine__->network()->add##layer__(__VA_ARGS__) class TRTEngineManager { public: diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 57ac400dadab3..d9b1e9b85f7e4 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -104,6 +104,8 @@ struct SimpleOpTypeSetTeller : public Teller { "stack", "transpose2", "transpose", + "top_k", + "top_k_v2", "flatten2", "flatten", "gather", @@ -175,6 +177,8 @@ struct SimpleOpTypeSetTeller : public Teller { "stack", "transpose2", "transpose", + "top_k", + "top_k_v2", "flatten2", "flatten", "gather", @@ -1037,15 +1041,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } - } else { - for (size_t i = 0; i < axes.size(); i++) { - if (starts[i] < 0 || ends[i] < 0) { - VLOG(3) << "Invalid slice attribute 'starts' or 'ends'. " - "Negative starts or ends not supported in TensorRT " - "when running in dynamic shape mode."; - return false; - } - } } } } @@ -1759,6 +1754,34 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "top_k_v2" || op_type == "top_k") { + auto* block = desc.Block(); + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() == 1) { + VLOG(3) << "top_k/top_k_v2 does not support 1-dimensional input in " + "tensorrt"; + return false; + } + if (desc.HasAttr("axis")) { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "top_k_v2 does not support axis == 0 in " + "tensorrt"; + return false; + } + } + if (desc.HasAttr("sorted")) { + bool sorted = BOOST_GET_CONST(bool, desc.GetAttr("sorted")); + if (!sorted) { + VLOG(3) << "top_k_v2 does not support results not sorted in " + "tensorrt"; + return false; + } + } + } + #if IS_TRT_VERSION_GE(8000) if (op_type == "sparse_fc" || op_type == "sparse_multihead_matmul") { if (!with_dynamic_shape) { diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 307af84fa367e..8261ce288cb97 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -799,7 +799,7 @@ if(WITH_MKLDNN) if(NOT LINUX) download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") - endif(NOT LINUX) + endif() download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") inference_analysis_api_quant_test_run( @@ -829,7 +829,7 @@ if(WITH_MKLDNN) download_quant_data_without_verify( ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE}) - endif(NOT LINUX) + endif() set(QUANT2_RESNET50_MODEL ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise) inference_analysis_api_quant_test_run( @@ -869,10 +869,8 @@ endif() set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") download_model_and_data_without_verify( ${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") -if(WITH_GPU) - inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} - analyzer_bert_tester.cc) -endif() +inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} + analyzer_bert_tester.cc) # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") diff --git 
a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 0aee989367e4b..e1ef57e3a136e 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -22,8 +22,8 @@ macro(safe_set_static_flag) CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) + endif() + endforeach() endmacro() if(NOT DEFINED PADDLE_LIB) @@ -106,7 +106,7 @@ if(WITH_GPU) endif() endif() message(STATUS "Current CUDA lib path: ${CUDA_LIB}") - endif(NOT WIN32) + endif() endif() if(USE_TENSORRT AND WITH_GPU) @@ -182,9 +182,9 @@ if(WITH_MKL) include_directories("${MKLDNN_PATH}/include") if(WIN32) set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) - else(WIN32) + else() set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) - endif(WIN32) + endif() endif() else() set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") @@ -255,7 +255,7 @@ else() cryptopp-static ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) -endif(NOT WIN32) +endif() if(WITH_GPU) if(NOT WIN32) @@ -302,7 +302,7 @@ if(WITH_GTEST) ${DEMO_NAME} ${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest${CMAKE_STATIC_LIBRARY_SUFFIX} ) - endif(WIN32) + endif() endif() if(WIN32) if("${CMAKE_GENERATOR}" MATCHES "Ninja") diff --git a/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake b/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake index 49b0a04197d12..b141b76c6f33b 100644 --- a/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake +++ b/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake @@ -10,7 +10,8 @@ set(GTEST_REPOSITORY https://github.com/google/googletest.git) set(GTEST_TAG release-1.8.1) include_directories(${GTEST_INCLUDE_DIR}) if(WIN32) - # if use CMAKE_INSTALL_LIBDIR, the path of lib actually is install/gtest/lib/gtest.lib but GTEST_LIBRARIES + # if use CMAKE_INSTALL_LIBDIR, the path of lib actually is \ + # install/gtest/lib/gtest.lib but GTEST_LIBRARIES # is install/gtest/gtest.lib set(GTEST_LIBRARIES "${GTEST_INSTALL_DIR}/lib/gtest.lib" @@ -25,7 +26,7 @@ else() set(GTEST_MAIN_LIBRARIES "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." 
FORCE) -endif(WIN32) +endif() ExternalProject_Add( extern_gtest PREFIX gtest diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 1f72482eef777..5d1f97c096bdd 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -47,10 +47,8 @@ if(WITH_GPU) if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) set_tests_properties( stream_safe_cuda_alloc_test - PROPERTIES - ENVIRONMENT - "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth" - ) + PROPERTIES ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true; \ + FLAGS_allocator_strategy=auto_growth") endif() endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 109afd06f4df1..e1b14c4bae875 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -261,4 +261,4 @@ if(NOT WIN32) SRCS cuda_ipc_allocator.cc DEPS allocator) endif() -endif(NOT WIN32) +endif() diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index eb0664eb17d35..d2d9ef1ab8fb6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -1,6 +1,6 @@ include(operators) -# solve "math constants not defined" problems caused by the order of inclusion +# solve "math constants not defined" problems caused by the order of inclusion # of and the definition of macro _USE_MATH_DEFINES add_definitions(-D_USE_MATH_DEFINES) diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 90d0a72074b81..1debfbf4af2a3 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -145,6 +145,26 @@ class SqrtGradMLUKernel : public framework::OpKernel { } }; +// CNNL_LOG_E = 0, +// CNNL_LOG_2 = 1, +// CNNL_LOG_10 = 2, +template +class LogMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION; + + MLUCnnl::Log(ctx, prefer, Log_base, input_desc.get(), GetBasePtr(input), + output_desc.get(), GetBasePtr(output)); + } +}; + } // namespace operators } // namespace paddle @@ -213,3 +233,16 @@ REGISTER_OP_MLU_KERNEL(sqrt, ops::SqrtMLUKernel, ops::SqrtMLUKernel); REGISTER_OP_MLU_KERNEL(sqrt_grad, ops::SqrtGradMLUKernel, ops::SqrtGradMLUKernel); + +// log log2 log10 +REGISTER_OP_MLU_KERNEL( + log, ops::LogMLUKernel, + ops::LogMLUKernel); + +REGISTER_OP_MLU_KERNEL( + log2, ops::LogMLUKernel, + ops::LogMLUKernel); + +REGISTER_OP_MLU_KERNEL( + log10, ops::LogMLUKernel, + ops::LogMLUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc index 237cfcc6f1172..48ca1e22df72d 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -22,6 +23,8 @@ using Tensor = framework::Tensor; template class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); @@ -51,6 +54,7 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { } MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x), GetBasePtr(&is_finite)); @@ -70,10 +74,34 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { // out = in/scale, if found_inf = false // But when found_inf is true, the data of Out should not be used. // So, on MLU, we always compute out with in/scale. - MLUCnnlTensorDesc out_desc(*out); - MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), - GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), - out_desc.get(), GetBasePtr(out)); + Tensor float_x; + Tensor float_out; + if (std::is_same::value) { + float_x.Resize(x->dims()); + float_out.Resize(out->dims()); + float_x.mutable_data(ctx.GetPlace()); + float_out.mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc float_x_desc(float_x); + MLUCnnlTensorDesc float_out_desc(float_out); + auto cast_fp16_type = + GetCastDataType(DataType::FLOAT16, DataType::FLOAT32); + MLUCnnl::Cast(ctx, cast_fp16_type, x_desc.get(), GetBasePtr(x), + float_x_desc.get(), GetBasePtr(&float_x)); + + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, float_x_desc.get(), + GetBasePtr(&float_x), scale_desc.get(), GetBasePtr(scale), + float_out_desc.get(), GetBasePtr(&float_out)); + + auto cast_fp32_type = + GetCastDataType(DataType::FLOAT32, DataType::FLOAT16); + MLUCnnl::Cast(ctx, cast_fp32_type, float_out_desc.get(), + GetBasePtr(&float_out), out_desc.get(), GetBasePtr(out)); + } else { + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), + GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), + out_desc.get(), GetBasePtr(out)); + } } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index baf742b0b404b..26fa1c9131627 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -107,9 +107,9 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling."); AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps."); AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps."); - AddOutput("StopUpdate", - "(Tensor) 1-dim tensor. Stop updating loss scaling, and just " - "zero inputs. It has higher priority than Attr(stop_update).") + AddInput("StopUpdate", + "(Tensor) 1-dim tensor. Stop updating loss scaling, and just " + "zero inputs. 
It has higher priority than Attr(stop_update).") .AsDispensable(); AddAttr("incr_every_n_steps", "A value represents increasing loss scaling every n " diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc index c1517dbe16f84..b1b39608d624d 100644 --- a/paddle/fluid/operators/conv_op_mlu.cc +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -238,6 +238,228 @@ class MLUConvGradOpKernel : public framework::OpKernel { } } }; + +template +class MLUDepthwiseConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + + const bool channel_last = data_format == "NHWC"; + int groups; + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_tensor(output->type()); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + if (channel_last) { + groups = in_dims[3]; + input_tensor.ShareDataWith(*input); + output_tensor.ShareDataWith(*output); + } else { + // transpose input from NCHW to NHWC + groups = in_dims[1]; + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + auto output_dims = output->dims(); + output_tensor.mutable_data( + {output_dims[0], output_dims[2], output_dims[3], output_dims[1]}, + ctx.GetPlace()); + } + input_tensor.set_layout(DataLayout::kNHWC); + output_tensor.set_layout(DataLayout::kNHWC); + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true /*need_reshape_or_alloc*/); + + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc input_desc(input_tensor, data_layout, + ToCnnlDataType(input_tensor.dtype())); + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, + ToCnnlDataType(trans_filter.type())); + MLUCnnlTensorDesc output_desc(output_tensor, data_layout, + ToCnnlDataType(output_tensor.dtype())); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + ToCnnlDataType()); + + MLUCnnl::ConvolutionForward( + ctx, conv_desc.get(), nullptr /*alpha*/, nullptr /*beta*/, + nullptr /*bias_desc*/, nullptr /*bias_ptr*/, input_desc.get(), + GetBasePtr(&input_tensor), filter_desc.get(), GetBasePtr(&trans_filter), + output_desc.get(), GetBasePtr(&output_tensor)); + + if (!channel_last) { + // transpose output from NHWC to NCHW + const std::vector perm_to_nchw = {0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm_to_nchw, &output_tensor, output, + false 
/*need_reshape_or_alloc*/); + } + } +}; + +template +class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + + const bool channel_last = data_format == "NHWC"; + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + int groups; + + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_grad_tensor(output_grad->type()); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + const std::vector perm_to_nchw = {0, 3, 1, 2}; + if (channel_last) { + input_tensor.ShareDataWith(*input); + output_grad_tensor.ShareDataWith(*output_grad); + groups = in_dims[3]; + } else { + groups = in_dims[1]; + // transpose input and output_grad from NCHW to NHWC + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + TransposeFromMLUTensor(ctx, perm_to_nhwc, output_grad, + &output_grad_tensor, + true /*need_reshape_or_alloc*/); + } + input_tensor.set_layout(DataLayout::kNHWC); + output_grad_tensor.set_layout(DataLayout::kNHWC); + + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + + auto filter_grad_dims = filter_grad->dims(); + Tensor temp_filter_grad(filter_grad->type()); + temp_filter_grad.mutable_data( + {filter_grad_dims[0], filter_grad_dims[2], filter_grad_dims[3], + filter_grad_dims[1]}, + ctx.GetPlace()); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc input_desc(input_tensor, data_layout, tensor_dtype); + MLUCnnlTensorDesc out_grad_desc(output_grad_tensor, data_layout, + tensor_dtype); + MLUCnnlTensorDesc temp_filter_grad_desc(temp_filter_grad, data_layout, + tensor_dtype); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + tensor_dtype); + + MLUCnnl::ConvBackpropFilter( + ctx, conv_desc.get(), input_desc.get(), GetBasePtr(&input_tensor), + out_grad_desc.get(), GetBasePtr(&output_grad_tensor), + temp_filter_grad_desc.get(), GetBasePtr(&temp_filter_grad)); + + // transpose filter_grad from MHWC to MCHW + TransposeFromMLUTensor(ctx, perm_to_nchw, &temp_filter_grad, + filter_grad, false /*need_reshape_or_alloc*/); + } + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + + Tensor input_grad_tensor(input_grad->type()); + if (channel_last) { + input_grad_tensor.ShareDataWith(*input_grad); + } else { + auto input_grad_dims = 
input_grad->dims(); + input_grad_tensor.mutable_data( + {input_grad_dims[0], input_grad_dims[2], input_grad_dims[3], + input_grad_dims[1]}, + ctx.GetPlace()); + } + input_grad_tensor.set_layout(DataLayout::kNHWC); + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true /*need_reshape_or_alloc*/); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, tensor_dtype); + MLUCnnlTensorDesc out_grad_desc(output_grad_tensor, data_layout, + tensor_dtype); + MLUCnnlTensorDesc in_grad_desc(input_grad_tensor, data_layout, + tensor_dtype); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + tensor_dtype); + + MLUCnnl::ConvBackpropInput( + ctx, conv_desc.get(), filter_desc.get(), GetBasePtr(&trans_filter), + out_grad_desc.get(), GetBasePtr(&output_grad_tensor), + in_grad_desc.get(), GetBasePtr(&input_grad_tensor)); + + if (!channel_last) { + // transpose input_grad from NHWC to NCHW + TransposeFromMLUTensor(ctx, perm_to_nchw, &input_grad_tensor, + input_grad, false /*need_reshape_or_alloc*/); + } + } + } +}; } // namespace operators } // namespace paddle @@ -249,3 +471,10 @@ REGISTER_OP_MLU_KERNEL(conv2d, ops::MLUConvOpKernel, REGISTER_OP_MLU_KERNEL(conv2d_grad, ops::MLUConvGradOpKernel, ops::MLUConvGradOpKernel); + +REGISTER_OP_MLU_KERNEL(depthwise_conv2d, ops::MLUDepthwiseConvOpKernel, + ops::MLUDepthwiseConvOpKernel); + +REGISTER_OP_MLU_KERNEL(depthwise_conv2d_grad, + ops::MLUDepthwiseConvGradOpKernel, + ops::MLUDepthwiseConvGradOpKernel); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 3f8bc8674186d..bbeacd0eb5ff0 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -316,7 +316,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { auto* boxes_input = ctx.Input("BBoxes"); auto* scores_input = ctx.Input("Scores"); auto* outs = ctx.Output("Out"); - auto score_dims = scores_input->dims(); + auto& score_dims = scores_input->dims(); auto score_size = score_dims.size(); auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index 4e49a6ed8521e..a5d22149eca22 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -471,7 +471,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { std::vector box_per_batch_list(boxes_list.size()); std::vector score_per_batch_list(scores_list.size()); for (size_t j = 0; j < boxes_list.size(); ++j) { - auto score_dims = scores_list[j].dims(); + const auto& score_dims = scores_list[j].dims(); score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); score_per_batch_list[j].Resize({score_dims[1], score_dims[2]}); box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 7fc19d6913f83..95f841f7797b9 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -41,6 +41,10 @@ class EinsumOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra() 
.AsIntermediate(); + AddOutput("XShape", "(Tensor), The cache of the x_shape of: A and B.") + .AsDuplicable() + .AsExtra() + .AsIntermediate(); AddAttr("equation", "(string) A einsum equation. such as `ij,jk->ik`" "There must have `->` and the number of operands in " @@ -59,8 +63,8 @@ class EinsumGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { auto x_name = "Operands"; auto x_grad_name = framework::GradVarName(x_name); - ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim(x_name)); - ctx->ShareAllLoD(x_name, x_grad_name); + ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim("Operands")); + ctx->ShareAllLoD("Operands", x_grad_name); } protected: @@ -79,8 +83,15 @@ class EinsumGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr retv) const override { retv->SetType("einsum_grad"); - retv->SetInput("Operands", this->Input("Operands")); - retv->SetInput("InnerCache", this->Output("InnerCache")); + if (this->HasOutput("InnerCache")) { + retv->SetInput("InnerCache", this->Output("InnerCache")); + } + if (this->HasOutput("XShape")) { + // add if for compatibility. + retv->SetInput("Operands", this->Output("XShape")); // for memory save. + } else { + retv->SetInput("Operands", this->Input("Operands")); + } retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); retv->SetAttrMap(this->Attrs()); retv->SetOutput(framework::GradVarName("Operands"), diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_mlu.cc new file mode 100644 index 0000000000000..3ff93c49a3603 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_mlu.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_MLU + +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace operators { + +template +class ElementwiseMaxMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + MLUBinaryOp(ctx); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_MLU_KERNEL(elementwise_max, ops::ElementwiseMaxMLUKernel, + ops::ElementwiseMaxMLUKernel, + ops::ElementwiseMaxMLUKernel); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index 8c230c5f47bf6..a6a153c34d47f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -108,6 +108,7 @@ void MLUOpTensorKernel(const framework::ExecutionContext& ctx, enum BINARY_FUNCTOR { DIV, DIVNONAN, + MAXIMUM, }; template @@ -126,6 +127,16 @@ inline void MLUBinary
(const framework::ExecutionContext& ctx, MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out); } +template <> +inline void MLUBinary( + const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, // useless, only for compatible + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t out_desc, void* out) { + MLUCnnl::Maximum(ctx, x_desc, x, y_desc, y, out_desc, out); +} + template void MLUBinaryOp(const framework::ExecutionContext& ctx) { auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index 814827d95b6bd..01c5b79fff115 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -16,29 +16,25 @@ limitations under the License. */ // https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu // We add License in the head. -// headers sort by clang-format may cause compiling error or test faiure, -// see https://github.com/PaddlePaddle/Paddle/pull/42840/ -// clang-format off #include #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/fused/attention_layer_norm.h" #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif -// clang-format on namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 18caed22b4855..f90bffe9df836 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -38,7 +38,8 @@ inline std::vector get_new_shape( "The shape of dimension tensor should be [1]," "but received d%.", tensor->dims())); - if (platform::is_gpu_place(tensor->place())) { + if (platform::is_gpu_place(tensor->place()) || + platform::is_mlu_place(tensor->place())) { framework::Tensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); @@ -55,7 +56,8 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { std::vector vec_new_data; auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; - if (platform::is_gpu_place(new_data_tensor->place())) { + if (platform::is_gpu_place(new_data_tensor->place()) || + platform::is_mlu_place(new_data_tensor->place())) { paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); diff --git a/paddle/fluid/operators/interpolate_v2_op_mlu.cc b/paddle/fluid/operators/interpolate_v2_op_mlu.cc new file mode 100644 index 0000000000000..9977337a395c6 --- /dev/null +++ 
b/paddle/fluid/operators/interpolate_v2_op_mlu.cc @@ -0,0 +1,488 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/interpolate_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using DataLayout = framework::DataLayout; + +inline std::vector get_new_shape_mlu( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + PADDLE_ENFORCE_EQ( + tensor->dims(), phi::make_ddim({1}), + platform::errors::InvalidArgument("shape of dim tensor should be [1]")); + framework::Tensor temp; + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_new_shape.push_back(static_cast(*temp.data())); + } + + return vec_new_shape; +} + +template +class InterpolateV2MLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + auto input_dims = input->dims(); + PADDLE_ENFORCE_GE( + input_dims.size(), 4, + platform::errors::External("MLU Interpolate kernel supports input " + "rank greater than or equal to 4.")); + PADDLE_ENFORCE_LE( + input_dims.size(), 5, + platform::errors::External("MLU Interpolate kernel supports input " + "rank less than or equal to 5.")); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + int align_center = align_corners ? 0 : (align_mode == 1 ?
0 : 1); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape_mlu(list_new_size_tensor); + if (new_size.size() <= 2) { + // default NCHW + out_h = new_size[0]; + out_w = new_size[1]; + } else { + // rank of input is 5, HCDHW + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + } else { + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + std::vector scale_data; + scale_data = GetDataFromTensor(scale_tensor); + + if (scale_data.size() > 1 && scale_data.size() <= 2) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else if (scale_data.size() > 2) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1 && scale.size() <= 2) { + scale_h = scale[0]; + scale_w = scale[1]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else if (scale.size() > 2) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + PADDLE_ENFORCE_EQ( + scale_d > 0 && scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + if (scale_d > 0.) { + out_d = static_cast(in_d * scale_d); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + std::vector out_size_data; + out_size_data = GetDataFromTensor(out_size); + if (out_size_data.size() <= 2) { + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } else { + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + } + } + PADDLE_ENFORCE_GT( + out_h, 0, + platform::errors::InvalidArgument("out_h in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, 0, + platform::errors::InvalidArgument("out_w in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); + + // do transpose according to cnnl's constraints + // cnnlInterp_v2 only accepts NHWC when mode is CNNL_INTERP_BILINEAR and + // CNNL_INTERP_NEAREST, + framework::DDim dim_in, dim_in_trans, dim_out, dim_out_trans; + Tensor transformed_input, transformed_output; + bool need_transpose = input_dims.size() != 2; + if (input_dims.size() == 4) { + // need to do transpose if layout is kNCHW + need_transpose &= data_layout == DataLayout::kNCHW; + if (need_transpose) { + // if need_transpose, do the following + // 1. transpose input NCHW -> NHWC + // 2. interpolation in(NHWC) -> out(NHWC) + // 3. 
transpose output NHWC -> HCHW + // dim_in = {n, c, in_h, in_w}; + dim_in_trans = {n, in_h, in_w, c}; + dim_out = {n, c, out_h, out_w}; + dim_out_trans = {n, out_h, out_w, c}; + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + // do transpose on input tensor, then do interpolation + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_NCHW, + ToCnnlDataType(input->dtype())); + + transformed_input = + ctx.AllocateTmpTensor(dim_in_trans, dev_ctx); + transformed_output = + ctx.AllocateTmpTensor(dim_out_trans, dev_ctx); + + MLUCnnlTensorDesc input_reshaped_desc( + transformed_input, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_input.dtype())); + const std::vector perm = {0, 2, 3, 1}; + MLUCnnl::Transpose(ctx, perm, input_dims.size(), input_desc.get(), + GetBasePtr(input), input_reshaped_desc.get(), + GetBasePtr(&transformed_input)); + } else { + // if no need_transpose, do the following + // 1. interpolation in(NHWC) -> out(NHWC) + // dim_in = {n, in_h, in_w, c}; + dim_out = {n, out_h, out_w, c}; + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + transformed_input = *input; + transformed_output = *output; + } + + MLUCnnlTensorDesc input_desc(transformed_input, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_input.dtype())); + MLUCnnlTensorDesc output_desc(transformed_output, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_output.dtype())); + MLUCnnl::Interp(ctx, GetMLUCnnlInterpMode(interp_method), align_corners, + align_center, input_desc.get(), + GetBasePtr(&transformed_input), output_desc.get(), + GetBasePtr(&transformed_output)); + + if (need_transpose) { + // if need_transpose, reshape output back to NCHW + const std::vector perm = {0, 3, 1, 2}; + MLUCnnlTensorDesc output_reshape_desc(*output, CNNL_LAYOUT_NCHW, + ToCnnlDataType(output->dtype())); + MLUCnnl::Transpose(ctx, perm, dim_out_trans.size(), output_desc.get(), + GetBasePtr(&transformed_output), + output_reshape_desc.get(), GetBasePtr(output)); + } + } else { + PADDLE_ENFORCE_EQ( + interp_method, "trilinear", + platform::errors::External("MLU Interpolate kernel only supports 5D " + "data in trilinear mode.")); + + // need to do transpose if layout is kNCDHW + need_transpose &= data_layout == DataLayout::kNCHW; + if (need_transpose) { + // if need_transpose, do the following + // 1. transpose input NCDHW -> NDHWC + // 2. interpolation in(NDHWC) -> out(NDHWC) + // 3. 
transpose output NDHWC -> HCDHW + // dim_in = {n, c, in_d, in_h, in_w}; + dim_in_trans = {n, in_d, in_h, in_w, c}; + dim_out = {n, c, out_d, out_h, out_w}; + dim_out_trans = {n, out_d, out_h, out_w, c}; + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w && in_d == out_d) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + // do transpose on input tensor (HCDHW -> NDHWC), then do interpolation + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_NCDHW, + ToCnnlDataType(input->dtype())); + + transformed_input = + ctx.AllocateTmpTensor(dim_in_trans, dev_ctx); + transformed_output = + ctx.AllocateTmpTensor(dim_out_trans, dev_ctx); + + MLUCnnlTensorDesc input_reshaped_desc( + transformed_input, CNNL_LAYOUT_NDHWC, + ToCnnlDataType(transformed_input.dtype())); + const std::vector perm = {0, 2, 3, 4, 1}; + MLUCnnl::Transpose(ctx, perm, input_dims.size(), input_desc.get(), + GetBasePtr(input), input_reshaped_desc.get(), + GetBasePtr(&transformed_input)); + } else { + // if no need_transpose, do the following + // 1. interpolation in(NDHWC) -> out(NDHWC) + // dim_in = {n, in_d, in_h, in_w, c}; + dim_out = {n, out_d, out_h, out_w, c}; + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w && in_d == out_d) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + transformed_input = *input; + transformed_output = *output; + } + + MLUCnnlTensorDesc input_desc(transformed_input, CNNL_LAYOUT_NDHWC, + ToCnnlDataType(transformed_input.dtype())); + MLUCnnlTensorDesc output_desc(transformed_output, CNNL_LAYOUT_NDHWC, + ToCnnlDataType(transformed_output.dtype())); + // use trilinear mode in HCDHW layout + MLUCnnl::Interp(ctx, GetMLUCnnlInterpMode(interp_method), align_corners, + align_center, input_desc.get(), + GetBasePtr(&transformed_input), output_desc.get(), + GetBasePtr(&transformed_output)); + + if (need_transpose) { + // if need_transpose, reshape output back (NDHWC -> NCDHW) + const std::vector perm = {0, 4, 1, 2, 3}; + MLUCnnlTensorDesc output_reshape_desc(*output, CNNL_LAYOUT_NCDHW, + ToCnnlDataType(output->dtype())); + MLUCnnl::Transpose(ctx, perm, dim_out_trans.size(), output_desc.get(), + GetBasePtr(&transformed_output), + output_reshape_desc.get(), GetBasePtr(output)); + } + } + } +}; + +template +class InterpolateV2GradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + + auto output_grad_dims = output_grad->dims(); + + PADDLE_ENFORCE_EQ(output_grad_dims.size(), 4, + platform::errors::External( + "XPU Interpolategrad kernel only support 2d")); + + auto* input = ctx.Input("X"); + auto input_dims = input->dims(); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + int align_center = align_corners ? 0 : (align_mode == 0 ? 
0 : 1); + align_center = 0; + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; + + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape_mlu(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + std::vector scale_data; + scale_data = GetDataFromTensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + std::vector out_size_data; + out_size_data = GetDataFromTensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + } + + framework::DDim dim_grad; + framework::DDim dim_out_grad, dim_out_trans_grad, dim_in_grad, + dim_in_trans_grad; + Tensor transformed_output_grad, transformed_input_grad; + bool need_transpose = + input_dims.size() != 2 && data_layout == DataLayout::kNCHW; + + if (need_transpose) { + // if need_transpose, do the following + // 1. transpose output_grad NCHW -> NHWC + // 2. InterpBackward output_grad(NHWC) -> input_grad(NHWC) + // 3. transpose input_grad NHWC -> HCHW + // dim_out_grad = {n, c, out_h, out_w}; + dim_out_trans_grad = {n, out_h, out_w, c}; + dim_in_grad = {n, c, in_h, in_w}; + dim_in_trans_grad = {n, in_h, in_w, c}; + input_grad->mutable_data(dim_in_grad, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); + return; + } + // do transpose on input tensor, then do interpolation + MLUCnnlTensorDesc input_desc(*output_grad, CNNL_LAYOUT_NCHW, + ToCnnlDataType(output_grad->dtype())); + + transformed_output_grad = ctx.AllocateTmpTensor( + dim_out_trans_grad, dev_ctx); + transformed_input_grad = ctx.AllocateTmpTensor( + dim_in_trans_grad, dev_ctx); + + MLUCnnlTensorDesc input_reshaped_desc( + transformed_output_grad, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_output_grad.dtype())); + const std::vector perm = {0, 2, 3, 1}; + MLUCnnl::Transpose(ctx, perm, input_dims.size(), input_desc.get(), + GetBasePtr(output_grad), input_reshaped_desc.get(), + GetBasePtr(&transformed_output_grad)); + } else { + // if no need_transpose, do the following + // 1. 
InterpBackward output_grad(NHWC) -> input_grad(NHWC) + dim_in_grad = {n, in_h, in_w, c}; + input_grad->mutable_data(dim_in_grad, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); + return; + } + transformed_output_grad = *output_grad; + transformed_input_grad = *input_grad; + } + + MLUCnnlTensorDesc input_desc( + transformed_output_grad, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_output_grad.dtype())); + MLUCnnlTensorDesc output_desc( + transformed_input_grad, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_input_grad.dtype())); + MLUCnnl::InterpBackward( + ctx, GetMLUCnnlInterpBackwardMode(interp_method), align_corners, + align_center, input_desc.get(), GetBasePtr(&transformed_output_grad), + output_desc.get(), GetBasePtr(&transformed_input_grad)); + + if (need_transpose) { + const std::vector perm = {0, 3, 1, 2}; + MLUCnnlTensorDesc output_reshape_desc( + *input_grad, CNNL_LAYOUT_NCHW, ToCnnlDataType(input_grad->dtype())); + MLUCnnl::Transpose(ctx, perm, dim_in_trans_grad.size(), output_desc.get(), + GetBasePtr(&transformed_input_grad), + output_reshape_desc.get(), GetBasePtr(input_grad)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(bilinear_interp_v2, ops::InterpolateV2MLUKernel, + ops::InterpolateV2MLUKernel); +REGISTER_OP_MLU_KERNEL(nearest_interp_v2, ops::InterpolateV2MLUKernel, + ops::InterpolateV2MLUKernel); + +REGISTER_OP_MLU_KERNEL(nearest_interp_v2_grad, + ops::InterpolateV2GradMLUKernel, + ops::InterpolateV2GradMLUKernel); +REGISTER_OP_MLU_KERNEL(bilinear_interp_v2_grad, + ops::InterpolateV2GradMLUKernel, + ops::InterpolateV2GradMLUKernel); diff --git a/paddle/fluid/operators/layer_norm_op_mlu.cc b/paddle/fluid/operators/layer_norm_op_mlu.cc index a368af86a3da6..919358febd2eb 100644 --- a/paddle/fluid/operators/layer_norm_op_mlu.cc +++ b/paddle/fluid/operators/layer_norm_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
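The new MLU interpolate kernel works by transposing NCHW tensors to NHWC (perm {0, 2, 3, 1}) before the cnnl interpolation call and transposing the result back (perm {0, 3, 1, 2}) afterwards, since cnnlInterp_v2 only accepts NHWC. A minimal CPU-only sketch of that layout conversion follows; the helper names are illustrative and not part of this patch, and the buffers are assumed to hold n * c * h * w contiguous elements.

#include <vector>

// Illustrative only: apply the permutation {0, 2, 3, 1} (NCHW -> NHWC)
// used before the interpolation call in the kernel above.
std::vector<float> TransposeNCHWToNHWC(const std::vector<float>& in,
                                       int n, int c, int h, int w) {
  std::vector<float> out(in.size());
  for (int ni = 0; ni < n; ++ni)
    for (int ci = 0; ci < c; ++ci)
      for (int hi = 0; hi < h; ++hi)
        for (int wi = 0; wi < w; ++wi)
          out[((ni * h + hi) * w + wi) * c + ci] =
              in[((ni * c + ci) * h + hi) * w + wi];
  return out;
}

// The inverse permutation {0, 3, 1, 2} (NHWC -> NCHW) restores the
// original layout after interpolation.
std::vector<float> TransposeNHWCToNCHW(const std::vector<float>& in,
                                       int n, int c, int h, int w) {
  std::vector<float> out(in.size());
  for (int ni = 0; ni < n; ++ni)
    for (int hi = 0; hi < h; ++hi)
      for (int wi = 0; wi < w; ++wi)
        for (int ci = 0; ci < c; ++ci)
          out[((ni * c + ci) * h + hi) * w + wi] =
              in[((ni * h + hi) * w + wi) * c + ci];
  return out;
}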
*/ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -122,6 +123,8 @@ class LayerNormMLUKernel : public framework::OpKernel { template class LayerNormGradMLUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); @@ -207,14 +210,14 @@ class LayerNormGradMLUKernel : public framework::OpKernel { if (dscale && (tmp_dscale.dtype() == DataType::FLOAT16 && dscale->dtype() == DataType::FLOAT32)) { - dscale->mutable_data(place); + dscale->mutable_data(place); MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), GetBasePtr(&tmp_dscale), float32_desc.get(), GetBasePtr(dscale)); } if (dbias && (tmp_dbias.dtype() == DataType::FLOAT16 && dbias->dtype() == DataType::FLOAT32)) { - dbias->mutable_data(place); + dbias->mutable_data(place); MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), GetBasePtr(&tmp_dbias), float32_desc.get(), GetBasePtr(dbias)); diff --git a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc index c8ab269c023a5..b69a52c761d4a 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc @@ -18,7 +18,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -constexpr int64_t kNoPadding = -1; template class LookupTableV2MLUKernel : public framework::OpKernel { @@ -27,6 +26,7 @@ class LookupTableV2MLUKernel : public framework::OpKernel { auto *ids_t = ctx.Input("Ids"); // int tensor auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); + int padding_idx = static_cast(ctx.Attr("padding_idx")); auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( @@ -38,43 +38,10 @@ class LookupTableV2MLUKernel : public framework::OpKernel { MLUCnnlTensorDesc table_desc(*table_t); MLUCnnlTensorDesc output_desc(*output_t); - int64_t padding_idx = ctx.Attr("padding_idx"); - if (padding_idx == kNoPadding) { - MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, - table_desc.get(), GetBasePtr(table_t), - ids_desc.get(), GetBasePtr(ids_t), - output_desc.get(), GetBasePtr(output_t)); - } else { - Tensor tmp_table_t(table_t->type()); - tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); - - Tensor index; - index.mutable_data({1, 1}, ctx.GetPlace()); - auto idx_value = static_cast(padding_idx); - MLUCnnlTensorDesc index_desc(index); - MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &idx_value, index_desc.get(), - GetBasePtr(&index)); - - auto update_dim = phi::make_ddim({1, table_t->dims()[1]}); - Tensor update; - update.mutable_data(update_dim, ctx.GetPlace()); - - auto update_value = static_cast(0); - MLUCnnlTensorDesc update_desc(update); - MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &update_value, - update_desc.get(), GetBasePtr(&update)); - - MLUCnnlTensorDesc tmp_table_desc(tmp_table_t); - MLUCnnl::ScatterNd( - ctx, CNNL_SCATTERND_UPDATE, index_desc.get(), GetBasePtr(&index), - update_desc.get(), GetBasePtr(&update), table_desc.get(), - GetBasePtr(table_t), tmp_table_desc.get(), GetBasePtr(&tmp_table_t)); - - MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, - tmp_table_desc.get(), GetBasePtr(&tmp_table_t), - ids_desc.get(), GetBasePtr(ids_t), - output_desc.get(), GetBasePtr(output_t)); - } + MLUCnnl::EmbeddingForward(ctx, 
padding_idx, table_desc.get(), + GetBasePtr(table_t), ids_desc.get(), + static_cast(GetBasePtr(ids_t)), + output_desc.get(), GetBasePtr(output_t)); } }; @@ -82,6 +49,16 @@ template class LookupTableV2GradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + auto *table_var = ctx.InputVar("W"); + PADDLE_ENFORCE_EQ(table_var->IsType(), true, + platform::errors::PermissionDenied( + "Unsupported Variable Type, idx in " + "LookupTableV2GradMLUKernel should be LoDTensor.")); + bool is_sparse = ctx.Attr("is_sparse"); + PADDLE_ENFORCE_EQ( + is_sparse, false, + platform::errors::InvalidArgument( + "LookupTableV2GradMLUKernel does NOT support is_sparse = True.")); auto *ids_t = ctx.Input("Ids"); + auto *output_grad_t = ctx.Input(framework::GradVarName("Out")); @@ -91,6 +68,13 @@ class LookupTableV2GradMLUKernel : public framework::OpKernel { int padding_idx = static_cast(ctx.Attr("padding_idx")); + int64_t ids_numel = ids_t->numel(); + PADDLE_ENFORCE_EQ( + ids_numel <= std::numeric_limits::max(), true, + platform::errors::OutOfRange( + "Number of ids is greater than int32_t::max, please check " + "number of ids in LookupTableV2GradMLUKernel.")); + Tensor ids_int32(ids_t->dtype()); if (ids_t->dtype() != DataType::INT32) { ids_int32.mutable_data(ids_t->dims(), ctx.GetPlace()); @@ -125,5 +109,4 @@ REGISTER_OP_MLU_KERNEL(lookup_table_v2, ops::LookupTableV2MLUKernel, REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad, ops::LookupTableV2GradMLUKernel, - ops::LookupTableV2GradMLUKernel, ops::LookupTableV2GradMLUKernel); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 33da631d27b14..bb3797d268291 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -262,7 +262,7 @@ class SoftmaxFunctor> { public: void operator()(const DeviceContext& context, const int axis_dim, const framework::Tensor* X, framework::Tensor* Y) { - auto in_dims = X->dims(); + const auto& in_dims = X->dims(); const float* in_data = X->data(); float* out_data = Y->data(); const int kBatchDim = 0; @@ -387,7 +387,7 @@ class SoftmaxGradFunctor> { void operator()(const DeviceContext& context, const int axis_dim, const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad) { - auto out_dims = y->dims(); + const auto& out_dims = y->dims(); constexpr int kBatchDim = 0; constexpr int kClassDim = 1; const int num_classes = out_dims[kClassDim]; diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index dc8301b9e0b8d..d5b843d47afb7 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -901,14 +901,11 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { cnnlAddN(handle, inputs_desc, inputs, input_num, output_desc, output)); } -/* static */ void MLUCnnl::Log(const ExecutionContext& ctx, - cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, - const void* input, - const cnnlTensorDescriptor_t output_desc, - void* output) { +/* static */ void MLUCnnl::Log( + const ExecutionContext& ctx, cnnlComputationPreference_t prefer, + cnnlLogBase_t log_base, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - cnnlLogBase_t log_base = CNNL_LOG_E; PADDLE_ENFORCE_MLU_SUCCESS(cnnlLog_v2(handle, prefer, log_base, input_desc, input, output_desc, output)); @@
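For reference, the removed Fill/ScatterNd/GatherFunctor sequence in the lookup_table_v2 forward emulated an embedding lookup whose padding_idx row reads back as zeros; the single EmbeddingForward call added above is expected to provide that behaviour directly. A rough CPU sketch of those semantics, with hypothetical names and ids assumed to be within range:

#include <vector>

// Illustrative CPU reference: embedding lookup with a padding index.
// padding_idx < 0 plays the role of the removed kNoPadding (-1) case.
std::vector<std::vector<float>> EmbeddingForwardRef(
    const std::vector<std::vector<float>>& table,  // [vocab][embedding_dim]
    const std::vector<int>& ids, int padding_idx) {
  const size_t width = table.empty() ? 0 : table[0].size();
  std::vector<std::vector<float>> out;
  out.reserve(ids.size());
  for (int id : ids) {
    if (padding_idx >= 0 && id == padding_idx) {
      out.emplace_back(width, 0.0f);  // the padding row reads back as zeros
    } else {
      out.push_back(table[id]);       // every other row is a plain gather
    }
  }
  return out;
}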
-1925,9 +1922,9 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS( - cnnlInterpBackward(handle, align_corners, half_pixel_centers, mode, - input_desc, input, output_desc, output)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlInterpBackward_v2( + handle, align_corners, half_pixel_centers, mode, NULL, true, input_desc, + input, output_desc, output)); } /* static */ void MLUCnnl::Cast(const ExecutionContext& ctx, @@ -2802,6 +2799,18 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { cnnlReciprocal(handle, input_desc, input, output_desc, output)); } +/* static */ void MLUCnnl::EmbeddingForward( + const ExecutionContext& ctx, const int padding_idx, + const cnnlTensorDescriptor_t weight_desc, const void* weight, + const cnnlTensorDescriptor_t indices_desc, const int* indices, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlEmbeddingForward_v2( + handle, weight_desc, weight, indices_desc, indices, padding_idx, + nullptr /*max_norm*/, nullptr /*norm_type*/, output_desc, output)); +} + /* static */ void MLUCnnl::EmbeddingBackward( const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, const cnnlTensorDescriptor_t indices_desc, const void* indices, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 774e297c06dd0..71648c5c5fbca 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -41,6 +41,20 @@ const std::map MLUReduceOpMap = { {"reduce_prod", CNNL_REDUCE_MUL}, }; +const std::map MLUInterpModeMap = { + {"bilinear", CNNL_INTERP_BILINEAR}, + {"nearest", CNNL_INTERP_NEAREST}, + {"linear", CNNL_INTERP_LINEAR}, + {"trilinear", CNNL_INTERP_TRILINEAR}, + {"bicubic", CNNL_INTERP_BICUBIC}}; + +const std::map MLUInterpBackwardModeMap = + {{"bilinear", CNNL_INTERP_BACKWARD_BILINEAR}, + {"nearest", CNNL_INTERP_BACKWARD_NEAREST}, + {"linear", CNNL_INTERP_BACKWARD_LINEAR}, + {"trilinear", CNNL_INTERP_BACKWARD_TRILINEAR}, + {"bicubic", CNNL_INTERP_BACKWARD_BICUBIC}}; + inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) { auto iter = MLUReduceOpMap.find(reduce_name); if (iter != MLUReduceOpMap.end()) { @@ -50,6 +64,25 @@ inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) { "Not support reduce op type of MLU Device: %s", reduce_name)); } +inline cnnlInterpMode_t GetMLUCnnlInterpMode(const std::string interp_mode) { + auto iter = MLUInterpModeMap.find(interp_mode); + if (iter != MLUInterpModeMap.end()) { + return iter->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support interp mode of MLU Device: %s", interp_mode)); +} + +inline cnnlInterpBackwardMode_t GetMLUCnnlInterpBackwardMode( + const std::string interp_mode) { + auto iter = MLUInterpBackwardModeMap.find(interp_mode); + if (iter != MLUInterpBackwardModeMap.end()) { + return iter->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support interp mode of MLU Device: %s", interp_mode)); +} + inline const void* GetBasePtr(const Tensor* t) { return t->data(); } inline void* GetBasePtr(Tensor* t) { return t->data(); } @@ -633,7 +666,7 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); static void Log(const ExecutionContext& ctx, - cnnlComputationPreference_t prefer, + cnnlComputationPreference_t prefer, cnnlLogBase_t log_base, const 
cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); @@ -1235,6 +1268,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); + static void EmbeddingForward( + const ExecutionContext& ctx, const int padding_idx, + const cnnlTensorDescriptor_t weight_desc, const void* weight, + const cnnlTensorDescriptor_t indices_desc, const int* indices, + const cnnlTensorDescriptor_t output_desc, void* output); + static void EmbeddingBackward( const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, const cnnlTensorDescriptor_t indices_desc, const void* indices, diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc index 9d335021234eb..36d0fb491a975 100644 --- a/paddle/fluid/operators/optimizers/adam_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -237,8 +237,8 @@ class AdamWMLUKernel : public AdamMLUKernel { ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } - VLOG(3) << "Skip update" << skip_update; bool with_decay = ctx.Attr("with_decay"); + VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay; if (!skip_update && with_decay) { if (ctx.HasInput("MasterParam")) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 30ead84d1a987..9aa68881e44a0 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -35,27 +35,8 @@ bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { auto src_tz = phi::vectorize(ctx.Input("X")->dims()); std::vector ksize = ctx.Attr>("ksize"); // Fast but not exhustive check - if ((src_tz[src_tz.size() - 1] % ksize[1] == 0) && - (src_tz[src_tz.size() - 2] % ksize[0] == 0)) - return true; - - // Exhustive check - auto IH = static_cast(src_tz[src_tz.size() - 2]); - auto IW = static_cast(src_tz[src_tz.size() - 1]); - auto OH = static_cast(ksize[0]); - auto OW = static_cast(ksize[1]); - - auto SH = static_cast(floor((IH * 2.0) / OH) - floor(IH / OH)); - auto SW = static_cast(floor((IW * 2.0) / OW) - floor(IW / OW)); - auto KH = static_cast(ceil((IH * 2.0) / OH) - floor(IH / OH)); - auto KW = static_cast(ceil((IW * 2.0) / OW) - floor(IW / OW)); - - auto PH = (SH * (static_cast(OH) - 1) + KH - static_cast(IH)); - auto PW = (SW * (static_cast(OW) - 1) + KW - static_cast(IW)); - // If there is additional padding needed then - // this is situation that oneDNN cannot comply with - // paddlepaddle reference implementation - return (PH == 0) && (PW == 0); + return ((src_tz[src_tz.size() - 1] % ksize[1] == 0) && + (src_tz[src_tz.size() - 2] % ksize[0] == 0)); } framework::OpKernelType PoolOp::GetExpectedKernelType( diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index bfd33efe833d2..42e8379bca4af 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -257,7 +257,12 @@ class RunProgramOpKernel : public framework::OpKernel { auto input_var_names = ctx.InputNames("X"); auto output_var_names = ctx.OutputNames("Out"); - auto dout_var_names = ctx.OutputNames("DOut"); + std::vector dout_var_names; + if (!dout_vars.empty()) { + // DOut is a dispensable out, only get the names when it exists. + // Otherwise, it will throw a NotFound error. 
+ dout_var_names = ctx.OutputNames("DOut"); + } // current program may not hold parameters std::vector param_names; @@ -272,10 +277,23 @@ class RunProgramOpKernel : public framework::OpKernel { // NOTE(chenweihang): In order not to add new variable type, use vector // here. Originally, here can use scope directly. auto *out_scope_vec = ctx.Output("OutScope"); - PADDLE_ENFORCE_EQ( - out_scope_vec->size(), 1, - platform::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should only hold one scope.")); + std::unique_ptr inner_scope{nullptr}; + if (out_scope_vec->size() == 0) { + // For cuda graph under static mode usage. + // For static mode, we cannot set value of a tensor before any run, + // the OutScope variable passed to the op actually contains nothing. + // Just create a tmp scope to run the program. + PADDLE_ENFORCE_EQ( + use_cuda_graph, true, + platform::errors::InvalidArgument( + "If not provide OutScope then must run under cuda graph mode.")); + inner_scope = std::make_unique(); + } else { + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + } // Step 2. prepare executor and init persistable variables @@ -284,9 +302,10 @@ class RunProgramOpKernel : public framework::OpKernel { // Learning. Tensor data in multi-step training should be saved into single // scope separately. Otherwise, the gradients can be miscalculated because // always using the Tensor data of the last step in forward. - framework::Scope *global_inner_scope = out_scope_vec->front(); + framework::Scope *global_inner_scope = + out_scope_vec->size() == 0 ? inner_scope.get() : out_scope_vec->front(); VLOG(2) << "The number of sub scopes before forward: " - << out_scope_vec->front()->kids().size(); + << global_inner_scope->kids().size(); framework::Scope &scope = global_inner_scope->NewScope(); // share input_vars & parameters into scope @@ -341,13 +360,19 @@ class RunProgramOpKernel : public framework::OpKernel { &scope); // Debug info: scope info when run end - VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + framework::Scope *target_scope{nullptr}; + if (out_scope_vec->size() == 0) { + target_scope = inner_scope.get(); + } else { + target_scope = out_scope_vec->front(); + } + VLOG(3) << framework::GenScopeTreeDebugInfo(target_scope); // Step 5. Drop all children scopes while testing. 
if (is_test) { - out_scope_vec->front()->DropKids(); + target_scope->DropKids(); } VLOG(2) << "The number of sub scopes after forward: " - << out_scope_vec->front()->kids().size(); + << target_scope->kids().size(); #ifdef PADDLE_WITH_MKLDNN if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace()); #endif diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index 409acdfdff7ba..06abd0628ea39 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -27,7 +27,7 @@ class ShuffleChannelOpKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = input->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index a815e12d061cf..edc72f4125eb6 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -104,7 +104,11 @@ class SliceOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); } - + for (auto &axis : axes) { + if (axis < 0) { + axis = std::max(0, axis + in_dims.size()); + } + } phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, nullptr, &infer_flags); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 7304467833a90..d6287f4c766ef 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -57,15 +57,16 @@ class SoftmaxOp : public framework::OperatorWithKernel { } #endif -#ifndef PADDLE_WITH_ASCEND_CL if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) || - platform::is_xpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "float16 can only be used on GPU/XPU place")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()) || + platform::is_npu_place(ctx.GetPlace()) || + platform::is_xpu_place(ctx.GetPlace()) || + platform::is_mlu_place(ctx.GetPlace()), + true, + platform::errors::InvalidArgument( + "float16 can only be used on GPU/NPU/XPU/MLU place")); } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); @@ -174,9 +175,10 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { if (input_data_type == framework::proto::VarType::FP16) { if (!(platform::is_gpu_place(ctx.GetPlace()) || platform::is_npu_place(ctx.GetPlace()) || - platform::is_xpu_place(ctx.GetPlace()))) + platform::is_xpu_place(ctx.GetPlace()) || + platform::is_mlu_place(ctx.GetPlace()))) PADDLE_THROW(platform::errors::InvalidArgument( - "float16 can only be used on GPU/NPU/XPU place")); + "float16 can only be used on GPU/NPU/XPU/MLU place")); } return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 4ef2a9709a59d..9d3d342431b78 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -28,6 +28,16 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + // After PR(#43206), cudnn related initializations will 
change to lazy mode. + // It will only be initialized when op calls them. But cuda graph not support + // capture such kind of init, need to init all these handle before cuda graph. + dev_ctx->cublas_handle(); +#if CUDA_VERSION >= 11060 + dev_ctx->cublaslt_handle(); +#endif + dev_ctx->cudnn_handle(); + dev_ctx->cusolver_dn_handle(); + auto stream = dev_ctx->stream(); CUDAGraph::BeginCapture(place, stream, mode); diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 82363fcff6349..65f5e81238bc8 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -92,6 +92,26 @@ std::unique_ptr DeserializationReader::Parse() { device_node); // insert into runtime_node } } + // handle mem node + for (int mem_node_index = 0; + mem_node_index < host_node_proto.mem_nodes_size(); + mem_node_index++) { + const MemTraceEventNodeProto& mem_node_proto = + host_node_proto.mem_nodes(mem_node_index); + MemTraceEventNode* mem_node = RestoreMemTraceEventNode(mem_node_proto); + host_node->AddMemNode(mem_node); + } + // handle op supplement node + for (int op_supplement_node_index = 0; + op_supplement_node_index < + host_node_proto.op_supplement_nodes_size(); + op_supplement_node_index++) { + const OperatorSupplementEventNodeProto& op_supplement_node_proto = + host_node_proto.op_supplement_nodes(op_supplement_node_index); + OperatorSupplementEventNode* op_supplement_node = + RestoreOperatorSupplementEventNode(op_supplement_node_proto); + host_node->SetOperatorSupplementNode(op_supplement_node); + } } // restore parent-child relationship for (auto it = child_parent_map.begin(); it != child_parent_map.end(); @@ -176,6 +196,62 @@ HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode( return new HostTraceEventNode(host_event); } +MemTraceEventNode* DeserializationReader::RestoreMemTraceEventNode( + const MemTraceEventNodeProto& mem_node_proto) { + const MemTraceEventProto& mem_event_proto = mem_node_proto.mem_event(); + MemTraceEvent mem_event; + mem_event.timestamp_ns = mem_event_proto.timestamp_ns(); + mem_event.addr = mem_event_proto.addr(); + mem_event.type = static_cast(mem_event_proto.type()); + mem_event.process_id = mem_event_proto.process_id(); + mem_event.thread_id = mem_event_proto.thread_id(); + mem_event.increase_bytes = mem_event_proto.increase_bytes(); + mem_event.place = mem_event_proto.place(); + mem_event.current_allocated = mem_event_proto.current_allocated(); + mem_event.current_reserved = mem_event_proto.current_reserved(); + return new MemTraceEventNode(mem_event); +} + +OperatorSupplementEventNode* +DeserializationReader::RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto& op_supplement_node_proto) { + const OperatorSupplementEventProto& op_supplement_event_proto = + op_supplement_node_proto.op_supplement_event(); + OperatorSupplementEvent op_supplement_event; + op_supplement_event.timestamp_ns = op_supplement_event_proto.timestamp_ns(); + op_supplement_event.op_type = op_supplement_event_proto.op_type(); + op_supplement_event.callstack = op_supplement_event_proto.callstack(); + op_supplement_event.process_id = op_supplement_event_proto.process_id(); + op_supplement_event.thread_id = op_supplement_event_proto.thread_id(); + std::map>> input_shapes; + std::map> dtypes; + auto input_shape_proto = op_supplement_event_proto.input_shapes(); + for (int i = 0; i < 
input_shape_proto.key_size(); i++) { + auto input_shape_vec = input_shapes[input_shape_proto.key(i)]; + auto shape_vectors_proto = input_shape_proto.shape_vecs(i); + for (int j = 0; j < shape_vectors_proto.shapes_size(); j++) { + auto shape_vector_proto = shape_vectors_proto.shapes(j); + std::vector shape; + for (int k = 0; k < shape_vector_proto.size_size(); k++) { + shape.push_back(shape_vector_proto.size(k)); + } + input_shape_vec.push_back(shape); + } + } + op_supplement_event.input_shapes = input_shapes; + auto dtype_proto = op_supplement_event_proto.dtypes(); + for (int i = 0; i < dtype_proto.key_size(); i++) { + auto dtype_vec = dtypes[dtype_proto.key(i)]; + auto dtype_vec_proto = dtype_proto.dtype_vecs(i); + for (int j = 0; j < dtype_vec_proto.dtype_size(); j++) { + auto dtype_string = dtype_vec_proto.dtype(j); + dtype_vec.push_back(dtype_string); + } + } + op_supplement_event.dtypes = dtypes; + return new OperatorSupplementEventNode(op_supplement_event); +} + KernelEventInfo DeserializationReader::HandleKernelEventInfoProto( const DeviceTraceEventProto& device_event_proto) { const KernelEventInfoProto& kernel_info_proto = diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index e6feb4f9489e8..7df93b7703c32 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -36,6 +36,9 @@ class DeserializationReader { KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&); MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&); MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&); + MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); + OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto&); std::string filename_; std::ifstream input_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 7016745059d40..0f0c9c92c9c93 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -46,6 +46,15 @@ enum TracerEventTypeProto { PythonOp = 13; // Used to mark python level userdefined PythonUserDefined = 14; + // Used to mark mlu runtime record returned by cnpapi + MluRuntime = 15; +}; + +enum TracerMemEventTypeProto { + // Used to mark memory allocation + Allocate = 0; + // Used to mark memory free + Free = 1; }; message KernelEventInfoProto { @@ -121,6 +130,58 @@ message HostTraceEventProto { required uint64 thread_id = 6; } +message MemTraceEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // memory manipulation type + required TracerMemEventTypeProto type = 2; + // memory addr of allocation or free + required uint64 addr = 3; + // process id of the record + required uint64 process_id = 4; + // thread id of the record + required uint64 thread_id = 5; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + required int64 increase_bytes = 6; + // place + required string place = 7; + // current total allocated memory + required uint64 current_allocated = 8; + // current total reserved memory + required uint64 current_reserved = 9; +} + +message OperatorSupplementEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // op type name + 
required string op_type = 2; + // process id of the record + required uint64 process_id = 3; + // thread id of the record + required uint64 thread_id = 4; + // input shapes + message input_shape_proto { + repeated string key = 1; + message shape_vector { + message shape { repeated uint64 size = 1; } + repeated shape shapes = 1; + } + repeated shape_vector shape_vecs = 2; + } + required input_shape_proto input_shapes = 5; + // dtypes + message dtype_proto { + repeated string key = 1; + message dtype_vector { repeated string dtype = 1; } + repeated dtype_vector dtype_vecs = 2; + } + required dtype_proto dtypes = 6; + // call stack + required string callstack = 7; +} + message CudaRuntimeTraceEventProto { // record name required string name = 1; @@ -166,6 +227,12 @@ message DeviceTraceEventProto { } } +message OperatorSupplementEventNodeProto { + required OperatorSupplementEventProto op_supplement_event = 1; +} + +message MemTraceEventNodeProto { required MemTraceEventProto mem_event = 1; } + message DeviceTraceEventNodeProto { required DeviceTraceEventProto device_event = 1; } @@ -180,6 +247,9 @@ message HostTraceEventNodeProto { required int64 parentid = 2; required HostTraceEventProto host_trace_event = 3; repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4; + // below is added in version 1.0.1 + repeated MemTraceEventNodeProto mem_nodes = 5; + repeated OperatorSupplementEventNodeProto op_supplement_nodes = 6; } message ThreadNodeTreeProto { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index b8afe2af0e776..eaf1353168ea4 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -20,7 +20,7 @@ namespace paddle { namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; -static const char* version = "1.0.0"; +static const char* version = "1.0.1"; static uint32_t span_indx = 0; static std::string DefaultFileName() { @@ -106,10 +106,33 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { (*devicenode)->LogMe(this); // fill detail information } } + for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin(); + memnode != (*hostnode)->GetMemTraceEventNodes().end(); ++memnode) { + MemTraceEventNodeProto* mem_node_proto = + current_host_trace_event_node_proto_->add_mem_nodes(); + current_mem_trace_event_node_proto_ = mem_node_proto; + (*memnode)->LogMe(this); + } } } } +void SerializationLogger::LogMemTraceEventNode( + const MemTraceEventNode& mem_node) { + MemTraceEventProto* mem_trace_event = new MemTraceEventProto(); + mem_trace_event->set_timestamp_ns(mem_node.TimeStampNs()); + mem_trace_event->set_type( + static_cast(mem_node.Type())); + mem_trace_event->set_addr(mem_node.Addr()); + mem_trace_event->set_process_id(mem_node.ProcessId()); + mem_trace_event->set_thread_id(mem_node.ThreadId()); + mem_trace_event->set_increase_bytes(mem_node.IncreaseBytes()); + mem_trace_event->set_place(mem_node.Place()); + mem_trace_event->set_current_allocated(mem_node.CurrentAllocated()); + mem_trace_event->set_current_reserved(mem_node.CurrentReserved()); + current_mem_trace_event_node_proto_->set_allocated_mem_event(mem_trace_event); +} + void SerializationLogger::LogHostTraceEventNode( const HostTraceEventNode& host_node) { HostTraceEventProto* host_trace_event = new HostTraceEventProto(); @@ -122,6 +145,59 @@ void SerializationLogger::LogHostTraceEventNode( 
host_trace_event->set_thread_id(host_node.ThreadId()); current_host_trace_event_node_proto_->set_allocated_host_trace_event( host_trace_event); + OperatorSupplementEventNode* op_supplement_event_node = + host_node.GetOperatorSupplementEventNode(); + if (op_supplement_event_node != nullptr) { + current_op_supplement_event_node_proto_ = + current_host_trace_event_node_proto_->add_op_supplement_nodes(); + OperatorSupplementEventProto* op_supplement_event_proto = + new OperatorSupplementEventProto(); + op_supplement_event_proto->set_op_type(op_supplement_event_node->Name()); + op_supplement_event_proto->set_timestamp_ns( + op_supplement_event_node->TimeStampNs()); + op_supplement_event_proto->set_process_id( + op_supplement_event_node->ProcessId()); + op_supplement_event_proto->set_thread_id( + op_supplement_event_node->ThreadId()); + op_supplement_event_proto->set_callstack( + op_supplement_event_node->CallStack()); + + OperatorSupplementEventProto::input_shape_proto* input_shape_proto = + op_supplement_event_proto->mutable_input_shapes(); + for (auto it = op_supplement_event_node->InputShapes().begin(); + it != op_supplement_event_node->InputShapes().end(); it++) { + input_shape_proto->add_key(it->first); + OperatorSupplementEventProto::input_shape_proto::shape_vector* + shape_vectors_proto = input_shape_proto->add_shape_vecs(); + auto shape_vectors = it->second; + for (auto shape_vecs_it = shape_vectors.begin(); + shape_vecs_it != shape_vectors.end(); shape_vecs_it++) { + auto shape_vector = *shape_vecs_it; + OperatorSupplementEventProto::input_shape_proto::shape_vector::shape* + shape_proto = shape_vectors_proto->add_shapes(); + for (auto shape_it = shape_vector.begin(); + shape_it != shape_vector.end(); shape_it++) { + shape_proto->add_size(*shape_it); + } + } + } + + OperatorSupplementEventProto::dtype_proto* dtype_proto = + op_supplement_event_proto->mutable_dtypes(); + for (auto it = op_supplement_event_node->Dtypes().begin(); + it != op_supplement_event_node->Dtypes().end(); it++) { + dtype_proto->add_key(it->first); + OperatorSupplementEventProto::dtype_proto::dtype_vector* + dtype_vector_proto = dtype_proto->add_dtype_vecs(); + auto dtype_vector = it->second; + for (auto dtype_it = dtype_vector.begin(); dtype_it != dtype_vector.end(); + dtype_it++) { + dtype_vector_proto->add_dtype(*dtype_it); + } + } + current_op_supplement_event_node_proto_->set_allocated_op_supplement_event( + op_supplement_event_proto); + } } void SerializationLogger::LogRuntimeTraceEventNode( diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 378834cff590d..31910cb68c5d7 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -34,6 +34,7 @@ class SerializationLogger : public BaseLogger { void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; void LogMetaInfo(const std::unordered_map); + void LogMemTraceEventNode(const MemTraceEventNode&) override; private: void OpenFile(); @@ -48,6 +49,8 @@ class SerializationLogger : public BaseLogger { HostTraceEventNodeProto* current_host_trace_event_node_proto_; CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_; DeviceTraceEventNodeProto* current_device_trace_event_node_proto_; + MemTraceEventNodeProto* current_mem_trace_event_node_proto_; + OperatorSupplementEventNodeProto* current_op_supplement_event_node_proto_; }; } 
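The input_shapes structure serialized above is a map from input name to a list of shapes, and the logger flattens it into the parallel key / shape_vecs fields of input_shape_proto (the i-th shape_vector holds every shape recorded for the i-th key). A stand-alone sketch of that flattening, using illustrative types that are not part of the patch:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Mirror of the proto layout: one entry in `keys` per input name, and the
// matching entry in `shape_vecs` carries all shapes seen for that input.
struct FlatInputShapes {
  std::vector<std::string> keys;
  std::vector<std::vector<std::vector<uint64_t>>> shape_vecs;
};

FlatInputShapes Flatten(
    const std::map<std::string, std::vector<std::vector<uint64_t>>>& shapes) {
  FlatInputShapes flat;
  for (const auto& kv : shapes) {
    flat.keys.push_back(kv.first);         // e.g. "X"
    flat.shape_vecs.push_back(kv.second);  // e.g. {{1, 2, 3}, {4, 5, 6, 7}}
  }
  return flat;
}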
// namespace platform diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 002071de0d1ef..dc6a6bf32d6e3 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -34,6 +34,7 @@ using paddle::platform::ProfilerResult; using paddle::platform::RuntimeTraceEvent; using paddle::platform::SerializationLogger; using paddle::platform::TracerEventType; +using paddle::platform::TracerMemEventType; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -50,6 +51,19 @@ TEST(SerializationLoggerTest, dump_case0) { std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); host_events.push_back(HostTraceEvent( std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); + mem_events.push_back(MemTraceEvent(11500, 0x1000, + TracerMemEventType::Allocate, 10, 10, 50, + "GPU:0", 50, 50)); + mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free, + 10, 10, -50, "GPU:0", 0, 50)); + std::map>> input_shapes; + std::map> dtypes; + input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); + input_shapes[std::string("X")].push_back(std::vector{4, 5, 6, 7}); + dtypes[std::string("X")].push_back(std::string("int8")); + dtypes[std::string("X")].push_back(std::string("float32")); + op_supplement_events.push_back(OperatorSupplementEvent( + 11600, "op1", input_shapes, dtypes, "op1()", 10, 10)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, @@ -91,6 +105,8 @@ TEST(SerializationLoggerTest, dump_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { @@ -100,6 +116,7 @@ TEST(SerializationLoggerTest, dump_case0) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(SerializationLoggerTest, dump_case1) { @@ -154,6 +171,7 @@ TEST(SerializationLoggerTest, dump_case1) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(DeserializationReaderTest, restore_case0) { @@ -173,6 +191,8 @@ TEST(DeserializationReaderTest, restore_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index abde62c6b1444..4e40e87bbbf20 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -32,6 +32,9 @@ HostPythonNode::~HostPythonNode() { for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { delete *it; } + for (auto it = mem_node_ptrs.begin(); it != mem_node_ptrs.end(); ++it) { + delete *it; + } } HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { @@ -77,6 +80,29 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { 
runtime_python_node->device_node_ptrs.push_back(device_python_node); } } + // copy MemTraceEventNode + for (auto memnode = root->GetMemTraceEventNodes().begin(); + memnode != root->GetMemTraceEventNodes().end(); memnode++) { + MemPythonNode* mem_python_node = new MemPythonNode(); + mem_python_node->timestamp_ns = (*memnode)->TimeStampNs(); + mem_python_node->addr = (*memnode)->Addr(); + mem_python_node->type = (*memnode)->Type(); + mem_python_node->process_id = (*memnode)->ProcessId(); + mem_python_node->thread_id = (*memnode)->ThreadId(); + mem_python_node->increase_bytes = (*memnode)->IncreaseBytes(); + mem_python_node->place = (*memnode)->Place(); + mem_python_node->current_allocated = (*memnode)->CurrentAllocated(); + mem_python_node->current_reserved = (*memnode)->CurrentReserved(); + host_python_node->mem_node_ptrs.push_back(mem_python_node); + } + // copy OperatorSupplementEventNode's information if exists + OperatorSupplementEventNode* op_supplement_node = + root->GetOperatorSupplementEventNode(); + if (op_supplement_node != nullptr) { + host_python_node->input_shapes = op_supplement_node->InputShapes(); + host_python_node->dtypes = op_supplement_node->Dtypes(); + host_python_node->callstack = op_supplement_node->CallStack(); + } return host_python_node; } diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index 172116dbb0edd..4d1f5ad4f788e 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -43,6 +43,31 @@ struct DevicePythonNode { uint64_t stream_id; }; +struct MemPythonNode { + MemPythonNode() = default; + ~MemPythonNode() {} + + // timestamp of the record + uint64_t timestamp_ns; + // memory addr of allocation or free + uint64_t addr; + // memory manipulation type + TracerMemEventType type; + // process id of the record + uint64_t process_id; + // thread id of the record + uint64_t thread_id; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + int64_t increase_bytes; + // place + std::string place; + // current total allocated memory + uint64_t current_allocated; + // current total reserved memory + uint64_t current_reserved; +}; + struct HostPythonNode { HostPythonNode() = default; ~HostPythonNode(); @@ -58,12 +83,19 @@ struct HostPythonNode { uint64_t process_id; // thread id of the record uint64_t thread_id; + // input shapes + std::map>> input_shapes; + std::map> dtypes; + // call stack + std::string callstack; // children node std::vector children_node_ptrs; // runtime node std::vector runtime_node_ptrs; // device node std::vector device_node_ptrs; + // mem node + std::vector mem_node_ptrs; }; class ProfilerResult { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index c75ac0b52c52c..311ad7b48ed7b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -384,7 +384,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, require_any_grad || egr::EagerUtils::ComputeRequireGrad( trace_backward, &(ins_auto_grad_metas[i])); } - if (require_any_grad) { + if (require_any_grad && (vec_map.size() > 1)) { VLOG(6) << " Construct Grad for Custom Op: " << op_type; ConstructFwdAndBwdMap(vec_map, op_type); for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index 2f4bbd5df352c..a58c6cc5b86ef 100644 --- a/paddle/infrt/api/infrt_api.cc +++ 
b/paddle/infrt/api/infrt_api.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off #include "paddle/infrt/api/infrt_api.h" #include @@ -31,6 +30,7 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" @@ -51,18 +51,14 @@ #include "paddle/infrt/kernel/test_kernels.h" #include "paddle/infrt/tensor/tensor_map.h" -#include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h" - #if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) -#include "paddle/infrt/kernel/tensorrt/registry.h" - #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" +#include "paddle/infrt/kernel/tensorrt/registry.h" #endif -// clang-format on using namespace infrt::host_context; // NOLINT using namespace infrt::tensor; // NOLINT diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 8dec818a80a27..eebcbbbcbc698 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -14,17 +14,14 @@ #pragma once -// clang-format off #include #include #include #include -#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" - #include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #define GET_OP_CLASSES #include "paddle/infrt/dialect/dense_tensor.hpp.inc" -// clang-format on diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h index f7358db5bf356..9e09cdde502b7 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h @@ -14,7 +14,6 @@ #pragma once -// clang-format off #include #include #include @@ -30,12 +29,10 @@ #include #include +#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc" - -#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc" -// clang-format on diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc index 24af0ea437875..530d0981f1e8e 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// clang-format off +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" + #include #include + #include -#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/phi/kernels/declarations.h" -// clang-format on namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 899e71f1c990f..d1ce1c1b562df 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off #include #include @@ -27,18 +26,15 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" - #include "paddle/infrt/host_context/core_runtime.h" #include "paddle/infrt/host_context/kernel_registry.h" #include "paddle/infrt/host_context/mlir_to_runtime_translate.h" - #include "paddle/infrt/kernel/basic_kernels.h" #include "paddle/infrt/kernel/control_flow_kernels.h" #include "paddle/infrt/kernel/tensor_kernels.h" #include "paddle/infrt/kernel/tensor_shape_kernels.h" -#include "paddle/infrt/kernel/test_kernels.h" - #include "paddle/infrt/kernel/tensorrt/registry.h" +#include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" @@ -48,7 +44,6 @@ #endif #include -// clang-format on int main(int argc, char** argv) { static llvm::cl::opt input_file( diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 161fbbbcc65a5..8e39fea4cd8ec 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -12,21 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// clang-format off #include "paddle/infrt/dialect/tensorrt/trt_ops.h" + #include #include #include #include #include #include -#include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" +#include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" -// clang-format on +#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/external_kernels/CMakeLists.txt b/paddle/infrt/external_kernels/CMakeLists.txt index 96cfe2b73d8cd..9b318872e7551 100644 --- a/paddle/infrt/external_kernels/CMakeLists.txt +++ b/paddle/infrt/external_kernels/CMakeLists.txt @@ -10,6 +10,6 @@ message(STATUS "external_kernels_lib: ${external_kernels_lib}") add_test( NAME run_and_check_external_kernels COMMAND - sh -c - "${CMAKE_BINARY_DIR}/infrt/host_context/infrtexec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}" -) + sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrtexec -i ${basic_mlir} \ + --shared_libs=${external_kernels_lib} | \ + ${LLVM_PATH}/bin/FileCheck ${basic_mlir}") diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index 0ea68f2e835f7..931fe21b2c710 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off #include "paddle/infrt/kernel/tensorrt/trt_kernels.h" + #include #include + #include "NvInfer.h" #include "NvInferRuntime.h" #include "NvInferRuntimeCommon.h" @@ -27,17 +28,14 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" - -#include "paddle/infrt/kernel/tensorrt/trt_helper.h" -#include "paddle/infrt/kernel/tensorrt/trt_layers.h" - #include "paddle/infrt/backends/tensorrt/trt_engine.h" #include "paddle/infrt/backends/tensorrt/trt_options.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" #include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" +#include "paddle/infrt/kernel/tensorrt/trt_layers.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" -// clang-format on namespace infrt { namespace kernel { diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index bc41a24c44562..072ab6fd68a1a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -402,7 +402,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out, - std::vector inner_cache) { + std::vector inner_cache, + std::vector xshape) { // collect the following informations to prepare einsum. 
LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); @@ -439,6 +440,12 @@ void EinsumInferMeta(const std::vector& inputs, VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape); out->set_dims(make_ddim(output_dims)); out->set_dtype(inputs[0]->dtype()); + for (size_t i = 0; i < xshape.size(); ++i) { + if (xshape[i] != nullptr) { + xshape[i]->set_dims(inputs[i]->dims()); + xshape[i]->set_dtype(inputs[i]->dtype()); + } + } } void ExpandInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a0cad3e628e3f..f64d406e019ce 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -83,7 +83,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out, - std::vector inner_cache); + std::vector inner_cache, + std::vector xshape); void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, diff --git a/paddle/phi/kernels/einsum_kernel.h b/paddle/phi/kernels/einsum_kernel.h index 87df2b1c64a4a..569cf7a55afd4 100644 --- a/paddle/phi/kernels/einsum_kernel.h +++ b/paddle/phi/kernels/einsum_kernel.h @@ -29,6 +29,7 @@ void EinsumKernelRaw(const Context& dev_ctx, const std::vector& inputs, const std::string& equation, DenseTensor* out, - std::vector cache); + std::vector inner_cache, + std::vector xshape); } // namespace phi diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index a72db326807f8..a04185a0c53ed 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -177,7 +177,6 @@ void EinsumGradKernel(const Context& dev_ctx, cache[0].ShareBufferWith(*(inner_cache[0])); cache[1].ShareBufferWith(*(inner_cache[1])); } - EinsumKernelImpl(dev_ctx, all_labels, operands_for_A, diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index f3521c81ce46b..43b2760b404f9 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -459,7 +459,7 @@ DenseTensor PerformContraction( } // reduction DenseTensor trans_t; - if (FLAGS_einsum_opt && use_cache && cache[operand_idx] != nullptr && + if (use_cache && cache[operand_idx] != nullptr && cache[operand_idx]->IsInitialized()) { trans_t.ShareBufferWith(*(cache[operand_idx])); VLOG(5) << "Cache Used!"; @@ -468,7 +468,7 @@ DenseTensor PerformContraction( dev_ctx, t, perm, all_labels, ellipsis, label2type); trans_t = PerformTranspose( dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); - if (FLAGS_einsum_opt && cache[operand_idx] != nullptr) + if (cache[operand_idx] != nullptr) cache[operand_idx]->ShareBufferWith(trans_t); } auto mul_dims = GetShapeByType(all_labels, @@ -599,6 +599,11 @@ void EinsumKernelImpl(const Context& dev_ctx, out); // Reshape Procedure } else if (inputs.size() == 1) { + if (cache[0] != nullptr) { // For compatibility, may be cache is nullptr if + // loading the program from v2.3.0 + (*cache[0]) = *(inputs[0]); // ShareBuffer for backward, because backward + // we can only see cached tensor. + } auto reduce_A = PerformReduction(dev_ctx, *inputs[0], label2perms[0], @@ -627,7 +632,8 @@ void EinsumKernelRaw(const Context& dev_ctx, const std::vector& inputs, const std::string& equation, DenseTensor* out, - std::vector cache) { + std::vector cache, + std::vector xshape) { std::vector tmp; // for the sake of compatibility, we may load and run v2.3 EinsumOp. 
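Note on the einsum changes above: the new `xshape` outputs added to `EinsumInferMeta` (and threaded through `EinsumKernelRaw` and the op signature) are filled by simply mirroring each input's dims and dtype, presumably so shape information is still available to the backward pass when a saved program is reloaded. A minimal Python sketch of that rule follows; `Meta` is a toy stand-in for PHI's MetaTensor, not the real API.

```python
class Meta:
    """Toy stand-in for phi::MetaTensor: only dims and dtype matter here."""
    def __init__(self, dims=None, dtype=None):
        self.dims, self.dtype = dims, dtype

def infer_einsum_xshape(inputs, xshapes):
    # Mirrors the new loop in EinsumInferMeta: every non-null XShape output
    # copies the dims/dtype of the corresponding input; programs saved by
    # older versions may not carry XShape at all, hence the None check.
    for inp, xshape in zip(inputs, xshapes):
        if xshape is not None:
            xshape.dims, xshape.dtype = inp.dims, inp.dtype
    return xshapes

ins = [Meta((2, 3), "float32"), Meta((3, 4), "float32")]
outs = infer_einsum_xshape(ins, [Meta(), None])
print([x.dims if x else None for x in outs])   # [(2, 3), None]
```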
Output // may have nullptr and the cache.size() is not equal to inputs.size(). refer diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 73935640e349b..4b4a75727a55c 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -442,8 +442,14 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, // (5) dx = dout * ddy if (ddout) { auto& place = *dev_ctx.eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx.get_ptr()->numel()) { + // size(ddout) > size(ddx) or we don't have ddx, ddout can't use memory of + // ddx using inplace + + bool without_ddx = (ddx.get_ptr() == nullptr); + if (!without_ddx) { + without_ddx = (ddout->numel() > ddx.get_ptr()->numel()); + } + if (without_ddx) { phi::funcs::ElemwiseGradCompute, MulGradDY>( dev_ctx, ddx_safe, diff --git a/paddle/phi/kernels/impl/lerp_kernel_impl.h b/paddle/phi/kernels/impl/lerp_kernel_impl.h index 58759308fac41..72fa0672a5f48 100644 --- a/paddle/phi/kernels/impl/lerp_kernel_impl.h +++ b/paddle/phi/kernels/impl/lerp_kernel_impl.h @@ -28,7 +28,7 @@ static void LerpFunction(const Context& ctx, DenseTensor* out) { ctx.template Alloc(out); - auto out_dims = out->dims(); + const auto& out_dims = out->dims(); auto x_dims = phi::funcs::ExtendDims2Rank(x.dims(), D); auto y_dims = phi::funcs::ExtendDims2Rank(y.dims(), D); auto w_dims = phi::funcs::ExtendDims2Rank(weight.dims(), D); diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc index 5e45bcf97ce0e..4fd31c1a2d842 100644 --- a/paddle/phi/ops/compat/einsum_sig.cc +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -18,7 +18,7 @@ namespace phi { KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature( - "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache"}); + "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache", "XShape"}); } KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 2a18d2f7e0195..f538193782179 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -680,7 +680,12 @@ pip install requests set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^ %THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^ -%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% +%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;^ +%THIRD_PARTY_PATH:/=\%\install\onnxruntime\lib;%THIRD_PARTY_PATH:/=\%\install\paddle2onnx\lib;^ +%work_dir%\%BUILD_DIR%\paddle\fluid\inference;%PATH% + +REM TODO: make ut find .dll in install\onnxruntime\lib +xcopy %THIRD_PARTY_PATH:/=\%\install\onnxruntime\lib\onnxruntime.dll %work_dir%\%BUILD_DIR%\paddle\fluid\inference\tests\api\ /Y if "%WITH_GPU%"=="ON" ( call:parallel_test_base_gpu diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 3ed5f992ed40c..9e4aac55f5d2d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -955,6 +955,9 @@ function fetch_upstream_develop_if_not_exist() { } function check_whl_size() { + if [ ${BRANCH} != 'develop' ];then + return + fi set +x pr_whl_size=`du -m ${PADDLE_ROOT}/build/pr_whl/*.whl|awk '{print $1}'` @@ -1094,6 
+1097,10 @@ function check_approvals_of_unittest() { fi fi elif [ $check_times == 3 ]; then + if [ ${BRANCH} != 'develop' ];then + return + fi + rm -f fluidInference_so_size curl -O https://paddle-docker-tar.bj.bcebos.com/paddle_ci_index/fluidInference_so_size oriBuildSize=`cat fluidInference_so_size` @@ -3276,6 +3283,10 @@ function build_develop() { } function check_coverage_build() { + if [ ${BRANCH} != 'develop' ];then + return + fi + rm -f build_size curl -O https://paddle-docker-tar.bj.bcebos.com/paddle_ci_index/build_size curl -O https://xly-devops.bj.bcebos.com/PR/build_whl/${AGILE_PULL_ID}/${AGILE_REVISION}/coverage_build_size diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py index dca32fb6bb85b..5c9c8740d85bb 100644 --- a/python/paddle/device/cuda/graphs.py +++ b/python/paddle/device/cuda/graphs.py @@ -14,7 +14,10 @@ import os import paddle +from paddle.fluid import core +from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace +import warnings if is_compiled_with_cuda() and not is_compiled_with_rocm(): from paddle.fluid.core import CUDAGraph as CoreCUDAGraph @@ -106,3 +109,335 @@ def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"): else: mock_func._cuda_graph_pool_id = memory_pool._cuda_graph_pool_id return new_function + + +def copy_var_desc(dst, src): + """ + copy var desc from src to dst + + :param dst: framework.VarDesc(cpp), dst var desc, cpp VarDesc instance + :param src: framework.VarDesc(cpp), src var desc, cpp VarDesc instance + :return: no return + """ + dst.set_shape(src.shape) + dst.set_dtype(src.dtype) + dst.set_lod_level(src.lod_level) + dst.set_type(src.type) + dst.set_persistable(src.persistable) + dst.set_is_parameter(src.is_parameter) + dst.set_stop_gradient(src.stop_gradient) + + +def all_inputs_of_later_op(block, begin_idx): + """ + find all inputs of ops after an idx, used to determine the logical output of a cuda graph section + + :param block: framework.Block, the original block + :param begin_idx: int, from which idx (not include) to find the later ins + :return: a list of inputs names for all ops behind begin_idx + """ + ins = [] + for idx, op in enumerate(block.ops): + if idx <= begin_idx: + continue + for in_name in op.input_arg_names: + ins.append(in_name) + return list(set(ins)) + + +def construct_program_and_find_ins_outs(section, origin_program, section_idx): + """ + 1. Construct a new program for corresponding section + 2. 
Find all the logical inputs and outputs of a program section + + :param section: list, one cuda graph section, list of ops + :param origin_program: framework.Program, origin program + :param section_idx: list, the section ops' idx corresponding to the cuda graph section, a list of idx + :return: a new program for the cuda graph section + the logical ins and outs of the cuda graph section + """ + program = paddle.static.Program() + block = program.global_block() + origin_block = origin_program.global_block() + ins = [] + outs = [] + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + later_ins = all_inputs_of_later_op(origin_block, section_idx[-1]) + + for op in section: + for in_name in op.input_arg_names: + var = origin_block.var(in_name) + new_var_desc = block.desc.var(var.name.encode("ascii")) + copy_var_desc(new_var_desc, var) + if outs.count(in_name) == 0 and ins.count(in_name) == 0: + # This in var is generated from op outside this section + # Only record once for same input + ins.append(in_name) + elif later_ins.count(in_name) == 0: + # this is var is generated from op inside this section, and only will be used inside this section + outs.remove(in_name) + for out_name in op.output_arg_names: + var = origin_block.var(out_name) + new_var_desc = block.desc.var(var.name.encode("ascii")) + copy_var_desc(new_var_desc, var) + # for every output, we add it to the section's outs + if outs.count(out_name) == 0: + # Only record one out var even if it will be generated by multi ops. + # For scenario like this: + # A = op1(a) + # A = op2(b) + # B = op3(A) + outs.append(out_name) + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(op.desc) + new_op_desc._set_attr(op_role_attr_name, op.attr(op_role_attr_name)) + + program._sync_with_cpp() + + return program, [ins, outs] + + +def get_cuda_graph_sections(program): + """ + get all sections that should run under cuda graph and the corresponding idx + + :param program: framework.Program, the original program + :return: A list of cuda graph sections and the corresponding ops' idx in the block. + The program is under is test or not. + """ + block = program.global_block() + cuda_graph_sections = [] # record all ops in every cuda graph sections + sections_idx = [] # idx of all ops in every cuda graph sections + is_test = False # will be set to True is any op's 'is_test' attr is True + + # ops and it's idx between cuda graph wrapped op, may belong to a section + internal_section = [] + internal_idx = [] + + current_section = [] # current recording cuda graph sections + current_idx = [] # current recording cuda graph ops' idx + current_cuda_graph_id = -1 # current recording cuda graph id + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + loss_op_role = int(core.op_proto_and_checker_maker.OpRole.Loss) + backward_op_role = int(core.op_proto_and_checker_maker.OpRole.Backward) + loss_grad_op_role = loss_op_role | backward_op_role + + for idx, op in enumerate(block.ops): + if op.type == 'conditional_block' or op.type == 'while': + assert op._cuda_graph_attr is None, "Cuda graph not support conditional block op and while op." 
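The input/output classification implemented by `construct_program_and_find_ins_outs` above boils down to: a name consumed inside the section is a logical input unless the section itself produced it, every produced name is tentatively an output, and a produced name is dropped from the outputs again if nothing after the section (`later_ins`) reads it. A self-contained sketch of the same rule over plain name lists, with illustrative data:

```python
def find_section_io(section_ops, later_inputs):
    """section_ops: list of (input_names, output_names) per op, in order.
    later_inputs: names read by ops that come after the section."""
    ins, outs = [], []
    for op_ins, op_outs in section_ops:
        for name in op_ins:
            if name not in outs and name not in ins:
                ins.append(name)            # produced outside the section
            elif name in outs and name not in later_inputs:
                outs.remove(name)           # purely internal temporary
        for name in op_outs:
            if name not in outs:
                outs.append(name)           # record each output once
    return ins, outs

ops = [(["x"], ["t"]), (["t"], ["y"])]      # x -> t -> y inside the section
print(find_section_io(ops, later_inputs=["y"]))   # (['x'], ['y'])
```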
+ if op.has_attr('is_test') and op.attr('is_test'): + is_test = True + # find cuda graph sections + if op._cuda_graph_attr is not None: + assert isinstance(op._cuda_graph_attr, + str), "cuda_graph_attr should be a str" + cuda_graph_attrs = op._cuda_graph_attr.split(';') + assert len(cuda_graph_attrs) == 3, "cuda graph attr should have three fields: " \ + "cuda graph mode, cuda graph memory pool id, cuda graph id" + local_cuda_graph_id = int(cuda_graph_attrs[2]) + if local_cuda_graph_id == current_cuda_graph_id: + if len(internal_section) > 0: + assert len(internal_section) == len( + internal_idx + ), "len of internal section should be equal with len of internal idx" + for internal_op in internal_section: + loss_related = (int(internal_op.attr(op_role_attr_name)) + == loss_op_role) or int( + (internal_op.attr(op_role_attr_name) + ) == loss_grad_op_role) + sub_block_related = (op.type == 'conditional_block' + or op.type == 'while') + if loss_related or sub_block_related: + # if loss_related is True + # The internal section contains loss related ops, + # although these ops are between two cuda graph sections with same graph id, + # they belong to none of these two sections. + # The loss related op should be wrapped by user explicitly. + + # if sub_block_related is True + # The internal section contains while op or conditional block op. + # These two ops are not supported by cuda graph. Won't extend the section. + internal_section = [] + internal_idx = [] + # Beside clear the internal section, a new cuda graph section should be recorded + assert len(current_section) == len(current_idx), \ + "num of section's op is not equal with the idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + current_section = [] + current_idx = [] + break + # some ops inserted by some optimizer, should be added to current section + for i in range(len(internal_section)): + current_section.append(internal_section[i]) + current_idx.append(internal_idx[i]) + internal_section = [] + current_section.append(op) + current_idx.append(idx) + else: + # current graph id is different with previous, start a new section of cuda graph + # internal ops and idx belong to no section, just clear it + internal_section = [] + internal_idx = [] + current_cuda_graph_id = local_cuda_graph_id # start record a new section + assert len(current_section) == len( + current_idx + ), "num of section's op is not equal with num of idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + current_section = [op] + current_idx = [idx] + else: + # recode ops which cuda_graph_attr is None, may belong to a section + internal_section.append(op) + internal_idx.append(idx) + + # handle the last section + assert len(current_section) == len( + current_idx), "num of section's op is not equal with num of idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + + return cuda_graph_sections, sections_idx, is_test + + +def replace_cuda_graph_section(ins_and_outs, section_program, section_idx, + origin_program, cuda_graph_section, order, + is_test): + """ + Use section_program and ins_and_outs to initialize a run_program_op, + and replace the section_idx marks ops in the origin program. 
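The sectioning done by `get_cuda_graph_sections` above can be summarized as: walk the block once, group consecutive ops that carry the same cuda graph id (the third field of `_cuda_graph_attr`, formatted as `mode;pool_id;graph_id`), and absorb unmarked ops that sit between two runs of the same id, since optimizers such as AMP or sharding may insert them. The real function additionally closes a section early when those in-between ops are loss-related or contain control flow; the stripped-down sketch below shows just the grouping, with `None` marking an unmarked op:

```python
def group_sections(graph_ids):
    """graph_ids: per-op cuda graph id, or None for ops without the attr.
    Returns one list of op indices per section."""
    sections, current, pending, cur_id = [], [], [], None
    for idx, gid in enumerate(graph_ids):
        if gid is None:
            pending.append(idx)             # may belong to the running section
            continue
        if gid == cur_id:
            current.extend(pending)         # absorb ops inserted in between
        else:
            if current:
                sections.append(current)    # close the previous section
            current, cur_id = [], gid
        pending = []
        current.append(idx)
    if current:
        sections.append(current)
    return sections

print(group_sections([0, None, 0, None, 1, 1]))   # [[0, 1, 2], [4, 5]]
```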
+ + :param ins_and_outs: list, the logical ins and outs of the section program + :param section_program: framework.Program, the partial program need to run under cuda graph + :param section_idx: list, the idx need to be removed from origin program + :param origin_program: framework.Program, the origin program + :param cuda_graph_section: list, the ops in current sections, used to get the mode, memory pool id and is_test + :param order: int, the order of current section, used to create unique cuda graph var + :param is_test: bool, the program is running under is_test or not + :return: no return + """ + ins = ins_and_outs[0] + outs = ins_and_outs[1] + insert_idx = section_idx[0] + origin_block = origin_program.global_block() + + for idx in reversed(section_idx): + # remove all cuda graph marked ops from origin block + origin_block._remove_op(idx, sync=False) + + mode = None + memory_pool_id = None + + for op in cuda_graph_section: + # find the cuda graph mode and memory pool id, determine is test or not + if op._cuda_graph_attr is not None: + attrs = op._cuda_graph_attr.split(';') + mode = attrs[0] + memory_pool_id = int(attrs[1]) + break + + assert mode is not None and memory_pool_id is not None, \ + "mode and memory pool id should be specified in cuda graph attr" + + cuda_graph_var = origin_block.create_var( + name="cuda_graph_" + str(order), + type=core.VarDesc.VarType.RAW, + persistable=True, + stop_gradient=True, + ) + + # not used for the run_program_op, just needed by the op, but won't be used + out_scope_var = origin_block.create_var( + name="program_out_scope_" + str(order), + type=core.VarDesc.VarType.STEP_SCOPES, + persistable=True, + stop_gradient=True, + ) + + program_id = _hash_with_id(section_program, ins_and_outs) + + # insert the run_program_op into the block + origin_block._insert_op(insert_idx, + type='run_program', + inputs={'X': ins}, + outputs={ + 'Out': outs, + 'OutScope': out_scope_var, + 'CUDAGraph': cuda_graph_var + }, + attrs={ + 'global_block': + section_program.global_block(), + 'start_op_index': + 0, + 'end_op_index': + len(section_program.global_block().ops), + 'is_test': + is_test, + 'program_id': + program_id, + 'cuda_graph_capture_mode': + mode, + 'cuda_graph_pool_id': + memory_pool_id, + }) + + +def cuda_graph_transform(program): + """ + replace the ops marked with cuda_graph_attr to run_program_op to use cuda graph + + :param program: framework.Program, the program to be transformed + :return: the cuda graph section program, user should hold these programs! + """ + + if len(program.blocks) > 1: + # some sub blocks may be inserted by optimizer but will not use during training, just warn here + warnings.warn( + "Sub block(s) has been detected in the program. " + "Cuda graph not support op with sub block, and it will only handle the global block." + ) + + # step 1: get all cuda graph sections. + # A cuda graph section contains all ops marked with same cuda graph id and + # some ops inserted by some optimizers (amp, sharding for example) between ops with same id. + cuda_graph_sections, sections_idx, is_test = get_cuda_graph_sections( + program) + assert len(cuda_graph_sections) == len(sections_idx), \ + "num of cuda graph sections is not equal with num of idx sections" + + # step 2: construct new program for each section and find inputs and outputs of each section. + # The inputs are variables generated outside the section but will be used by this section. + # The outputs are variables generated by this section and will be used after the end of the section. 
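`replace_cuda_graph_section` then deletes the section's ops from the original block and drops a single `run_program` op in their place, with the section's ins/outs as `X`/`Out`, the capture mode and memory pool id parsed from `_cuda_graph_attr`, and a persistable RAW `CUDAGraph` variable plus a step-scope output. One detail worth calling out is that the removal loop iterates `section_idx` in reverse, because deleting from the front would shift the indices that are still pending; a plain-list illustration:

```python
# Deleting in reverse keeps the not-yet-deleted indices valid; deleting in
# forward order would shift everything after the removed position.
ops = ["fill_constant", "matmul", "relu", "reduce_mean"]
section_idx = [1, 2]                     # ops folded into one run_program op
for idx in reversed(section_idx):
    del ops[idx]
ops.insert(section_idx[0], "run_program")
print(ops)                               # ['fill_constant', 'run_program', 'reduce_mean']
```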
+ ins_and_outs = [] + section_programs = [] + for i in range(len(cuda_graph_sections)): + # creating new program for current section + section_program, ins_outs = construct_program_and_find_ins_outs( + cuda_graph_sections[i], program, sections_idx[i]) + ins_and_outs.append(ins_outs) + section_programs.append(section_program) + assert len(section_programs) == len(cuda_graph_sections), \ + "the num of cuda graph sections should be equal with the num of new program" + + # step 3: replace the ops in original program with run_program_op. + # Will remove all ops in the section from origin program, and use run_program_op to replace them. + for i in reversed(range(len(cuda_graph_sections))): + # carry out the replacement in reversed order, to keep the previous idx intact + replace_cuda_graph_section(ins_and_outs[i], + section_programs[i], + sections_idx[i], + program, + cuda_graph_sections[i], + order=i, + is_test=is_test) + + # NOTE: user should hold these program, for now just return these program back to caller + return section_programs diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index a0b2125f16642..dcdd098dcd9cc 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -17,6 +17,7 @@ from collections import defaultdict import paddle +import paddle.utils as utils import paddle.distributed.auto_parallel as auto from paddle import fluid, static @@ -26,9 +27,9 @@ from paddle.fluid import core from paddle.fluid import program_guard from paddle.fluid.layers.utils import flatten -from paddle.fluid.executor import global_scope +from paddle.fluid.executor import global_scope, _to_name_str from paddle.fluid.backward import append_backward -from paddle.fluid.framework import Operator, Variable +from paddle.fluid.framework import Operator from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed import fleet @@ -137,7 +138,8 @@ def _build(self): metrics = [] serial_main_prog = self._orig_main_prog.clone() serial_startup_prog = self._orig_startup_prog.clone() - with static.program_guard(serial_main_prog, serial_startup_prog): + with static.program_guard(serial_main_prog, serial_startup_prog), \ + utils.unique_name.guard(): inputs_spec = self.inputs_spec labels_spec = self.labels_spec if self.labels_spec else [] inputs = [s._create_feed_layer() for s in inputs_spec] @@ -256,7 +258,7 @@ def fit(self, train_data, batch_size=1, epochs=1, - fetch_list=None, + fetches=None, steps_per_epoch=None, use_program_cache=False, return_numpy=True): @@ -267,134 +269,131 @@ def fit(self, "train model is not ready, please call `engine.prepare()` first." 
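A small but easy-to-miss change above is that `Engine._build` now wraps program construction in `utils.unique_name.guard()` in addition to `program_guard`. A hedged reading: the guard opens a fresh auto-naming scope, so the generated variable names do not depend on whatever was created earlier in the process and repeated builds stay consistent. The behaviour itself is easy to verify:

```python
from paddle.utils import unique_name

# Each guard starts a fresh name generator, so the same prefix yields the
# same name in every build instead of drifting to fc_1, fc_2, ...
with unique_name.guard():
    print(unique_name.generate("fc"))   # fc_0
with unique_name.guard():
    print(unique_name.generate("fc"))   # fc_0 again
```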
train_dataloader = self._create_dataloader(train_data, batch_size, epochs, steps_per_epoch) - self._usr_fetch_list = fetch_list - outputs = [] + usr_fetch = self._to_map_fetch(fetches) + fetch_loss = self._inner_fetch(self.fetch_vars["loss"]) + fetch_list, fetch_map = self._fetch_map(fetch_loss, usr_fetch) + for epoch in range(epochs): - for step, data in enumerate(train_dataloader): - logs, outs = self._train_step(data, use_program_cache, - return_numpy) - outputs.append(outs) - train_logs = { - "train_" + name: val - for name, val in logs.items() - } + train_logs = {"epoch": epoch} + for step, _ in enumerate(train_dataloader): + outs = self._executor.run(self.main_program, + fetch_list=fetch_list, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + train_logs["step"] = step + # inner fetches + if fetch_loss: + train_logs["train_loss"] = outs[0][0] + # user fetches + user_outs = outs[len(fetch_loss):] + user_fetch_list = fetch_list[len(fetch_loss):] + for i, out in enumerate(user_outs): + train_logs["train_" + + fetch_map[user_fetch_list[i]]] = out[0] self._logger.info(train_logs) - return outputs def evaluate(self, eval_data, batch_size=1, - fetch_list=None, + fetches=None, use_program_cache=False, return_numpy=True): self.mode = 'eval' assert self.mode in self._dist_main_progs, \ "eval model is not ready, please call `engine.prepare()` first." eval_dataloader = self._create_dataloader(eval_data, batch_size) - self._usr_fetch_list = fetch_list - - for step, data in enumerate(eval_dataloader): - eval_logs = dict() - logs, outs = self._eval_step(data, use_program_cache, return_numpy) - eval_logs["eval_loss"] = outs[0] if len(outs) > 0 else [] - for metric in self._metrics: - results = metric.accumulate() - for i, res in enumerate(to_list(results)): - eval_logs["eval_" + metric.name()[i]] = res - for name, val in logs.items(): - eval_logs["eval_" + name] = val + + usr_fetch = self._to_map_fetch(fetches) + fetch_loss = self._inner_fetch(self.fetch_vars["loss"]) + fetch_metrics = self._inner_fetch(self.fetch_vars["metrics"]) + inner_fetch = dict(fetch_loss, **fetch_metrics) + fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) + + for step, _ in enumerate(eval_dataloader): + eval_logs = {"step": step} + outs = self._executor.run(self.main_program, + fetch_list=fetch_list, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + # inner fetches + if fetch_loss: + eval_logs["eval_loss"] = outs[0] + # Metric + if fetch_metrics: + metric_out = outs[len(fetch_loss):len(inner_fetch)] + for metric in self._metrics: + metric.update(*metric_out) + results = metric.accumulate() + for i, res in enumerate(to_list(results)): + eval_logs["eval_" + metric.name()[i]] = res + # usr fetches + usr_out = outs[len(inner_fetch):] + usr_fetch_list = fetch_list[len(inner_fetch):] + for i, out in enumerate(usr_out): + eval_logs["eval_" + fetch_map[usr_fetch_list[i]]] = out + # logger self._logger.info(eval_logs) - return eval_logs def predict(self, test_data, batch_size=1, - fetch_list=None, + fetches=None, use_program_cache=False, return_numpy=True): self.mode = 'predict' assert self.mode in self._dist_main_progs, \ "predict model is not ready, please call `engine.prepare()` first." 
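In the rewritten fit/evaluate/predict loops above, `Executor.run` returns one flat list aligned with `fetch_list`, and the code recovers the groups by slicing with the known sizes: inner loss values first, then metric tensors, then user fetches. A trivial standalone illustration of that convention (values are made up):

```python
def split_run_results(outs, num_loss, num_inner):
    """loss first, metrics next, user-requested fetches last."""
    return outs[:num_loss], outs[num_loss:num_inner], outs[num_inner:]

# one loss value, one metric tensor, one user fetch
print(split_run_results([[0.31], [0.92], [7]], num_loss=1, num_inner=2))
# ([[0.31]], [[0.92]], [[7]])
```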
test_dataloader = self._create_dataloader(test_data, batch_size) - self._usr_fetch_list = fetch_list + + usr_fetch = self._to_map_fetch(fetches) + fetch_outputs = self._inner_fetch(self.fetch_vars["outputs"]) + fetch_list, fetch_map = self._fetch_map(fetch_outputs, usr_fetch) outputs = [] - for step, data in enumerate(test_dataloader): - logs, outs = self._predict_step(data, use_program_cache, - return_numpy) - outputs.append(outs) - predict_logs = {"pred_" + name: val for name, val in logs.items()} + for step, _ in enumerate(test_dataloader): + predict_logs = {"step": step} + outs = self._executor.run(self.main_program, + fetch_list=fetch_list, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + outputs.append(outs[:len(fetch_outputs)]) + for i, out in enumerate(outs): + predict_logs["pred_" + fetch_map[fetch_list[i]]] = out[0] self._logger.info(predict_logs) + return outputs - def _train_step(self, data, use_program_cache=False, return_numpy=True): - logs = {} - fetch_vars = self._fetch_vars[self.mode]["loss"] - fetch_list, usr_fetch_list = self._fetch_list(fetch_vars) - fetch_list += usr_fetch_list - - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_program_cache, - return_numpy=return_numpy) - for i, out in enumerate(outs): - logs[fetch_list[i]] = out - return logs, outs - - def _eval_step(self, data, use_program_cache=False, return_numpy=True): - logs = {} - metrics = self._fetch_vars[self.mode]["metrics"] - losses = self._fetch_vars[self.mode]["loss"] - fetch_loss, usr_fetch_list = self._fetch_list(losses) - fetch_metrics, usr_fetch_list = self._fetch_list(metrics) - fetch_list = fetch_loss + fetch_metrics - - outs = self._executor.run(self.main_program, - fetch_list=fetch_list + usr_fetch_list, - use_program_cache=use_program_cache, - return_numpy=return_numpy) - usr_out = outs[len(fetch_list):] - for i, out in enumerate(usr_out): - logs[usr_fetch_list[i]] = out - outs = outs[:len(fetch_list)] - if not outs[len(fetch_loss):]: - return logs, outs[:len(fetch_loss)] - for metric in self._metrics: - metric.update(*outs[len(fetch_loss):]) - return logs, outs[:len(fetch_loss)] - - def _predict_step(self, data, use_program_cache=False, return_numpy=True): - logs = {} - fetch_vars = self._fetch_vars[self.mode]["outputs"] - fetch_list, usr_fetch_list = self._fetch_list(fetch_vars) - fetch_list += usr_fetch_list - - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_program_cache, - return_numpy=return_numpy) - for i, out in enumerate(outs): - logs[fetch_list[i]] = out - return logs, outs - - def _fetch_list(self, fetch_vars): - fetch_list = [] - for var in fetch_vars: - if var.name in self.main_program.global_block().vars: - fetch_list.append(var.name) - usr_fetch_list = [] - if self._usr_fetch_list: - assert isinstance(self._usr_fetch_list, - list), "'fetch_list' type should be list." 
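The removed `_train_step`/`_eval_step`/`_predict_step` and `_fetch_list` helpers are replaced by the three mapping utilities defined just below: `_to_map_fetch` normalizes the user's `fetches` argument (a list of variables/names, or a dict keyed by display name) into `{var_name: display_name}` and filters out anything not present in the local program, `_inner_fetch` does the same for the engine's own loss/metric/output variables, and `_fetch_map` merges the two so a user-chosen display name overrides the default. A rough dict-only sketch of the merge, with an illustrative variable name:

```python
def to_map_fetch(fetches):
    """Normalize user fetches into {var_name: display_name}."""
    if not fetches:
        return {}
    if isinstance(fetches, dict):                # {"display_name": var_or_name}
        return {v: k for k, v in fetches.items()}
    return {name: name for name in fetches}      # plain list

def fetch_map(inner, user):
    """Merge engine fetches with user fetches; user display names win."""
    merged = dict(inner)
    merged.update(user)
    return list(merged.keys()), merged

inner = {"mean_0.tmp_0": "mean_0.tmp_0"}         # hypothetical loss var name
user = to_map_fetch({"loss": "mean_0.tmp_0"})
print(fetch_map(inner, user))                    # (['mean_0.tmp_0'], {'mean_0.tmp_0': 'loss'})
```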
- for var in self._usr_fetch_list: - if isinstance(var, str): - if var in self.main_program.global_block().vars: - usr_fetch_list.append(var) - elif isinstance(var, Variable): - if var.name in self.main_program.global_block().vars: - usr_fetch_list.append(var.name) - return fetch_list, usr_fetch_list + def _local_var(self, var): + var_name = _to_name_str(var) + return var_name in self.main_program.global_block().vars + + def _to_map_fetch(self, fetches): + if not fetches: + return {} + if isinstance(fetches, dict): + fetch_var_names = list(map(_to_name_str, fetches.values())) + usr_fetches = dict(zip(fetch_var_names, list(fetches.keys()))) + elif isinstance(fetches, list): + fetch_var_names = list(map(_to_name_str, fetches)) + usr_fetches = dict(zip(fetch_var_names, fetch_var_names)) + return dict(filter(lambda x: self._local_var(x[0]), + usr_fetches.items())) + + def _inner_fetch(self, fetch_vars): + fetch_list = list( + map(lambda x: x.name, list(filter(self._local_var, fetch_vars)))) + inner_fetches = dict(zip(fetch_list, fetch_list)) + return inner_fetches + + def _fetch_map(self, inner_fetch, usr_fetch): + # replace inner fetch name if usr set for it + for iname in inner_fetch: + if iname in usr_fetch: + inner_fetch[iname] = usr_fetch[iname] + usr_fetch.pop(iname) + fetches = dict(inner_fetch, **usr_fetch) + return list(fetches.keys()), fetches def _create_dataloader(self, dataset, @@ -515,7 +514,8 @@ def save(self, path, training=True, mode=None): mode = self.mode if training: - assert 'train' in self._serial_main_progs, "training model is not ready, please call `engine.prepare(mode='train')` first." + assert 'train' in self._serial_main_progs, \ + "training model is not ready, please call `engine.prepare()` first." serial_program = self._serial_main_progs["train"] dist_main_prog = self._dist_main_progs["train"][self._cur_rank] dist_context = self._dist_contexts["train"] @@ -571,3 +571,7 @@ def serial_main_program(self): @property def serial_startup_program(self): return self._serial_startup_progs[self.mode] + + @property + def fetch_vars(self): + return self._fetch_vars[self.mode] diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 762b961da53ba..d41f0fbb84570 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -1811,8 +1811,8 @@ def unscale_method(self, optimizer): if (param._grad_ivar() is not None) and ( param._grad_ivar().dtype == core.VarDesc.VarType.FP32) ] - temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) - temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) if len(param_grads_fp16): _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, param_grads_fp16, diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index b1e0f6cc13068..fcbbadbe12159 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -200,8 +200,8 @@ def unscale_method(self, optimizer): else: param_grads_fp32.append(param.grad) - temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) - temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + 
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) device = "cpu" if optimizer.offload else "gpu" dev_id = 0 if device == "cpu" else int( diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index ae98d4bdf7b1e..63e2b91b3d9bd 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -201,8 +201,8 @@ def unscale_method(self, optimizer): else: param_grads_fp32.append(param.grad) - temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) - temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) device = "cpu" if optimizer.offload else "gpu" dev_id = 0 if device == "cpu" else int( diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 423536b095a40..1f4439cf1171f 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -73,8 +73,6 @@ class EagerRecomputeFunction(EagerPyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker - if framework._dygraph_tracer()._has_grad: - check_recompute_necessary(args) # store for recomputing ctx.run_function = run_function @@ -211,8 +209,6 @@ class RecomputeFunction(PyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker - if framework._dygraph_tracer()._has_grad: - check_recompute_necessary(args) # store for recomputing ctx.run_function = run_function @@ -466,6 +462,9 @@ def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs)) + if framework._dygraph_tracer()._has_grad: + check_recompute_necessary(args) + if in_dygraph_mode(): return EagerRecomputeFunction.apply(function, preserve, *args) else: diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index 7df7db28f7877..f03d0ea3d41ef 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -21,6 +21,7 @@ class DeviceType: XPU = 'xpu' NPU = 'npu' MLU = 'mlu' + IPU = 'ipu' class Device(object): @@ -69,6 +70,8 @@ def get_selected_device_key(self): return 'FLAGS_selected_xpus' if self._dtype == DeviceType.MLU: return 'FLAGS_selected_mlus' + if self._dtype == DeviceType.IPU: + return 'FLAGS_selected_ipus' return 'FLAGS_selected_devices' def get_selected_devices(self, devices=''): @@ -130,6 +133,12 @@ def detect_device(self): dev._dtype = DeviceType.MLU num = fluid.core.get_mlu_device_count() visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_ipu(): + dev._dtype = DeviceType.IPU + num = fluid.core.get_ipu_device_count() + # For IPUs, 'labels' is a list which contains the available numbers of IPU devices. 
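The three `unscale_method` fixes above replace `np.bool` with `np.bool_`: `np.bool` was only a deprecated alias for Python's built-in `bool` (deprecated in NumPy 1.20 and removed in later releases), so `astype(np.bool)` warns or fails on newer NumPy, while `np.bool_` is the actual NumPy boolean scalar type and keeps working:

```python
import numpy as np

# np.bool_ is the real NumPy boolean type; the removed np.bool alias would
# raise AttributeError on recent NumPy versions.
found_inf = np.array([0]).astype(np.bool_)
print(found_inf, found_inf.dtype)   # [False] bool
```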
+ dev._labels = [str(x) for x in range(0, num + 1)] + return dev if num == 0: dev._dtype = DeviceType.CPU diff --git a/python/paddle/distributed/launch/controllers/__init__.py b/python/paddle/distributed/launch/controllers/__init__.py index f1c6ea5399a46..c686164dbd884 100644 --- a/python/paddle/distributed/launch/controllers/__init__.py +++ b/python/paddle/distributed/launch/controllers/__init__.py @@ -17,9 +17,11 @@ from .collective import CollectiveController from .collective import CollectiveElasticController from .ps import PSController +from .ipu_controller import IPUController # the order is extremely important _controllers = [ + IPUController, CollectiveElasticController, PSController, CollectiveController, diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index a8ae155562ae9..1f43679d748f1 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -29,6 +29,7 @@ class ControleMode: COLLECTIVE = "collective" PS = "ps" + IPU = "ipu" class ControllerBase(object): diff --git a/python/paddle/distributed/launch/controllers/ipu_controller.py b/python/paddle/distributed/launch/controllers/ipu_controller.py new file mode 100644 index 0000000000000..92dc2960ab624 --- /dev/null +++ b/python/paddle/distributed/launch/controllers/ipu_controller.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import argparse + +from .collective import CollectiveController, ControleMode +from paddle.distributed.launch.job.container import Container + + +class IPUController(CollectiveController): + + @classmethod + def enable(cls, ctx): + if ctx.args.training_script == "ipu": + ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.run_mode = ControleMode.IPU + return True + else: + return False + + def parse_ipu_args(self, args_list): + parser = argparse.ArgumentParser() + parser.add_argument("--hosts", + type=str, + help="The hosts for IPU distributd training.") + parser.add_argument("--nproc_per_host", + type=int, + help="The number of processes launched per host.") + parser.add_argument("--ipus_per_replica", + type=int, + help="The number of IPUs requested per replica.") + parser.add_argument("--ipu_partition", + type=str, + help="The partition name of IPU devices.") + parser.add_argument("--vipu_server", + type=str, + help="The ip of the IPU device manager.") + parser.add_argument( + "training_script", + type=str, + help= + "The full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``." + ) + parser.add_argument('training_script_args', nargs=argparse.REMAINDER) + return parser.parse_args(args_list) + + def replace_training_script(self): + # IPU distributed computing is based on PopRun which is a wrapper of MPI. 
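`IPUController` is placed at the head of `_controllers` because, per the "order is extremely important" comment, the launcher appears to use the first controller whose `enable(ctx)` accepts the job; since the IPU path keys on `training_script == "ipu"`, it must be tried before the generic collective/PS controllers, which would otherwise claim the run. A schematic of that dispatch pattern (not the real launch internals):

```python
class CollectiveLike:
    @classmethod
    def enable(cls, ctx):
        return True                       # generic fallback accepts anything

class IPULike:
    @classmethod
    def enable(cls, ctx):
        return ctx.get("script") == "ipu"

def pick(controllers, ctx):
    """First controller that accepts the context wins, so order matters."""
    for c in controllers:
        if c.enable(ctx):
            return c

print(pick([IPULike, CollectiveLike], {"script": "ipu"}).__name__)  # IPULike
print(pick([CollectiveLike, IPULike], {"script": "ipu"}).__name__)  # CollectiveLike
```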
+ self.ctx.args.training_script = "poprun" + poprun_args = self.parse_ipu_args(self.ctx.args.training_script_args) + + num_ipus = int(self.ctx.args.devices) + # The number of replicas for data parallel + assert (num_ipus % poprun_args.ipus_per_replica) == 0, \ + "The number of IPUs:{} mod the number of IPUs per replica:{} must == 0".format(num_ipus, poprun_args.ipus_per_replica) + num_replicas = num_ipus // poprun_args.ipus_per_replica + self.ctx.logger.info( + "The number of total replicas is {}.".format(num_replicas)) + + # The number of processes + num_nodes = len(poprun_args.hosts.split(',')) + num_procs = num_nodes * poprun_args.nproc_per_host + self.ctx.logger.info( + "The number of total processes is {}.".format(num_procs)) + assert (num_replicas % num_procs) == 0, \ + "The number of replicas:{} mod the number of processes:{} must == 0".format(num_replicas, num_procs) + + # hosts and endpoints + hosts = poprun_args.hosts.replace(' ', '').split(',') + endpoints = [x + ":8090" for x in hosts] + + # args for poprun + poprun_command = [] + + poprun_command.append('--num-instances={}'.format(num_procs)) + poprun_command.append('--num-replicas={}'.format(num_replicas)) + poprun_command.append('--ipus-per-replica={}'.format( + poprun_args.ipus_per_replica)) + poprun_command.append('--host={}'.format(','.join(hosts))) + poprun_command.append('--vipu-partition={}'.format( + poprun_args.ipu_partition)) + poprun_command.append('--vipu-server-host={}'.format( + poprun_args.vipu_server)) + + poprun_command.extend([ + '--update-partition=no', '--vipu-server-timeout=120', + '--print-topology=yes', '--numa-aware=yes' + ]) + + # global envs + global_envs = '--mpi-local-args=\'' + log_level = os.getenv('POPART_LOG_LEVEL', None) + if log_level: + global_envs += '-x POPART_LOG_LEVEL={} '.format(log_level) + global_envs += '-x PADDLE_TRAINERS_NUM={} -x PADDLE_TRAINER_ENDPOINTS={}'.format( + num_procs, ','.join(endpoints)) + global_envs += '\'' + poprun_command.append(global_envs) + + # local envs + for idx in range(num_procs): + cur_endpoint = endpoints[idx // poprun_args.nproc_per_host] + rank_in_node = idx % poprun_args.nproc_per_host + poprun_command.append( + '--instance-mpi-local-args={}:\"-x PADDLE_TRAINER_ID={} -x PADDLE_CURRENT_ENDPOINT={} -x PADDLE_RANK_IN_NODE={}\"' + .format(idx, idx, cur_endpoint, rank_in_node)) + + # executor + poprun_command.append(sys.executable) + + # script and script args + poprun_command.append(poprun_args.training_script) + poprun_command.extend(poprun_args.training_script_args) + + # for debug + print("----------- PopRun Command -----------") + print("poprun \\") + for i in range(len(poprun_command) - 1): + print("%s \\" % (poprun_command[i])) + print("%s" % (poprun_command[len(poprun_command) - 1])) + print("---------------------------------------") + + # replace training_script_args + self.ctx.args.training_script_args = poprun_command + + def _get_entrypoint(self): + entrypoint = [self.ctx.args.training_script] + entrypoint.extend(self.ctx.args.training_script_args) + entrypoint = [" ".join(entrypoint)] + return entrypoint + + def new_container(self, + entrypoint=None, + envs={}, + use_ctx_env=True, + out=None, + err=None): + c = Container( + entrypoint=(entrypoint or self._get_entrypoint()), + env=(self.ctx.get_envs() if use_ctx_env else {}), + ) + c.outfile, c.errfile = self._get_out_err_file(out, err) + c.update_env(envs) + # Need subprocess.Popen(shell=True) for PopRun command + c.shell = True + return c + + def run(self): + # Replace the training script with 
the PopRun command + self.replace_training_script() + + self.build_job() + self.build_pod() + + self.deploy_pod() + + self.watch() diff --git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py index 9f7b1733d1af2..8f515d9e6f38b 100644 --- a/python/paddle/distributed/launch/job/container.py +++ b/python/paddle/distributed/launch/job/container.py @@ -37,6 +37,7 @@ def __init__(self, entrypoint=[], rank=-1, env={}): self._grace_period = 10 self._log_handler = None + self._shell = False @property def entrypoint(self): @@ -70,6 +71,14 @@ def errfile(self): def errfile(self, err): self._err = err + @property + def shell(self): + return self._shell + + @shell.setter + def shell(self, shell): + self._shell = shell + def update_env(self, env={}, **kwargs): env = {k: v for k, v in env.items() if isinstance(v, str)} self._env.update(env) @@ -109,7 +118,8 @@ def start(self): self._proc = ProcessContext(self._entrypoint, env=self._env, out=self._stdout, - err=self._stderr) + err=self._stderr, + shell=self._shell) self._proc.start() def terminate(self, force=False): diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index f90fa7401e9a0..4c1b99df178ea 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -91,6 +91,26 @@ def launch(): - ``--elastic_timeout``: Seconds to wait before elastic job begin to train. Default ``--elastic_timeout=30``. + IPU Parameters: + IPU distributed launch only requires and allowes three arguments ``--devices``, ``training_script`` and ``training_script_args``. + The ``--devices`` is the number of IPU devices. e.g., ``--devices=4`` will launch the training program with four IPU devices. + The ``training_script`` is only allowed to set as ``ipu``. + The ``training_script_args`` includes arguments required by IPU distributed launch and illustrated as below. + ``Examples 10`` has provided a example of paddle.distributed.launch with IPUs. + + - ``--hosts``: The hosts for IPU distributd training. + + - ``--nproc_per_host``: The number of processes launched per host. + + - ``--ipus_per_replica``: The number of IPUs requested per replica. + + - ``--ipu_partition``: The partition name of IPU devices. + + - ``--vipu_server``: The ip of the IPU device manager. + + - ``training_script``: The full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``. + + - ``training_script_args``: The args of the IPU distributed training program/script. Returns: - ``None`` @@ -229,6 +249,15 @@ def launch(): # once the number of nodes changes between 2:4 during training, the strategy holds + Examples 10 (ipu): + .. code-block:: bash + :name: code-block-example-bash10 + + # With the following command, the job will begin to run the distributhed program with IPUs. + # Only support and require the `device_num` as the arg and `ipu` as the launch script. + # Please Check the details about the following args of the launch scripte from `utils/ipu_launch.py`. 
+ python -m paddle.distributed.launch --devices 4 ipu --hosts=localhost --nproc_per_host=2 --ipus_per_replica=1 --ipu_partition=pod16 --vipu_server=127.0.0.1 train.py + """ # initialize the context to run diff --git a/python/paddle/distributed/launch/utils/process_context.py b/python/paddle/distributed/launch/utils/process_context.py index 075536c8a8cb5..5d8505aa66eb3 100644 --- a/python/paddle/distributed/launch/utils/process_context.py +++ b/python/paddle/distributed/launch/utils/process_context.py @@ -24,7 +24,8 @@ def __init__(self, out=sys.stdout, err=sys.stderr, group=True, - preexec_fn=None): + preexec_fn=None, + shell=False): self._cmd = cmd self._env = env self._preexec_fn = preexec_fn @@ -33,6 +34,7 @@ def __init__(self, self._group = group if os.name != 'nt' else False self._proc = None self._code = None + self._shell = shell def _start(self): pre_fn = os.setsid if self._group else None @@ -40,7 +42,8 @@ def _start(self): env=self._env, stdout=self._stdout, stderr=self._stderr, - preexec_fn=self._preexec_fn or pre_fn) + preexec_fn=self._preexec_fn or pre_fn, + shell=self._shell) def _close_std(self): try: diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 5c16e0fe273c4..a4888e6f90655 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -17,6 +17,10 @@ import logging import numpy as np import shutil +try: + from tqdm import tqdm +except: + from .utils import tqdm from inspect import isgeneratorfunction from .... import io from .... import core @@ -359,38 +363,41 @@ def quantize(self): self._set_activation_persistable() if self._algo in ["KL", "hist"]: - _logger.info("Preparation stage ...") batch_id = 0 + with tqdm( + total=self._batch_nums, + bar_format= + 'Preparation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for data in self._data_loader(): + self._executor.run(program=self._program, + feed=data, + fetch_list=self._fetch_list, + return_numpy=False, + scope=self._scope) + self._collect_activation_abs_min_max() + batch_id += 1 + t.update() + if self._batch_nums and batch_id >= self._batch_nums: + break + self._init_sampling_act_histogram() + + batch_id = 0 + with tqdm(total=self._batch_nums, + bar_format= + 'Sampling stage, Run batch:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: for data in self._data_loader(): self._executor.run(program=self._program, feed=data, fetch_list=self._fetch_list, return_numpy=False, scope=self._scope) - self._collect_activation_abs_min_max() - if batch_id % 5 == 0: - _logger.info("Run batch: " + str(batch_id)) + self._sampling() batch_id += 1 + t.update() if self._batch_nums and batch_id >= self._batch_nums: break - _logger.info("Finish preparation stage, all batch:" + str(batch_id)) - self._init_sampling_act_histogram() - - _logger.info("Sampling stage ...") - batch_id = 0 - for data in self._data_loader(): - self._executor.run(program=self._program, - feed=data, - fetch_list=self._fetch_list, - return_numpy=False, - scope=self._scope) - self._sampling() - if batch_id % 5 == 0: - _logger.info("Run batch: " + str(batch_id)) - batch_id += 1 - if self._batch_nums and batch_id >= self._batch_nums: - break - _logger.info("Finish sampling stage, all batch: " + str(batch_id)) if self._algo == 'avg': for var_name in self._quantized_act_var_name: diff --git 
a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index eaf9bed3d6fe9..0dd79992eb1df 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -14,6 +14,10 @@ import collections import numpy as np +try: + from tqdm import tqdm +except: + from .utils import tqdm from ..... import compat as cpt from .... import core from ....framework import IrGraph @@ -373,10 +377,15 @@ def _has_weight(op): graph.out_node_mapping_table = dict() # The process of _transform_forward and _transform_backward is needed in two for loops. # The loop for transforming the forward graph: - for op in ops: - if op.name() in self._quantizable_ops: - if not self._is_skip_quant(graph, op) and _has_weight(op): - _transform_forward(graph, op) + with tqdm(total=len(ops), + bar_format= + 'Adding quant op with weight:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op in ops: + if op.name() in self._quantizable_ops: + if not self._is_skip_quant(graph, op) and _has_weight(op): + _transform_forward(graph, op) + t.update() # The loop for renaming the inputs of backward op. for op in ops: if op.name() in self._quantizable_grad_ops and _has_weight(op): @@ -1418,73 +1427,81 @@ def apply(self, graph): for op in graph.all_op_nodes(): if op.name() in self._teller_set: target_ops.append(op) - for op in target_ops: - for output_var_name in utils._get_op_output_var_names(op): - in_node = graph._find_node_by_name(op.outputs, output_var_name) - if in_node.dtype() not in \ - [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: - continue + with tqdm(total=len(target_ops), + bar_format='Adding OutScale op:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op in target_ops: + for output_var_name in utils._get_op_output_var_names(op): + in_node = graph._find_node_by_name(op.outputs, + output_var_name) + if in_node.dtype() not in \ + [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: + continue - scale_node = graph.create_persistable_node( - name=self._scale_name(in_node.name()), - var_type=core.VarDesc.VarType.LOD_TENSOR, - shape=[1], - var_dtype=in_node.dtype()) - data_type = 'float64' if in_node.dtype() \ - == core.VarDesc.VarType.FP64 else 'float32' - _init_var_node(scale_node, np.ones([1], dtype=data_type), - self._scope, self._place) - ins = {'X': in_node} - outs = {'OutScale': scale_node} - if not self._is_test: - state_in_node = graph.create_persistable_node( - name=unique_name.generate('scale_state@'), + scale_node = graph.create_persistable_node( + name=self._scale_name(in_node.name()), var_type=core.VarDesc.VarType.LOD_TENSOR, - var_dtype=in_node.dtype(), - shape=[1]) - _init_var_node(state_in_node, np.ones([1], dtype=data_type), + shape=[1], + var_dtype=in_node.dtype()) + data_type = 'float64' if in_node.dtype() \ + == core.VarDesc.VarType.FP64 else 'float32' + _init_var_node(scale_node, np.ones([1], dtype=data_type), self._scope, self._place) - accum_in_node = graph.create_persistable_node( - name=unique_name.generate('scale_accum@'), - var_type=core.VarDesc.VarType.LOD_TENSOR, - var_dtype=in_node.dtype(), - shape=[1]) - _init_var_node(accum_in_node, np.ones([1], dtype=data_type), - self._scope, self._place) - state_out_node = graph.create_var_node_from_desc( - state_in_node.var()) - accum_out_node = graph.create_var_node_from_desc( - accum_in_node.var()) - - ins['InState'] = state_in_node - ins['InAccum'] = accum_in_node - 
outs['OutState'] = state_out_node - outs['OutAccum'] = accum_out_node - - attrs = { - 'moving_rate': self._moving_rate, - 'is_test': self._is_test, - 'op_role': core.op_proto_and_checker_maker.OpRole.Forward - } - scale_op_node = graph.create_op_node( - op_type='moving_average_abs_max_scale', - attrs=attrs, - inputs=ins, - outputs=outs) - graph.link_to(in_node, scale_op_node) - graph.link_to(scale_op_node, scale_node) - if not self._is_test: - graph.link_to(state_in_node, scale_op_node) - graph.link_to(accum_in_node, scale_op_node) - graph.link_to(scale_op_node, state_out_node) - graph.link_to(scale_op_node, accum_out_node) + ins = {'X': in_node} + outs = {'OutScale': scale_node} + if not self._is_test: + state_in_node = graph.create_persistable_node( + name=unique_name.generate('scale_state@'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + var_dtype=in_node.dtype(), + shape=[1]) + _init_var_node(state_in_node, + np.ones([1], dtype=data_type), + self._scope, self._place) + accum_in_node = graph.create_persistable_node( + name=unique_name.generate('scale_accum@'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + var_dtype=in_node.dtype(), + shape=[1]) + _init_var_node(accum_in_node, + np.ones([1], dtype=data_type), + self._scope, self._place) + state_out_node = graph.create_var_node_from_desc( + state_in_node.var()) + accum_out_node = graph.create_var_node_from_desc( + accum_in_node.var()) + + ins['InState'] = state_in_node + ins['InAccum'] = accum_in_node + outs['OutState'] = state_out_node + outs['OutAccum'] = accum_out_node + + attrs = { + 'moving_rate': self._moving_rate, + 'is_test': self._is_test, + 'op_role': + core.op_proto_and_checker_maker.OpRole.Forward + } + scale_op_node = graph.create_op_node( + op_type='moving_average_abs_max_scale', + attrs=attrs, + inputs=ins, + outputs=outs) + graph.link_to(in_node, scale_op_node) + graph.link_to(scale_op_node, scale_node) + if not self._is_test: + graph.link_to(state_in_node, scale_op_node) + graph.link_to(accum_in_node, scale_op_node) + graph.link_to(scale_op_node, state_out_node) + graph.link_to(scale_op_node, accum_out_node) + t.update() return graph def _scale_name(self, var_name): """ Return the scale name for the var named `var_name`. """ - return "%s.scale" % (var_name) + return "%s@scale" % (var_name) class OutScaleForInferencePass(object): @@ -1544,7 +1561,7 @@ def _scale_name(self, var_name): """ Return the scale name for the var named `var_name`. 
""" - return "%s.scale" % (var_name) + return "%s@scale" % (var_name) class AddQuantDequantPass(object): @@ -1624,36 +1641,43 @@ def apply(self, graph): # Forward stage, insert quant_dequant op all_op_nodes = graph.all_op_nodes() - for op_node in all_op_nodes: - if op_node.name() in self._quantizable_op_type: - is_skip = False - if isinstance(self._skip_pattern, list): - is_skip = op_node.op().has_attr("op_namescope") and \ - any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern) - elif isinstance(self._skip_pattern, str): - is_skip = op_node.op().has_attr("op_namescope") and \ - op_node.op().attr("op_namescope").find(self._skip_pattern) != -1 - is_quantized = op_node.op().has_attr("quantization_type") and \ - op_node.op().attr("quantization_type") == "qat_with_weight" - if is_skip or is_quantized or \ - (not _is_input_all_not_persistable(graph, op_node)): - continue + with tqdm(total=len(all_op_nodes), + bar_format= + 'Adding quant activation op:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op_node in all_op_nodes: + if op_node.name() in self._quantizable_op_type: + is_skip = False + if isinstance(self._skip_pattern, list): + is_skip = op_node.op().has_attr("op_namescope") and \ + any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern) + elif isinstance(self._skip_pattern, str): + is_skip = op_node.op().has_attr("op_namescope") and \ + op_node.op().attr("op_namescope").find(self._skip_pattern) != -1 + is_quantized = op_node.op().has_attr("quantization_type") and \ + op_node.op().attr("quantization_type") == "qat_with_weight" + if is_skip or is_quantized or \ + (not _is_input_all_not_persistable(graph, op_node)): + continue - op_node.op()._set_attr("quantization_type", - "qat_without_weight") - op_node.op()._set_attr("activation_bits", self._quant_bits) - op_node.op()._set_attr("with_quant_attr", True) - arg_names = utils._get_op_input_var_names(op_node) - for arg_name in arg_names: - in_node = graph._find_node_by_name(op_node.inputs, arg_name) - if arg_name in dequantized_vars_map: - quant_var_node = dequantized_vars_map[arg_name] - else: - quant_var_node, _ = \ - self._inser_quant_dequant_moving_average_abs_max_op( - graph, in_node, self._quant_bits) - dequantized_vars_map[arg_name] = quant_var_node - graph.update_input_link(in_node, quant_var_node, op_node) + op_node.op()._set_attr("quantization_type", + "qat_without_weight") + op_node.op()._set_attr("activation_bits", self._quant_bits) + op_node.op()._set_attr("with_quant_attr", True) + arg_names = utils._get_op_input_var_names(op_node) + for arg_name in arg_names: + in_node = graph._find_node_by_name( + op_node.inputs, arg_name) + if arg_name in dequantized_vars_map: + quant_var_node = dequantized_vars_map[arg_name] + else: + quant_var_node, _ = \ + self._inser_quant_dequant_moving_average_abs_max_op( + graph, in_node, self._quant_bits) + dequantized_vars_map[arg_name] = quant_var_node + graph.update_input_link(in_node, quant_var_node, + op_node) + t.update() # Backward stage, update input link for op_node in all_op_nodes: @@ -2204,10 +2228,16 @@ def apply(self, graph): graph.out_node_mapping_table = dict() # The process of _transform_forward and _transform_backward is needed in two for loops. 
# The loop for transforming the forward graph: - for op in ops: - if op.name() in self._quantizable_ops: - if not self._is_skip_quant(graph, op) and self._has_weight(op): - self._transform_forward(graph, op) + with tqdm(total=len(ops), + bar_format= + 'Adding quant op with weight:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op in ops: + if op.name() in self._quantizable_ops: + if not self._is_skip_quant(graph, + op) and self._has_weight(op): + self._transform_forward(graph, op) + t.update() # The loop for renaming the inputs of backward op. for op in ops: if op.name() in self._quantizable_grad_ops and self._has_weight(op): @@ -2310,43 +2340,50 @@ def apply(self, graph): # Forward stage, insert quant_dequant op all_op_nodes = graph.all_op_nodes() - for op_node in all_op_nodes: - if op_node.name() in self._quantizable_op_type: - is_skip = False - if isinstance(self._skip_pattern, list): - is_skip = op_node.op().has_attr("op_namescope") and \ - any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern) - elif isinstance(self._skip_pattern, str): - is_skip = op_node.op().has_attr("op_namescope") and \ - op_node.op().attr("op_namescope").find(self._skip_pattern) != -1 - is_quantized = op_node.op().has_attr("quantization_type") and \ - op_node.op().attr("quantization_type") == "qat_with_weight" - if is_skip or is_quantized: - continue - - op_node.op()._set_attr("quantization_type", - "qat_without_weight") - arg_names = utils._get_op_input_var_names(op_node) - for arg_name in arg_names: - in_node = graph._find_node_by_name(op_node.inputs, arg_name) - if in_node.persistable(): + with tqdm(total=len(all_op_nodes), + bar_format= + 'Adding quant activation op:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op_node in all_op_nodes: + if op_node.name() in self._quantizable_op_type: + is_skip = False + if isinstance(self._skip_pattern, list): + is_skip = op_node.op().has_attr("op_namescope") and \ + any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern) + elif isinstance(self._skip_pattern, str): + is_skip = op_node.op().has_attr("op_namescope") and \ + op_node.op().attr("op_namescope").find(self._skip_pattern) != -1 + is_quantized = op_node.op().has_attr("quantization_type") and \ + op_node.op().attr("quantization_type") == "qat_with_weight" + if is_skip or is_quantized: continue - if arg_name in dequantized_vars_map: - dequant_var_node = dequantized_vars_map[arg_name] - else: - insert_quant_pass = InsertQuantizeLinear( - self._place, - self._scope, - quant_bits=self._quant_bits, - quant_axis=-1, - channel_wise=False, - is_test=self._is_test) - quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op( - graph, in_node) - dequant_var_node = insert_quant_pass.insert_dequant_op( - graph, quant_var_node, scale_var_node) - dequantized_vars_map[arg_name] = dequant_var_node - graph.update_input_link(in_node, dequant_var_node, op_node) + + op_node.op()._set_attr("quantization_type", + "qat_without_weight") + arg_names = utils._get_op_input_var_names(op_node) + for arg_name in arg_names: + in_node = graph._find_node_by_name( + op_node.inputs, arg_name) + if in_node.persistable(): + continue + if arg_name in dequantized_vars_map: + dequant_var_node = dequantized_vars_map[arg_name] + else: + insert_quant_pass = InsertQuantizeLinear( + self._place, + self._scope, + quant_bits=self._quant_bits, + quant_axis=-1, + channel_wise=False, + is_test=self._is_test) + quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op( + graph, 
in_node) + dequant_var_node = insert_quant_pass.insert_dequant_op( + graph, quant_var_node, scale_var_node) + dequantized_vars_map[arg_name] = dequant_var_node + graph.update_input_link(in_node, dequant_var_node, + op_node) + t.update() # Backward stage, update input link for op_node in all_op_nodes: diff --git a/python/paddle/fluid/contrib/slim/quantization/utils.py b/python/paddle/fluid/contrib/slim/quantization/utils.py index 608844dd55da7..b9c304df5bafe 100644 --- a/python/paddle/fluid/contrib/slim/quantization/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import numpy as np from ....framework import IrNode from ....framework import Operator @@ -52,7 +53,6 @@ "leaky_relu", "tanh", "swish", - "scale", "transpose", "transpose2", "sigmoid", @@ -162,7 +162,6 @@ "sigmoid": [["X"], ["Out"]], "elementwise_mul": [["X", "Y"], ["Out"]], "elementwise_pow": [["X", "Y"], ["Out"]], - "scale": [["X"], ["Out"]], "hard_swish": [["X"], ["Out"]], "hard_sigmoid": [["X"], ["Out"]], "gru": [["Input", "Weight"], ["Hidden"]], @@ -414,3 +413,27 @@ def calculate_quant_cos_error(orig_tensor, qdq_tensor): cos_sim = np.inner(orig_tensor.flatten(), qdq_tensor.flatten()) \ / (np.linalg.norm(orig_tensor.flatten()) * np.linalg.norm(qdq_tensor.flatten())) return cos_sim + + +class tqdm(object): + + def __init__(self, total, bar_format='Loading|{bar}', ncols=80): + self.total = total + self.bar_format = bar_format + self.ncols = ncols + self.n = 0 + + def update(self, n=1): + self.n += n + a = "=" * round((self.n / self.total) * self.ncols) + b = " " * (self.ncols - len(a)) + prefix = self.bar_format.split('|')[0] + sys.stderr.write("\r{}|{}=>{}| {}/{}".format(prefix, a, b, self.n, + self.total)) + sys.stderr.flush() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + sys.stderr.write('\n') diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 88dc33f581ad2..4a90ab2753142 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -102,7 +102,8 @@ function(inference_quant_int8_image_classification_test target quant_model_dir 0.1) endfunction() -# set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 25 +# set batch_size 10 for UT only (avoid OOM). +# For whole dataset, use batch_size 25 function(inference_quant2_int8_image_classification_test target quant_model_dir fp32_model_dir dataset_path) py_test( @@ -127,7 +128,8 @@ function(inference_quant2_int8_image_classification_test target quant_model_dir 0.1) endfunction() -# set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 20 +# set batch_size 10 for UT only (avoid OOM). 
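For context on the guarded import at the top of quantization_pass.py: the passes try the real tqdm first and fall back to the minimal stand-in added to slim/quantization/utils.py, so the progress-bar code inside the pass bodies is identical either way. A self-contained sketch of that pattern follows; the op list and the loop body are placeholders, not the real graph API.

import sys

try:
    from tqdm import tqdm  # preferred: the real progress bar, if installed
except ImportError:
    # Stand-in mirroring the fallback class in utils.py: same constructor
    # signature and the same update()/context-manager surface.
    class tqdm(object):

        def __init__(self, total, bar_format='Loading|{bar}', ncols=80):
            self.total = total
            self.bar_format = bar_format
            self.ncols = ncols
            self.n = 0

        def update(self, n=1):
            self.n += n
            done = "=" * round((self.n / self.total) * self.ncols)
            todo = " " * (self.ncols - len(done))
            prefix = self.bar_format.split('|')[0]
            sys.stderr.write("\r{}|{}=>{}| {}/{}".format(
                prefix, done, todo, self.n, self.total))
            sys.stderr.flush()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            sys.stderr.write('\n')

# Usage as in the passes: wrap the op loop and call update() once per op.
ops = range(100)  # placeholder for graph.all_op_nodes()
with tqdm(total=len(ops),
          bar_format='Adding quant op with weight:|{bar}| {n_fmt}/{total_fmt}',
          ncols=80) as t:
    for op in ops:
        pass  # _transform_forward(graph, op) in the real pass
        t.update()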
+# For whole dataset, use batch_size 20 function( inference_quant2_int8_nlp_test target @@ -284,7 +286,10 @@ if(LINUX AND WITH_MKLDNN) download_quant_model( ${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE} 95c6d01e3aeba31c13efb2ba8057d558) - # inference_quant_int8_image_classification_test(test_quant_int8_resnet101_mkldnn ${QUANT_RESNET101_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) + # inference_quant_int8_image_classification_test( \ + # test_quant_int8_resnet101_mkldnn \ + # ${QUANT_RESNET101_MODEL_DIR}/model \ + # ${IMAGENET_DATA_PATH}) # Quant GoogleNet set(QUANT_GOOGLENET_MODEL_DIR "${QUANT_INSTALL_DIR}/GoogleNet_quant") @@ -321,18 +326,24 @@ if(LINUX AND WITH_MKLDNN) set(QUANT_VGG16_MODEL_ARCHIVE "VGG16_qat_model.tar.gz") download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE} c37e63ca82a102f47be266f8068b0b55) - # inference_quant_int8_image_classification_test(test_quant_int8_vgg16_mkldnn ${QUANT_VGG16_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) + # inference_quant_int8_image_classification_test( \ + # test_quant_int8_vgg16_mkldnn \ + # ${QUANT_VGG16_MODEL_DIR}/model \ + # ${IMAGENET_DATA_PATH}) # Quant VGG19 set(QUANT_VGG19_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG19_quant") set(QUANT_VGG19_MODEL_ARCHIVE "VGG19_qat_model.tar.gz") download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE} 62bcd4b6c3ca2af67e8251d1c96ea18f) - # inference_quant_int8_image_classification_test(test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) + # inference_quant_int8_image_classification_test( \ + # test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model \ + # ${IMAGENET_DATA_PATH}) ### Quant2 for image classification - # Quant2 ResNet50 with input/output scales in `fake_quantize_moving_average_abs_max` operators, + # Quant2 ResNet50 with input/output scales in + # `fake_quantize_moving_average_abs_max` operators, # with weight scales in `fake_dequantize_max_abs` operators set(QUANT2_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2") set(QUANT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz") @@ -345,7 +356,8 @@ if(LINUX AND WITH_MKLDNN) ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) - # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, + # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` + # operators and the `out_threshold` attributes, # with weight scales in `fake_dequantize_max_abs` operators set(QUANT2_RESNET50_RANGE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_range") @@ -358,7 +370,8 @@ if(LINUX AND WITH_MKLDNN) ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) - # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, + # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` + # operators and the `out_threshold` attributes, # with weight scales in `fake_channel_wise_dequantize_max_abs` operators set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_channelwise") diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 30cfb9f4b8591..876d4772462f5 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -49,7 +49,7 @@ def convert_dtype(dtype): return _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] elif isinstance(dtype, type): if dtype in 
[ - np.bool, np.float16, np.uint16, np.float32, np.float64, np.int8, + bool, np.float16, np.uint16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64, np.uint8, np.complex64, np.complex128 ]: diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index af60776a3f1c5..92fe3fb91549b 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -374,6 +374,8 @@ def __init__(self, loader): # see _try_put_indices self._thread_lock = threading.Lock() + self._base_seed = np.random.randint(low=0, high=sys.maxsize) + # init workers and indices queues and put 2 indices in each indices queue self._init_workers() for _ in range(self._outstanding_capacity): @@ -406,7 +408,8 @@ def _init_workers(self): self._data_queue, self._workers_done_event, self._auto_collate_batch, self._collate_fn, self._drop_last, self._worker_init_fn, i, - self._num_workers, self._use_shared_memory)) + self._num_workers, self._use_shared_memory, + self._base_seed)) worker.daemon = True worker.start() self._workers.append(worker) diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 0c3ec898aadfd..06ea7ef9d72a3 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -257,7 +257,7 @@ def mix(x, y): def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, auto_collate_batch, collate_fn, drop_last, init_fn, worker_id, - num_workers, use_shared_memory): + num_workers, use_shared_memory, base_seed): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, # some shared memory objects may have been applied for but have not yet @@ -272,15 +272,20 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, try: import numpy as np import time + import random except ImportError: pass else: - np.random.seed(_generate_states(int(time.time()), worker_id)) + seed = base_seed + worker_id + random.seed(seed) + paddle.seed(seed) + np.random.seed(_generate_states(base_seed, worker_id)) global _worker_info _worker_info = WorkerInfo(id=worker_id, num_workers=num_workers, - dataset=dataset) + dataset=dataset, + seed=base_seed) init_exception = None try: diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 9da69b1e45e0b..e1ae4ad9bc5ed 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -129,11 +129,11 @@ def __init__(self, self._decr_count = 0 self._use_dynamic_loss_scaling = use_dynamic_loss_scaling - self._found_inf = to_variable(np.array([0]).astype(np.bool)) + self._found_inf = to_variable(np.array([0]).astype(np.bool_)) self._temp_found_inf_fp16 = to_variable( - np.array([0]).astype(np.bool)) + np.array([0]).astype(np.bool_)) self._temp_found_inf_fp32 = to_variable( - np.array([0]).astype(np.bool)) + np.array([0]).astype(np.bool_)) self._scale = to_variable( np.array([self._init_loss_scaling]).astype(np.float32)) self._cache_founf_inf = None diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index de53a56468485..aa01945ac849e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -20,6 +20,7 @@ # See details in 
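The dataloader change above replaces the old time-based worker seeding: the loader now draws one _base_seed per iterator and every worker derives its own seed from it, so shuffling and NumPy-based augmentation become reproducible across runs while still differing between workers. A simplified, self-contained sketch of the scheme, assuming a hypothetical seed_worker helper; the real _worker_loop derives the NumPy state via _generate_states and also calls paddle.seed.

import random
import sys
import numpy as np

def seed_worker(base_seed, worker_id):
    # Each worker gets a deterministic seed derived from the shared base seed.
    seed = base_seed + worker_id
    random.seed(seed)
    # np.random.seed only accepts 32-bit values; the real code uses the
    # _generate_states() hash instead of this simple modulo.
    np.random.seed(seed % (2**32))
    # paddle.seed(seed) is called at this point in the real worker loop.
    return seed

base_seed = np.random.randint(low=0, high=sys.maxsize)  # drawn once in the parent
print([seed_worker(base_seed, wid) for wid in range(4)])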
https://github.com/serge-sans-paille/gast/ import os from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.early_return_transformer import EarlyReturnTransformer from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer @@ -87,6 +88,7 @@ def transfer_from_node_type(self, node_wrapper): self.visit(node_wrapper.node) transformers = [ + EarlyReturnTransformer, BasicApiTransformer, # Basic Api TensorShapeTransformer, # Tensor.shape -> layers.shape(Tensor) ListTransformer, # List used in control flow diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py new file mode 100644 index 0000000000000..bef1efb0427cf --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py @@ -0,0 +1,88 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper + + +class EarlyReturnTransformer(gast.NodeTransformer): + """ + Transform if/else return statement of Dygraph into Static Graph. + """ + + def __init__(self, wrapper_root): + assert isinstance( + wrapper_root, AstNodeWrapper + ), "Type of input node should be AstNodeWrapper, but received %s ." % type( + wrapper_root) + self.root = wrapper_root.node + + def transform(self): + """ + Main function to transform AST. + """ + self.visit(self.root) + + def is_define_return_in_if(self, node): + assert isinstance( + node, gast.If + ), "Type of input node should be gast.If, but received %s ." 
% type( + node) + for child in node.body: + if isinstance(child, gast.Return): + return True + return False + + def visit_block_nodes(self, nodes): + result_nodes = [] + destination_nodes = result_nodes + for node in nodes: + rewritten_node = self.visit(node) + + if isinstance(rewritten_node, (list, tuple)): + destination_nodes.extend(rewritten_node) + else: + destination_nodes.append(rewritten_node) + + # append other nodes to if.orelse even though if.orelse is not empty + if isinstance(node, gast.If) and self.is_define_return_in_if(node): + destination_nodes = node.orelse + # handle stmt like `if/elif/elif` + while len(destination_nodes) > 0 and \ + isinstance(destination_nodes[0], gast.If) and \ + self.is_define_return_in_if(destination_nodes[0]): + destination_nodes = destination_nodes[0].orelse + + return result_nodes + + def visit_If(self, node): + node.body = self.visit_block_nodes(node.body) + node.orelse = self.visit_block_nodes(node.orelse) + return node + + def visit_While(self, node): + node.body = self.visit_block_nodes(node.body) + node.orelse = self.visit_block_nodes(node.orelse) + return node + + def visit_For(self, node): + node.body = self.visit_block_nodes(node.body) + node.orelse = self.visit_block_nodes(node.orelse) + return node + + def visit_FunctionDef(self, node): + node.body = self.visit_block_nodes(node.body) + return node diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 0d4acf5fe6d86..860b4e3f558ff 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -483,7 +483,7 @@ def _as_lodtensor(data, place, dtype=None): data = np.array([data]).astype(dtype) elif isinstance(data, (list, tuple)): data = np.array(data) - if data.dtype == np.object: + if data.dtype == np.object_: raise TypeError( "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually " "this means the input data contains nested lists with different lengths. " diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 44ef1ff5ae6db..2412e300a779f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1109,7 +1109,7 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT16 elif dtype == np.int64: return core.VarDesc.VarType.INT64 - elif dtype == np.bool: + elif dtype == np.bool_: return core.VarDesc.VarType.BOOL elif dtype == np.uint16: # since there is still no support for bfloat16 in NumPy, diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 47199fcd1adbe..f09097b57bd71 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -1164,10 +1164,11 @@ def calculate_gain(nonlinearity, param=None): Examples: .. 
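To make the intent of the new EarlyReturnTransformer concrete: it runs before the other dygraph-to-static transformers and moves the statements that follow a returning `if` into that if's else-branch, so the later passes never see an early return. An illustrative before/after pair, written as a conceptual sketch rather than the transformer's literal output:

def before(x):
    if x == 0:
        return x - 1
    y = x + 1          # statements after the returning `if` ...
    return y

def after(x):
    if x == 0:
        return x - 1
    else:              # ... end up in the else-branch after the rewrite
        y = x + 1
        return y

assert before(0) == after(0) and before(3) == after(3)

The while-loop over destination_nodes[0].orelse in visit_block_nodes applies the same move repeatedly, which is how if/elif/elif chains that return in each branch end up with the trailing statements in the innermost else.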
code-block:: python - + :name: code-example1 import paddle gain = paddle.nn.initializer.calculate_gain('tanh') # 5.0 / 3 gain = paddle.nn.initializer.calculate_gain('leaky_relu', param=1.0) # 1.0 = math.sqrt(2.0 / (1+param^2)) + initializer = paddle.nn.initializer.Orthogonal(gain) """ if param is None: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2c3cb903d83ca..d7f0feb103c5f 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -12860,8 +12860,8 @@ def logical_or(x, y, out=None, name=None): import paddle import numpy as np - x_data = np.array([True, False], dtype=np.bool).reshape(2, 1) - y_data = np.array([True, False, True, False], dtype=np.bool).reshape(2, 2) + x_data = np.array([True, False], dtype=np.bool_).reshape(2, 1) + y_data = np.array([True, False, True, False], dtype=np.bool_).reshape(2, 2) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_or(x, y) @@ -12905,8 +12905,8 @@ def logical_xor(x, y, out=None, name=None): import paddle import numpy as np - x_data = np.array([True, False], dtype=np.bool).reshape([2, 1]) - y_data = np.array([True, False, True, False], dtype=np.bool).reshape([2, 2]) + x_data = np.array([True, False], dtype=np.bool_).reshape([2, 1]) + y_data = np.array([True, False, True, False], dtype=np.bool_).reshape([2, 2]) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_xor(x, y) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index ff299bcca9ba3..c590d69a621de 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -145,7 +145,7 @@ def __next__(self): @classmethod def _check_input_array(cls, item): arr = np.asarray(item) - if arr.dtype == np.object: + if arr.dtype == np.object_: raise TypeError( "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually " "this means the input data contains nested lists with different lengths. " diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 34971cf11941f..0bbb34434e843 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -367,7 +367,7 @@ if(APPLE) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_program_code) - endif(NOT WITH_DISTRIBUTE) + endif() message( WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext_*" @@ -683,7 +683,7 @@ endif() foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_logcumsumexp_op PROPERTIES TIMEOUT 30) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) @@ -873,8 +873,8 @@ if(WITH_DISTRIBUTE) test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) endif() - endif(NOT WIN32) - endif(NOT APPLE) + endif() + endif() if(WITH_DGC) # if with dgc, test all dgc tests. # NOTE. dist dgc tests is already in DIST_TEST_OPS @@ -938,7 +938,7 @@ if(WITH_DISTRIBUTE) message( FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() - endforeach(TEST_OP) + endforeach() # solve it later. 
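A note on the recurring dtype renames in this diff (np.bool to np.bool_, np.object to np.object_, and np.bool8 to np.bool_ in the FFT tests further down): newer NumPy releases deprecate and eventually remove the aliases that shadow Python builtins, so only the canonical names keep working. The swap is behavior-preserving for dtype checks, as this small self-contained check illustrates:

import numpy as np

# bool, np.bool_ (and the removed np.bool alias) all denote the same dtype.
a = np.array([True, False], dtype=np.bool_)
b = np.array([True, False], dtype=bool)
assert a.dtype == b.dtype == np.bool_

# Ragged inputs still come through as object arrays, so the checks in
# executor.py and reader.py only needed the canonical np.object_ name.
ragged = np.array([[1, 2], [3]], dtype=object)
assert ragged.dtype == np.object_
print(a.dtype, ragged.dtype)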
bash_test_modules( test_fleet_launch_ps @@ -974,7 +974,7 @@ if(WITH_DISTRIBUTE) "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) endif() - endif(NOT APPLE) + endif() endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf) diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt index 4fd16354e6c1a..b48b833b94602 100644 --- a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt @@ -10,7 +10,7 @@ list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_sharding") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() if(WITH_DISTRIBUTE) if(WITH_GPU diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index f7a1a28aa91ca..ae69ee087686a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -96,10 +96,11 @@ def forward(self, input): PP_MESH_1})(out)[0] out = self.dropout(out) out = self.linear2(out) + self.out = out return out -def train(): +def train(fetch): mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, dropout_ratio=0.1, @@ -118,7 +119,6 @@ def train(): dist_strategy.amp = False dist_strategy.pipeline = False dist_strategy.recompute = False - # init parallel optimizer dist_strategy.semi_auto = True fleet.init(is_collective=True, strategy=dist_strategy) @@ -129,20 +129,26 @@ def train(): strategy=dist_strategy) engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) + # fetch + if fetch: + fetches = {'out': mlp.out} + else: + fetches = None + # train train_dataset = MyDataset(batch_num * batch_size) engine.fit(train_dataset, batch_size=batch_size, steps_per_epoch=batch_num * batch_size, - fetch_list=['label']) + fetches=fetches) # eval eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size, fetch_list=['label']) + engine.evaluate(eval_dataset, batch_size, fetches=fetches) # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size, fetch_list=['label']) + engine.predict(test_dataset, batch_size, fetches=fetches) # save temp_dir = tempfile.TemporaryDirectory() @@ -152,4 +158,5 @@ def train(): if __name__ == "__main__": - train() + train(fetch=True) + train(fetch=False) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index b5ebeb659a649..c2ccad7dd24f0 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -7,7 +7,7 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 160) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index 29e528edce914..51f298eccdbe2 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -27,4 +27,4 @@ foreach(TEST_OP 
${TEST_OPS}) list(APPEND DIST_TEST_OPS ${TEST_OP}) set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120) set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST") -endforeach(TEST_OP) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt index e3bf89c48821a..95739040ef4af 100644 --- a/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt @@ -6,4 +6,4 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index f9a1e83d381fd..1687b277ab5b5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -32,8 +32,8 @@ set(TEST_EAGER_OPS test_simnet test_transformer) list(REMOVE_ITEM TEST_OPS test_lac) -# NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope will -# be removed and will cause some random failed in multi-thread. +# NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope +# will be removed and will cause some random failed in multi-thread. if(NOT ON_INFER) py_test_modules(test_lac MODULES test_lac ENVS FLAGS_enable_eager_mode=1) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) @@ -51,7 +51,7 @@ foreach(TEST_OP ${TEST_OPS}) else() py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endif() -endforeach(TEST_OP) +endforeach() set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900) set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900 LABELS diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index 0c7d2903c3625..39565044e7fd1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -100,6 +100,30 @@ def false_fn_0(q, x, y): return x +def dyfunc_with_if_else_early_return1(): + x = paddle.to_tensor([10]) + if x == 0: + a = paddle.zeros([2, 2]) + b = paddle.zeros([3, 3]) + return a, b + a = paddle.zeros([2, 2]) + 1 + return a + + +def dyfunc_with_if_else_early_return2(): + x = paddle.to_tensor([10]) + if x == 0: + a = paddle.zeros([2, 2]) + b = paddle.zeros([3, 3]) + return a, b + elif x == 1: + c = paddle.zeros([2, 2]) + 1 + d = paddle.zeros([3, 3]) + 1 + return c, d + e = paddle.zeros([2, 2]) + 3 + return e + + def dyfunc_with_if_else_with_list_geneator(x): if 10 > 5: y = paddle.add_n( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index cbc6e3c540f9f..cf8be6640300e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -29,7 +29,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code import paddle.jit.dy2static as _jst -from ifelse_simple_func import dyfunc_with_if_else +from ifelse_simple_func import dyfunc_with_if_else, dyfunc_with_if_else_early_return1, 
dyfunc_with_if_else_early_return2 np.random.seed(0) @@ -83,34 +83,22 @@ def false_fn_0(x_v): x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, (x_v, ), (x_v, )) - __return_0 = _jst.create_bool_as_type(label is not None, False) - def true_fn_1(__return_0, __return_value_0, label, x_v): + def true_fn_1(__return_value_0, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) __return_0 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = loss - return __return_0, __return_value_0 - - def false_fn_1(__return_0, __return_value_0): - return __return_0, __return_value_0 - - __return_0, __return_value_0 = _jst.convert_ifelse( - label is not None, true_fn_1, false_fn_1, - (__return_0, __return_value_0, label, x_v), - (__return_0, __return_value_0)) - - def true_fn_2(__return_0, __return_value_0, x_v): - __return_1 = _jst.create_bool_as_type( - _jst.convert_logical_not(__return_0), True) - __return_value_0 = x_v return __return_value_0 - def false_fn_2(__return_value_0): + def false_fn_1(__return_value_0, label, x_v): + __return_1 = _jst.create_bool_as_type(label is not None, True) + __return_value_0 = x_v return __return_value_0 - __return_value_0 = _jst.convert_ifelse( - _jst.convert_logical_not(__return_0), true_fn_2, false_fn_2, - (__return_0, __return_value_0, x_v), (__return_value_0, )) + __return_value_0 = _jst.convert_ifelse(label is not None, true_fn_1, + false_fn_1, + (__return_value_0, label, x_v), + (__return_value_0, label, x_v)) return __return_value_0 @@ -123,45 +111,33 @@ def dyfunc_with_if_else(x_v, label=None): name='__return_value_init_1') __return_value_1 = __return_value_init_1 - def true_fn_3(x_v): + def true_fn_2(x_v): x_v = x_v - 1 return x_v - def false_fn_3(x_v): + def false_fn_2(x_v): x_v = x_v + 1 return x_v x_v = _jst.convert_ifelse( - fluid.layers.mean(x_v)[0] > 5, true_fn_3, false_fn_3, (x_v, ), + fluid.layers.mean(x_v)[0] > 5, true_fn_2, false_fn_2, (x_v, ), (x_v, )) - __return_2 = _jst.create_bool_as_type(label is not None, False) - def true_fn_4(__return_2, __return_value_1, label, x_v): + def true_fn_3(__return_value_1, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) __return_2 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = loss - return __return_2, __return_value_1 - - def false_fn_4(__return_2, __return_value_1): - return __return_2, __return_value_1 - - __return_2, __return_value_1 = _jst.convert_ifelse( - label is not None, true_fn_4, false_fn_4, - (__return_2, __return_value_1, label, x_v), - (__return_2, __return_value_1)) - - def true_fn_5(__return_2, __return_value_1, x_v): - __return_3 = _jst.create_bool_as_type( - _jst.convert_logical_not(__return_2), True) - __return_value_1 = x_v return __return_value_1 - def false_fn_5(__return_value_1): + def false_fn_3(__return_value_1, label, x_v): + __return_3 = _jst.create_bool_as_type(label is not None, True) + __return_value_1 = x_v return __return_value_1 - __return_value_1 = _jst.convert_ifelse( - _jst.convert_logical_not(__return_2), true_fn_5, false_fn_5, - (__return_2, __return_value_1, x_v), (__return_value_1, )) + __return_value_1 = _jst.convert_ifelse(label is not None, true_fn_3, + false_fn_3, + (__return_value_1, label, x_v), + (__return_value_1, label, x_v)) return __return_value_1 @@ -358,6 +334,21 @@ def test_raise_error(self): net.foo.train() +class TestIfElseEarlyReturn(unittest.TestCase): + + def test_ifelse_early_return1(self): + answer = np.zeros([2, 2]) + 1 + static_func = 
paddle.jit.to_static(dyfunc_with_if_else_early_return1) + out = static_func() + self.assertTrue(np.allclose(answer, out.numpy())) + + def test_ifelse_early_return2(self): + answer = np.zeros([2, 2]) + 3 + static_func = paddle.jit.to_static(dyfunc_with_if_else_early_return2) + out = static_func() + self.assertTrue(np.allclose(answer, out.numpy())) + + class TestRemoveCommentInDy2St(unittest.TestCase): def func_with_comment(self): diff --git a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt b/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt index e3bf89c48821a..95739040ef4af 100644 --- a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt @@ -6,4 +6,4 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py index a3c62323c2c20..f386fdc9c3460 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py @@ -44,8 +44,8 @@ def rand_x(dims=1, complex=False): shape = [np.random.randint(min_dim_len, max_dim_len) for i in range(dims)] if complex: - return np.random.randn(*shape).astype( - dtype) + 1.j * np.random.randn(*shape).astype(dtype) + return np.random.randn( + *shape).astype(dtype) + 1.j * np.random.randn(*shape).astype(dtype) else: return np.random.randn(*shape).astype(dtype) @@ -473,7 +473,7 @@ def test_irfft2(self): @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, -1, 'backward', NotImplementedError), + np.bool_), None, -1, 'backward', NotImplementedError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError), ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, @@ -543,7 +543,7 @@ def test_irfft(self): (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', NotImplementedError), + np.bool_), None, (-2, -1), 'backward', NotImplementedError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -625,7 +625,7 @@ def test_irfft2(self): (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', NotImplementedError), + np.bool_), None, (-2, -1), 'backward', NotImplementedError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py index ce0a623aea076..ddf47065bb01d 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py @@ -370,7 +370,7 @@ def test_static_irfft2(self): 4), None, -1, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, -1, 'backward', TypeError), + np.bool_), None, -1, 
'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError), ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, @@ -406,7 +406,7 @@ def test_static_hfft(self): 4), None, -1, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, -1, 'backward', TypeError), + np.bool_), None, -1, 'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError), ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, @@ -444,7 +444,7 @@ def test_static_irfft(self): 4, 4, 4), None, None, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', TypeError), + np.bool_), None, (-2, -1), 'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -485,7 +485,7 @@ def test_static_hfft2(self): 4, 4, 4), None, None, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', TypeError), + np.bool_), None, (-2, -1), 'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -526,7 +526,7 @@ def test_static_irfft2(self): 4, 4, 4), None, None, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', TypeError), + np.bool_), None, (-2, -1), 'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -568,7 +568,7 @@ def test_static_hfftn(self): 4, 4, 4), None, None, 'backward', TypeError), # ('test_bool_input', # (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) - # ).astype(np.bool8), None, (-2, -1), 'backward', ValueError), + # ).astype(np.bool_), None, (-2, -1), 'backward', ValueError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt index 976a36b761568..c60a7511022b4 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -16,6 +16,7 @@ foreach(target ${TEST_INTERP_CASES}) FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0) + py_test_modules( ${target}_non_eager_deletion MODULES @@ -25,6 +26,7 @@ foreach(target ${TEST_INTERP_CASES}) FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001) + py_test_modules( ${target}_fast_gc MODULES @@ -34,6 +36,7 @@ foreach(target ${TEST_INTERP_CASES}) FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0) + py_test_modules( ${target}_fast_gc_non_eager_deletion MODULES @@ -44,3 +47,11 @@ foreach(target ${TEST_INTERP_CASES}) FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001) endforeach() + +py_test_modules( + test_standalone_executor_sequential_run MODULES 
test_standalone_executor ENVS + FLAGS_new_executor_sequential_run=true) + +py_test_modules( + test_standalone_executor_serial_run MODULES test_standalone_executor ENVS + FLAGS_new_executor_serial_run=true) diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py index eeddcaa5bb534..5ce035097d01a 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py @@ -81,7 +81,9 @@ def _run(self, feed): return ret def run_raw_executor(self, feed): + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' out = self._run(feed) + del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] print("GT:", out) return out diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 7faff7ec18193..9e375126550cc 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -231,10 +231,6 @@ def test_result(self): for gt, out in zip(ground_truths, res): self.assertEqual(gt[0], out[0]) - res_sequential = self.run_new_executor_sequential() - for gt, out in zip(ground_truths, res_sequential): - self.assertEqual(gt[0], out[0]) - def run_raw_executor(self): paddle.seed(2020) main_program, startup_program, fetch_list = build_program() @@ -264,12 +260,6 @@ def run_new_executor(self): np.array(inter_core.run({}, fetch_list)._move_to_list()[0])) return outs - def run_new_executor_sequential(self): - os.environ['FLAGS_new_executor_sequential_run'] = '1' - res = self.run_new_executor() - del os.environ['FLAGS_new_executor_sequential_run'] - return res - class SwitchExecutorInterfaceTestCase(MultiStreamModelTestCase): diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py index 8006c59d2ba12..a4d18d29be44c 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py @@ -36,8 +36,8 @@ def _get_feed(self): return None def build_program(self): - main_program = paddle.static.default_main_program() - startup_program = paddle.static.default_startup_program() + main_program = Program() + startup_program = Program() with paddle.static.program_guard(main_program, startup_program): out = paddle.full((1, ), 1) inp1 = paddle.full((1, ), 2) diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt index 6b709d85d75c3..0174274827358 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -9,7 +9,7 @@ if(WITH_IPU) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) # set all UTs timeout to 200s set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200) - endforeach(TEST_OP) + endforeach() set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300) set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh new file mode 100644 index 0000000000000..a4221b37eb14f --- 
/dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +partition_name=pod64 +vipu_server=10.137.96.62 +allclose_script=" +import sys +import numpy as np +data1 = np.loadtxt(\"ipu_res.txt\") +data2 = np.loadtxt(\"cpu_res.txt\") +if np.allclose(data1[::16], data2, atol=1e-6): + sys.exit(0) +else: + sys.exit(1) +" + +for opt in lamb sgd adam ; +do + for onchip in False True ; + do + for rts in False True ; + do + echo "Testcase: opt: ${opt}, onchip: ${onchip}, rts: ${rts}" + echo "paddle.distributed.fleet.launch test with IPUs..." + python3.7 -m paddle.distributed.launch \ + --devices=8 \ + ipu \ + --hosts=localhost \ + --nproc_per_host=2 \ + --ipus_per_replica=2 \ + --ipu_partition=${partition_name} \ + --vipu_server=${vipu_server} \ + test_dist_data_parallel_ipu.py ${opt} ipu_res.txt ${onchip} ${rts} > ipu.log + echo "paddle.distributed.fleet.launch test with IPUs...Done" + + echo "paddle normal test with CPU..." + export POPLAR_IPUMODEL=1 + python3.7 test_dist_data_parallel_ipu.py ${opt} cpu_res.txt > cpu.log + unset POPLAR_IPUMODEL + echo "paddle normal test with CPU...Done" + + echo "Compare results..." + python3.7 -c """${allclose_script}""" + if [ $? -eq 0 ];then + echo "Compare results...Done" + else + echo "Error occurs. Please check ipu.log, cpu.log, ipu_res.txt and cpu_res.txt" + exit 0 + fi + done + done +done + +if [ -f "ipu.log" ]; then + rm "ipu.log" +fi +if [ -f "cpu.log" ]; then + rm "cpu.log" +fi +if [ -f "ipu_res.txt" ]; then + rm "ipu_res.txt" +fi +if [ -f "cpu_res.txt" ]; then + rm "cpu_res.txt" +fi diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py new file mode 100644 index 0000000000000..891aa501c5079 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import sys +import os +import random +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + +mpi_comm = None + + +@unittest.skip('Disable distributed tests on auto CI.') +class TestBase(IPUOpTest): + + def set_attrs(self, enable_ipu, optimizer, log, onchip=False, rts=False): + self.ipu_options = { + "enable_pipelining": True, + "batches_per_step": 1, + "enable_gradient_accumulation": True, + "accumulation_factor": 4, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "location_optimizer": { + "on_chip": onchip, + "use_replicated_tensor_sharding": rts + } + } + + self.cpu_bs = 16 + self.ipu_bs = 1 + self.optimizer = optimizer + self.log = log + self.enable_ipu = enable_ipu + + def test(self): + seed = 2021 + np.random.seed(seed) + random.seed(seed) + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = seed + startup_prog.random_seed = seed + + bs = self.ipu_bs if self.enable_ipu else self.cpu_bs + data = np.random.rand(1, 3, 10, 10).astype(np.float32) + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + image = paddle.static.data(name='image', + shape=[bs, 3, 10, 10], + dtype='float32') + with paddle.static.ipu_shard_guard(index=0, stage=0): + conv1 = paddle.static.nn.conv2d(image, + num_filters=3, + filter_size=3, + bias_attr=False) + with paddle.static.ipu_shard_guard(index=1, stage=1): + conv2 = paddle.static.nn.conv2d(conv1, + num_filters=3, + filter_size=3, + bias_attr=False) + # should consider influence of bs + loss = paddle.mean(conv2) + + if self.optimizer == 'sgd': + opt = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + opt = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + opt = paddle.optimizer.Lamb(learning_rate=1e-2) + else: + raise Exception('optimizer must be sgd, adam or lamb') + + opt.minimize(loss) + + if self.enable_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + executor = paddle.static.Executor(place) + executor.run(startup_prog) + + if self.enable_ipu: + feed_list = [image.name] + fetch_list = [loss.name] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2 * self.ipu_options['replicated_graph_count'], + is_training=True, + enable_manual_shard=True) + ipu_strategy.set_options(self.ipu_options) + ipu_strategy.set_options({ + "enable_distribution": + True, + "enable_distributed_replicated_graphs": + True, + "global_replica_offset": + int(os.environ.get("PADDLE_TRAINER_ID")) * 2, + "global_replication_factor": + 4 + }) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile( + feed_list, fetch_list) + feed = { + "image": + np.tile(data, [ + self.ipu_options['replicated_graph_count'] * + self.ipu_options['batches_per_step'] * + self.ipu_options['accumulation_factor'], 1, 1, 1 + ]) + } + + else: + program = main_prog + feed = {"image": np.tile(data, [self.cpu_bs, 1, 1, 1])} + + epoch = 10 + if not self.enable_ipu: + # global replication factor + epoch *= 4 + epoch *= self.ipu_options['batches_per_step'] + epoch *= self.ipu_options['accumulation_factor'] + epoch = epoch / (self.cpu_bs / self.ipu_bs) + + results = [] + for i in range(int(epoch)): + res = executor.run(program, feed=feed, fetch_list=[loss]) + if self.enable_ipu: + res = mpi_comm.gather(res, root=0) + 
results.append(res) + if self.enable_ipu: + if int(os.environ.get("PADDLE_TRAINER_ID")) == 0: + np.savetxt(self.log, np.array(results).flatten()) + else: + np.savetxt(self.log, np.array(results).flatten()) + + +if __name__ == "__main__": + paddle.enable_static() + # Run distributed tests + if len(sys.argv) == 5: + from mpi4py import MPI + + DISTRIBUTED_COMM = MPI.COMM_WORLD + + def _get_comm(): + global DISTRIBUTED_COMM + if DISTRIBUTED_COMM is None: + raise RuntimeError( + "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first." + ) + return DISTRIBUTED_COMM + + mpi_comm = _get_comm() + + optimizer = sys.argv[1] + log = sys.argv[2] + onchip = True if sys.argv[3] == "True" else False + rts = True if sys.argv[4] == "True" else False + test = TestBase() + test.set_attrs(enable_ipu=True, + optimizer=optimizer, + log=log, + onchip=onchip, + rts=rts) + test.test() + # Run cpu tests for compare + elif len(sys.argv) == 3: + test = TestBase() + test.set_attrs(enable_ipu=False, optimizer=sys.argv[1], log=sys.argv[2]) + test.test() + else: + raise ValueError( + "Only support 3 or 5 args. 3 for cpu test, 5 for ipu distributed test" + ) diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py new file mode 100644 index 0000000000000..f81ed48f04ffd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+''' +python3.7 -m paddle.distributed.launch \ +--devices=128 \ +ipu \ +--hosts=host1,host2 \ +--ipus_per_host=2 \ +--nproc_per_host=1 \ +--ipu_partition=pod128 \ +--vipu_server=lr17-1-ctrl \ +python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py +Equal to: +poprun \ +--host=localhost,host2 \ +--num-instances=2 \ +--num-replicas=64 \ +--ipus-per-replica=2 \ +--print-topology=yes \ +--vipu-partition=pod128_bert \ +--vipu-server-host=lr17-1-ctrl \ +--update-partition=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py +''' + +import os +import numpy as np +import paddle + + +def TestDistTraining(): + paddle.enable_static() + + attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'} + + scope = paddle.fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = 42 + startup_prog.random_seed = 42 + + np.random.seed(42) + input_data = np.random.uniform(0, 127, size=[128, 3, 2, 1]).astype(np.int32) + + with paddle.fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') + with paddle.static.ipu_shard_guard(index=0, stage=0): + out = paddle.fluid.layers.embedding(x, **attrs) + with paddle.static.ipu_shard_guard(index=1, stage=1): + loss = paddle.mean(out) + opt = paddle.optimizer.Adam(learning_rate=1e-1) + opt.minimize(loss) + + feed_list = ["x"] + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=64, + is_training=True, + enable_manual_shard=True) + ipu_strategy.set_pipelining_config( + enable_pipelining=True, + batches_per_step=1, + enable_gradient_accumulation=True, + accumulation_factor=4) + ipu_strategy.set_options({ + "enable_distribution": + True, + "enable_replicated_graphs": + True, + "replicated_graph_count": + 32, + "enable_distributed_replicated_graphs": + True, + "global_replica_offset": + # Paddle : int(os.environ.get("PADDLE_TRAINER_ID")) * 32 + # PopRun : int(os.environ.get("POPDIST_REPLICA_INDEX_OFFSET")) + int(os.environ.get("PADDLE_TRAINER_ID")) * 32, + "global_replication_factor": + 64, + "location_optimizer": { + "on_chip": False, + "use_replicated_tensor_sharding": True + } + }) + + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(feed_list, fetch_list) + + for i in range(10): + res = exe.run(program, + feed={"x": input_data}, + fetch_list=fetch_list) + print("index: {}, result: {}".format(i, res)) + + +if __name__ == "__main__": + TestDistTraining() diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py new file mode 100644 index 0000000000000..d42977b5962d3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py @@ -0,0 +1,178 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Single host: +python3.7 -m paddle.distributed.launch \ +--devices=4 \ +ipu \ +--hosts=localhost \ +--nproc_per_host=2 \ +--ipus_per_replica=1 \ +--ipu_partition=pod64 \ +--vipu_server=10.137.96.62 \ +python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py +Equal to: +poprun \ +--host=localhost \ +--num-instances=2 \ +--num-replicas=4 \ +--ipus-per-replica=1 \ +--print-topology=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py +''' +''' +Multi hosts: +python3.7 -m paddle.distributed.launch \ +--devices=4 \ +ipu \ +--hosts=host1,host2 \ +--nproc_per_host=1 \ +--ipus_per_replica=1 \ +--ipu_partition=pod64 \ +--vipu_server=10.137.96.62 \ +python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py +Equal to: +poprun \ +--host=host1,host2 \ +--num-instances=2 \ +--num-replicas=4 \ +--ipus-per-replica=1 \ +--print-topology=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py +''' + +import os +import sys +import paddle +import numpy as np + +mpi_comm = None + + +def Test(use_dist, file_name): + paddle.enable_static() + + attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'} + + scope = paddle.fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = 42 + startup_prog.random_seed = 42 + + with paddle.fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') + + out = paddle.fluid.layers.embedding(x, **attrs) + loss = paddle.mean(out) + opt = paddle.optimizer.Adam(learning_rate=1e-1) + opt.minimize(loss) + + feed_list = ["x"] + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + if use_dist: + ipu_strategy.set_graph_config(num_ipus=2, is_training=True) + # Set distributed envs + ipu_strategy.set_options({ + "enable_distribution": + True, + "enable_replicated_graphs": + True, + "replicated_graph_count": + 2, + "enable_distributed_replicated_graphs": + True, + "global_replica_offset": + int(os.environ.get("PADDLE_TRAINER_ID")) * 2, + "global_replication_factor": + 4 + }) + else: + ipu_strategy.set_graph_config(num_ipus=4, is_training=True) + ipu_strategy.set_options({ + "enable_replicated_graphs": True, + "replicated_graph_count": 4, + }) + + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(feed_list, fetch_list) + + if use_dist: + if os.environ.get("PADDLE_TRAINER_ID") == "0": + input_data = np.concatenate([ + np.array([[[1], [3]], [[2], [4]], + [[4], [127]]]).astype(np.int32), + np.array([[[1], [3]], [[2], [4]], + [[4], [127]]]).astype(np.int32) + ]) + else: + input_data = np.concatenate([ + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32) + ]) + else: + input_data = np.concatenate([ + np.array([[[1], [3]], [[2], [4]], + [[4], 
[127]]]).astype(np.int32), + np.array([[[1], [3]], [[2], [4]], + [[4], [127]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32) + ]) + feed_data = {"x": input_data} + + for step in range(10): + res = exe.run(program, feed=feed_data, fetch_list=fetch_list) + + if use_dist: + res = mpi_comm.gather(res) + if os.getenv("PADDLE_TRAINER_ID") == "0": + np.savetxt(file_name, np.array(res).flatten()) + else: + np.savetxt(file_name, np.array(res).flatten()) + + +if __name__ == "__main__": + file_name = sys.argv[1] + + use_dist = False + if 'PADDLE_TRAINER_ID' in os.environ: + from mpi4py import MPI + + DISTRIBUTED_COMM = MPI.COMM_WORLD + + def _get_comm(): + global DISTRIBUTED_COMM + if DISTRIBUTED_COMM is None: + raise RuntimeError( + "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first." + ) + return DISTRIBUTED_COMM + + mpi_comm = _get_comm() + use_dist = True + + Test(use_dist, file_name) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py index 3b2034ebe836c..af03480fbf698 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -86,7 +86,7 @@ def set_data_feed(self): self.feed_fp32 = {'in_0': data.astype(np.float32)} self.feed_fp16 = {'in_0': data.astype(np.float16)} data = np.random.choice([True, False], size=(2, 3, 1)) - self.assign_bool = data.astype(np.bool) + self.assign_bool = data.astype(np.bool_) @IPUOpTest.static_graph def build_model(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py index f82acb204f0a2..003c84c4c5ab0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py @@ -29,10 +29,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - - for x in attrs[0]["decrease_axis"]: - if x < 0: - return False + out_shape = list(inputs['input_data'].shape) for x in range(len(attrs[0]["axes"])): start = 0 end = 0 @@ -48,15 +45,20 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: end = attrs[0]["ends"][x] start = max(0, start) end = max(0, end) + out_shape[attrs[0]["axes"][x]] = end - start if start >= end: return False - + for x in attrs[0]["decrease_axis"]: + if x < 0: + return False + if (out_shape[x] != 1): + return False return True def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]]): - return np.ones([6, 6, 64, 64]).astype(np.float32) + return np.random.random([6, 6, 64, 64]).astype(np.float32) for axes in [[0, 1], [1, 3], [2, 3]]: for starts in [[0, 1]]: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py index 38ca6963e94b2..e8c283acc3b8f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py @@ -73,13 +73,13 @@ def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]], batch): if self.dims == 4: - return np.ones([batch, 3, 3, 
24]).astype(np.float32) + return np.random.random([batch, 3, 3, 24]).astype(np.float32) elif self.dims == 3: - return np.ones([batch, 3, 24]).astype(np.float32) + return np.random.random([batch, 3, 24]).astype(np.float32) elif self.dims == 2: - return np.ones([batch, 24]).astype(np.float32) + return np.random.random([batch, 24]).astype(np.float32) elif self.dims == 1: - return np.ones([24]).astype(np.float32) + return np.random.random([24]).astype(np.float32) def generate_AxisTensor(attrs: List[Dict[str, Any]]): return np.ones([1]).astype(np.int32) @@ -162,25 +162,33 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): if self.dims == 4: self.dynamic_shape.min_input_shape = { - "split_input": [1, 3, 3, 24] + "split_input": [1, 3 - 1, 3 - 1, 24 - 1] } self.dynamic_shape.max_input_shape = { - "split_input": [9, 3, 3, 24] + "split_input": [9, 3 + 1, 3 + 1, 24 + 1] } self.dynamic_shape.opt_input_shape = { "split_input": [1, 3, 3, 24] } elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"split_input": [1, 3, 24]} - self.dynamic_shape.max_input_shape = {"split_input": [9, 3, 24]} + self.dynamic_shape.min_input_shape = { + "split_input": [1, 3 - 1, 24 - 1] + } + self.dynamic_shape.max_input_shape = { + "split_input": [9, 3 + 1, 24 + 1] + } self.dynamic_shape.opt_input_shape = {"split_input": [1, 3, 24]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"split_input": [1, 24]} - self.dynamic_shape.max_input_shape = {"split_input": [9, 24]} + self.dynamic_shape.min_input_shape = { + "split_input": [1, 24 - 1] + } + self.dynamic_shape.max_input_shape = { + "split_input": [9, 24 + 1] + } self.dynamic_shape.opt_input_shape = {"split_input": [1, 24]} elif self.dims == 1: - self.dynamic_shape.min_input_shape = {"split_input": [24]} - self.dynamic_shape.max_input_shape = {"split_input": [24]} + self.dynamic_shape.min_input_shape = {"split_input": [24 - 1]} + self.dynamic_shape.max_input_shape = {"split_input": [24 + 1]} self.dynamic_shape.opt_input_shape = {"split_input": [24]} def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py new file mode 100644 index 0000000000000..28509d42ee30b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py @@ -0,0 +1,136 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertActivationTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.random.random([32]).astype(np.float32) + elif dims == 2: + return np.random.random([3, 32]).astype(np.float32) + elif dims == 3: + return np.random.random([3, 32, 32]).astype(np.float32) + else: + return np.random.random([batch, 3, 32, 32]).astype(np.float32) + + for dims in [2, 3, 4, 5]: + for batch in [1]: + for k in [1, 3]: + self.dims = dims + dics = [{"k": k}] + ops_config = [{ + "op_type": "top_k", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["output_data"], + "Indices": ["indices_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, dims, batch, dics)) + }, + outputs=["output_data", "indices_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 16, 16] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 4 + return 1, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + ## for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def test(self): + self.run_test() + 
+ +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py new file mode 100644 index 0000000000000..651cc00d2cd7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py @@ -0,0 +1,153 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertActivationTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + if len(inputs['input_data'].shape) <= attrs[0]['axis']: + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.random.random([3]).astype(np.float32) + elif dims == 2: + return np.random.random([3, 32]).astype(np.float32) + elif dims == 3: + return np.random.random([3, 32, 32]).astype(np.float32) + else: + return np.random.random([batch, 32, 32, 32]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + for batch in [1, 4]: + for k in [1, 3]: + for axis in [-1, 1, 2, 3]: + for largest in [True, False]: + for sort in [True, False]: + self.dims = dims + self.sort = sort + dics = [{ + "k": k, + "axis": axis, + "largest": largest, + "sorted": sort + }] + ops_config = [{ + "op_type": "top_k_v2", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["output_data"], + "Indices": ["indices_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, dims, batch, dics)) + }, + outputs=["output_data", "indices_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 1]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 10]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 1, 1]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64, 64]} + 
self.dynamic_shape.opt_input_shape = {"input_data": [3, 10, 10]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 16, 16] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 32, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 4 + if self.sort == False: + return 0, 4 + return 1, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt index 7ed1529ea4c6b..56ad5f710163a 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt @@ -6,7 +6,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_flags_mkldnn_ops_on_off PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py index 67d06e7b22c1b..04d6be1300154 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py @@ -64,7 +64,7 @@ def init_data(self): B_data = np.random.uniform(-127, 127, (5, 20)).astype(np.float32) - quant_B = np.round(B_data * self.scale_y[0]).astype(np.int) + quant_B = np.round(B_data * self.scale_y[0]).astype(np.int_) output = np.dot(A_data, quant_B) scale_output_shift = (self.scale_out) / \ @@ -136,7 +136,7 @@ def init_data(self): A_data_reshape = A_data.reshape(3 * 4, 4 * 3) B_data_reshape = B_data.reshape(2 * 6, 1 * 2 * 3) - quant_B = np.round(B_data_reshape * self.scale_y[0]).astype(np.int) + quant_B = np.round(B_data_reshape * self.scale_y[0]).astype(np.int_) output = np.dot(A_data_reshape, quant_B) scale_output_shift = (self.scale_out) / \ diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py index ca61f961b7a0a..9986726b3a601 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py @@ -47,7 +47,7 @@ 
def setUp(self): self.shape = self.get_x_shape() self.axis = self.get_axis() - x = np.random.uniform(0.1, 1, self.shape).astype(np.float) + x = np.random.uniform(0.1, 1, self.shape).astype(np.float64) out = convert_float_to_uint16( np.apply_along_axis(stable_softmax, self.axis, x)) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py new file mode 100644 index 0000000000000..b8c31578099e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py @@ -0,0 +1,663 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append('..') +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.nn.functional import interpolate +import paddle + +paddle.enable_static() + + +def bilinear_interp_np(input, + out_h, + out_w, + scale_w=0, + scale_h=0, + out_size=None, + actual_shape=None, + align_corners=True, + align_mode=0, + data_layout='NCHW'): + """bilinear interpolation implement in shape [N, C, H, W]""" + if data_layout == "NHWC": + input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW + if out_size is not None: + out_h = out_size[0] + out_w = out_size[1] + if actual_shape is not None: + out_h = actual_shape[0] + out_w = actual_shape[1] + batch_size, channel, in_h, in_w = input.shape + + ratio_h = ratio_w = 0.0 + if out_h > 1: + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h + if out_w > 1: + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((batch_size, channel, out_h, out_w)) + + for i in range(out_h): + if (align_mode == 0 and not align_corners): + h = int(ratio_h * (i + 0.5) - 0.5) + else: + h = int(ratio_h * i) + + h = max(0, h) + hid = 1 if h < in_h - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0) + h1lambda = idx_src_h - h + else: + h1lambda = ratio_h * i - h + h2lambda = 1.0 - h1lambda + for j in range(out_w): + if (align_mode == 0 and not align_corners): + w = int(ratio_w * (j + 0.5) - 0.5) + else: + w = int(ratio_w * j) + w = max(0, w) + wid = 1 if w < in_w - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0) + w1lambda = idx_src_w - w + else: + w1lambda = ratio_w * j - w + w2lambda = 1.0 - w1lambda + + out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + + w1lambda*input[:, :, h, w+wid]) + \ + h1lambda*(w2lambda*input[:, :, h+hid, w] + + w1lambda*input[:, :, h+hid, w+wid]) + + if data_layout == "NHWC": + out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC + + return out.astype(input.dtype) + + +class 
TestBilinearInterpOp(OpTest): + + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.dtype = "float32" + self.op_type = "bilinear_interp_v2" + input_np = np.random.random(self.input_shape).astype(self.dtype) + + if self.data_layout == "NCHW": + in_h = self.input_shape[2] + in_w = self.input_shape[3] + else: + in_h = self.input_shape[1] + in_w = self.input_shape[2] + scale_h = 0 + scale_w = 0 + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0.: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(in_h * scale_h) + out_w = int(in_w * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners, self.align_mode, + self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode, + 'data_layout': self.data_layout + } + + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0.: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase1(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase2(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase3(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestBilinearInterpCase4(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. 
+ self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase5(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase6(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([65, 33]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase7(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = [2.0, 0.5] + self.align_corners = False + self.align_mode = 1 + + +class TestBilinearInterpSame(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpActualShape(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpDataLayout(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 5, 5, 3] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 + self.data_layout = "NHWC" + + +class TestBilinearInterpOtherMethod1(TestBilinearInterpOp): + + def set_align_mode(self): + self.align_corners = False + self.align_mode = 1 + + +class TestBilinearInterpWithMethod2(TestBilinearInterpOp): + + def set_align_mode(self): + self.align_corners = False + self.align_mode = 0 + + +class TestBilinearInterpWithMethod3(TestBilinearInterpOp): + + def set_align_mode(self): + self.align_corners = True + self.align_mode = 0 + + +class TestBilinearInterpScale1(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 2. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale2(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 1. 
+ self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale3(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 1.5 + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale4(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = [1.5, 0.5] + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpZero(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 0.2 + self.align_corners = False + self.align_mode = 0 + + +class TestBilinearInterpOp_attr_tensor(OpTest): + + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "bilinear_interp_v2" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False + self.attrs = { + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + } + + input_np = np.random.random(self.input_shape).astype("float32") + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float32") + elif self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners) + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 5] + self.out_h = 3 + self.out_w = 3 + self.scale = 0. + self.out_size = [3, 3] + self.align_corners = True + + +# out_size is a 1-D tensor +class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. 
+        self.out_size = [8, 12]
+        self.align_corners = True
+
+
+# scale is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor):
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.shape_by_1Dtensor = True
+
+
+# scale is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor):
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.0
+        self.out_size = None
+        self.align_corners = True
+        self.scale_by_1Dtensor = True
+
+
+# TODO: comment this test out for now until bilinear_interp_op is added.
+# class TestBilinearInterpOpAPI(unittest.TestCase):
+#     def test_case(self):
+#         x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+
+#         dim = fluid.data(name="dim", shape=[1], dtype="int32")
+#         shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
+#         actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
+#         scale_tensor = fluid.data(
+#             name="scale_tensor", shape=[1], dtype="float32")
+
+#         out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12])
+#         out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim])
+#         out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor)
+#         out4 = fluid.layers.resize_bilinear(
+#             x, out_shape=[4, 4], actual_shape=actual_size)
+#         out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor)
+
+#         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
+#         dim_data = np.array([12]).astype("int32")
+#         shape_data = np.array([12, 12]).astype("int32")
+#         actual_size_data = np.array([12, 12]).astype("int32")
+#         scale_data = np.array([2.0]).astype("float32")
+
+#         if core.is_compiled_with_mlu():
+#             place = paddle.device.MLUPlace(0)
+#         else:
+#             place = core.CPUPlace()
+#         exe = fluid.Executor(place)
+#         exe.run(fluid.default_startup_program())
+#         results = exe.run(fluid.default_main_program(),
+#                           feed={
+#                               "x": x_data,
+#                               "dim": dim_data,
+#                               "shape_tensor": shape_data,
+#                               "actual_size": actual_size_data,
+#                               "scale_tensor": scale_data
+#                           },
+#                           fetch_list=[out1, out2, out3, out4, out5],
+#                           return_numpy=True)

+#         expect_res = bilinear_interp_np(
+#             x_data, out_h=12, out_w=12, align_corners=True)
+#         for res in results:
+#             self.assertTrue(np.allclose(res, expect_res))
+
+
+class TestBilinearInterpOpAPI_dy(unittest.TestCase):
+
+    def test_case(self):
+        import paddle
+        if core.is_compiled_with_mlu():
+            place = paddle.device.MLUPlace(0)
+        else:
+            place = core.CPUPlace()
+        with fluid.dygraph.guard(place):
+            input_data = np.random.random((2, 3, 6, 6)).astype("float32")
+            input_x = paddle.to_tensor(input_data)
+            expect_res = bilinear_interp_np(input_data,
+                                            out_h=12,
+                                            out_w=12,
+                                            align_corners=False)
+            out = interpolate(x=input_x,
+                              size=[12, 12],
+                              mode="bilinear",
+                              align_corners=False)
+            self.assertTrue(np.allclose(out.numpy(), expect_res))
+
+
+class TestBilinearInterpOpAPI_dy2(unittest.TestCase):
+
+    def test_case(self):
+        import paddle
+        if core.is_compiled_with_mlu():
+            place = paddle.device.MLUPlace(0)
+        else:
+            place = core.CPUPlace()
+        with fluid.dygraph.guard(place):
+            input_data = np.random.random((2, 3, 6, 6)).astype("float32")
+            size_np = np.array([12, 12]).astype("int64")
+            input_x = paddle.to_tensor(input_data)
+            
size = paddle.to_tensor(size_np) + expect_res = bilinear_interp_np(input_data, + out_h=12, + out_w=12, + align_corners=False) + out = interpolate(x=input_x, + size=size, + mode="bilinear", + align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +class TestBilinearInterpOpAPI_dy3(unittest.TestCase): + + def test_case(self): + import paddle + if core.is_compiled_with_mlu(): + place = paddle.device.MLUPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + size_1 = np.array([12]).astype("int64") + input_x = paddle.to_tensor(input_data) + size = paddle.to_tensor(size_1) + expect_res = bilinear_interp_np(input_data, + out_h=12, + out_w=12, + align_corners=False) + out = interpolate(x=input_x, + size=[size, size], + mode="bilinear", + align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +class TestBilinearInterpOpAPI_dy4(unittest.TestCase): + + def test_case(self): + import paddle + if core.is_compiled_with_mlu(): + place = paddle.device.MLUPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + scale_np = np.array([2, 2]).astype("int64") + input_x = paddle.to_tensor(input_data) + scale = paddle.to_tensor(scale_np) + expect_res = bilinear_interp_np(input_data, + out_h=12, + out_w=12, + align_corners=False) + out = interpolate(x=input_x, + scale_factor=scale, + mode="bilinear", + align_corners=False) + + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_depthwise_conv_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_depthwise_conv_mlu.py new file mode 100644 index 0000000000000..8d239732e7342 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_depthwise_conv_mlu.py @@ -0,0 +1,238 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle + +paddle.enable_static() +import paddle.fluid.core as core +import paddle.fluid as fluid +from op_test import OpTest +from paddle.fluid import Program, program_guard +from test_conv2d_op_mlu import TestConv2DOp, TestConv2DOp_v2, create_test_padding_SAME_class, create_test_padding_VALID_class, create_test_channel_last_class, create_test_fp16_class + +#----------------TestDepthwiseConv ----- + + +class TestDepthwiseConv(TestConv2DOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv2(TestConv2DOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv3(TestConv2DOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvandFuse(TestConv2DOp): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv2andFuse(TestConv2DOp): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv3andFuse(TestConv2DOp): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 0, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv2_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): 
+ self.pad = [0, 1, 0, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv3_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 0, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConvandFuse_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [2, 1, 2, 3] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv2andFuse_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 1, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv3andFuse_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 2, 0, 2] + self.padding_algorithm = "EXPLICIT" + + +# depthwise conv2d + +create_test_padding_SAME_class(TestDepthwiseConv_AsyPadding) +create_test_padding_SAME_class(TestDepthwiseConvandFuse_AsyPadding) + +create_test_padding_VALID_class(TestDepthwiseConv_AsyPadding) +create_test_padding_VALID_class(TestDepthwiseConvandFuse_AsyPadding) + +# channel last + +create_test_channel_last_class(TestDepthwiseConv_AsyPadding) +create_test_channel_last_class(TestDepthwiseConvandFuse_AsyPadding) + +create_test_fp16_class(TestDepthwiseConv_AsyPadding) +create_test_fp16_class(TestDepthwiseConvandFuse_AsyPadding) + +# TODO(MLU): Depthwise opration does not support dilation yet +# it will throw an error of CNNL_STATUS_NOT_SUPPORTED. + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py new file mode 100644 index 0000000000000..b4c74a99d85b7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py @@ -0,0 +1,241 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.core import ops
+
+paddle.enable_static()
+SEED = 2022
+
+
+class TestElementwiseMax(OpTest):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+
+        self.init_dtype()
+        np.random.seed(SEED)
+        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        out = np.maximum(x, y)
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(y)
+        }
+        self.attrs = {}
+        self.outputs = {'Out': out}
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+class TestElementwiseMaxFp16(OpTest):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+
+        self.init_dtype()
+        np.random.seed(SEED)
+        x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype)
+        y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype)
+        out = np.maximum(x, y)
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(y)
+        }
+        self.attrs = {}
+        self.outputs = {'Out': out}
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.__class__.no_need_check_grad = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+class TestElementwiseMaxInt32(TestElementwiseMax):
+
+    def init_dtype(self):
+        self.dtype = np.int32
+
+
+class TestTestElementwiseMax_Vector(TestElementwiseMax):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [100]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
+        }
+        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestTestElementwiseMax_broadcast_0(TestElementwiseMax):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [100, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': np.maximum(self.inputs['X'],
+                              self.inputs['Y'].reshape(100, 1, 1))
+        }
+
+
+class TestTestElementwiseMax_broadcast_1(TestElementwiseMax):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 100, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': np.maximum(self.inputs['X'],
+                              self.inputs['Y'].reshape(1, 100, 1))
+        }
+
+
+class TestTestElementwiseMax_broadcast_2(TestElementwiseMax):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 
100]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [100]).astype("float32") + } + + self.outputs = { + 'Out': np.maximum(self.inputs['X'], + self.inputs['Y'].reshape(1, 1, 100)) + } + + +class TestTestElementwiseMax_broadcast_3(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 10, 12, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [10, 12]).astype("float32") + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 10, 12, 1)) + } + + +class TestTestElementwiseMax_broadcast_4(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 50]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 1, 50]).astype("float32") + } + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestTestElementwiseMax_broadcast_5(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 20]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 1, 20]).astype("float32") + } + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestTestElementwiseMax_commonuse_1(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [1, 1, 100]).astype("float32"), + } + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestTestElementwiseMax_commonuse_2(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [30, 3, 1, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [30, 1, 4, 1]).astype("float32"), + } + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestTestElementwiseMax_xsize_lessthan_ysize(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [10, 12]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 10, 12]).astype("float32"), + } + + self.attrs = {'axis': 2} + + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py new file mode 100644 index 0000000000000..82aeb577205d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestActivation(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "exp" + self.init_dtype() + self.init_kernel_type() + self.python_api = paddle.exp + + np.random.seed(2049) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_dtype(self): + self.dtype = np.float32 + + def init_kernel_type(self): + pass + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + __class__.no_need_check_grad = True + + +class TestLog(TestActivation): + + def setUp(self): + self.set_mlu() + self.op_type = "log" + self.python_api = paddle.log + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_error(self): + in1 = fluid.layers.data(name="in1", + shape=[11, 17], + append_batch_size=False, + dtype="int32") + in2 = fluid.layers.data(name="in2", + shape=[11, 17], + append_batch_size=False, + dtype="int64") + + self.assertRaises(TypeError, fluid.layers.log, in1) + self.assertRaises(TypeError, fluid.layers.log, in2) + + +class TestLog2(TestActivation): + + def setUp(self): + self.set_mlu() + self.op_type = "log2" + self.python_api = paddle.log2 + self.init_dtype() + + x = np.random.uniform(1, 10, [11, 17]).astype(self.dtype) + out = np.log2(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_error(self): + in1 = paddle.static.data(name="in1", shape=[11, 17], dtype="int32") + in2 = paddle.static.data(name="in2", shape=[11, 17], dtype="int64") + + self.assertRaises(TypeError, paddle.log2, in1) + self.assertRaises(TypeError, paddle.log2, in2) + + def test_api(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_x = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + data_x = paddle.static.data(name="data_x", + shape=[11, 17], + dtype="float32") + + out1 = paddle.log2(data_x) + exe = paddle.static.Executor(place=fluid.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + res1 = exe.run(paddle.static.default_main_program(), + feed={"data_x": input_x}, + fetch_list=[out1]) + expected_res = np.log2(input_x) + self.assertTrue(np.allclose(res1, expected_res)) + + # dygraph + with fluid.dygraph.guard(): + np_x = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + data_x = paddle.to_tensor(np_x) + z = paddle.log2(data_x) + np_z = z.numpy() + z_expected = np.array(np.log2(np_x)) + np.savetxt("np_z.txt", np_z.flatten(), fmt="%.4f") + np.savetxt("z_expected.txt", z_expected.flatten(), fmt="%.4f") + self.assertTrue(np.allclose(np_z, z_expected, atol=1e-6)) + + +class TestLog10(TestActivation): + + def setUp(self): + self.set_mlu() + self.op_type = "log10" + self.python_api = paddle.log10 + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log10(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_error(self): + in1 = paddle.static.data(name="in1", shape=[11, 17], 
dtype="int32") + in2 = paddle.static.data(name="in2", shape=[11, 17], dtype="int64") + + self.assertRaises(TypeError, paddle.log10, in1) + self.assertRaises(TypeError, paddle.log10, in2) + + def test_api(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_x = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + data_x = paddle.static.data(name="data_x", + shape=[11, 17], + dtype="float32") + + out1 = paddle.log10(data_x) + exe = paddle.static.Executor(place=paddle.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + res1 = exe.run(paddle.static.default_main_program(), + feed={"data_x": input_x}, + fetch_list=[out1]) + expected_res = np.log10(input_x) + self.assertTrue(np.allclose(res1, expected_res)) + + # dygraph + with fluid.dygraph.guard(): + np_x = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + data_x = paddle.to_tensor(np_x) + z = paddle.log10(data_x) + np_z = z.numpy() + z_expected = np.array(np.log10(np_x)) + self.assertTrue(np.allclose(np_z, z_expected)) + + +class TestLogHalf(TestLog): + + def init_dtype(self): + self.dtype = np.float16 + + def test_api(self): + pass + + +class TestLog2Half(TestLog2): + + def init_dtype(self): + self.dtype = np.float16 + + def test_api(self): + pass + + +class TestLog10Half(TestLog10): + + def init_dtype(self): + self.dtype = np.float16 + + def test_api(self): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py index 17ef85dd2bd8a..2efa8823fdaf5 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py @@ -97,7 +97,6 @@ def init_dtype(self): def set_mlu(self): self.__class__.use_mlu = True self.place = paddle.device.MLUPlace(0) - self.__class__.no_need_check_grad = True class TestLookupTableV2Dim32(TestLookupTableV2): @@ -126,7 +125,6 @@ def init_dims(self): def set_mlu(self): self.__class__.use_mlu = True self.place = paddle.device.MLUPlace(0) - self.__class__.no_need_check_grad = True class TestLookupTableV2WithPadding(TestLookupTableV2): diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 7498fa72194d9..57e52206653c8 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -7,12 +7,13 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(WITH_ASCEND_CL) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) - endforeach(TEST_OP) + endforeach() # NOTE: NPU `get_float_status` read the value from register, During the test, - # it is found that this register will be overwritten by any program on the card. - # In order to prevent the interference of nan/inf in the other unittests, we - # need to set the unittests related to `float_status` to exclusive. + # it is found that this register will be overwritten by any program on the + # card. In order to prevent the interference of nan/inf in the other + # unittests, we need to set the unittests related to `float_status` to + # exclusive. 
set_tests_properties(test_amp_check_finite_and_scale_op_npu PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_flags_check_nan_inf_npu diff --git a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py index 7febcaba45cb4..d8f442c84411a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py @@ -193,7 +193,7 @@ def set_attrs(self): } if self.use_variance: self.attrs['variance'] = self.prior_box_var.astype( - np.float).flatten() + np.float64).flatten() if self.axis != 0: self.attrs['axis'] = self.axis diff --git a/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py index 7271644ce8294..14aec76af8b19 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py @@ -84,7 +84,7 @@ def init_test_params(self): self.batch_size = 10 self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, dtype=np.float64).flatten() self.clip = True self.num_priors = 0 diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py index f9f338a731079..dc7c6f0096bdf 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py @@ -48,7 +48,7 @@ def test_check_output(self): class TestFillZerosLikeOpBool(TestFillZerosLikeOp): def init_dtype(self): - self.dtype = np.bool + self.dtype = np.bool_ class TestFillZerosLikeOpFp16(TestFillZerosLikeOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py index cfd78c2b05b36..5290ad1c0d020 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py @@ -95,9 +95,9 @@ def init_test_params(self): self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.aspect_ratios = np.array(self.aspect_ratios, - dtype=np.float).flatten() + dtype=np.float64).flatten() self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, dtype=np.float64).flatten() self.clip = True self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py index 64f66476542da..5cedd90d2685e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py @@ -106,7 +106,7 @@ def setUp(self): } self.outputs = { 'Out': - self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(np.bool) + self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(np.bool_) } diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py index 85d1fe9478140..10aeea4dee5bf 100644 --- 
a/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py @@ -106,7 +106,7 @@ def setUp(self): } self.outputs = { 'Out': - self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(np.bool) + self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(np.bool_) } diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py index c32e105b02ade..9e3bc365cee2e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py @@ -129,7 +129,8 @@ def setUp(self): self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.BOOL)} self.outputs = { 'Out': - self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(np.bool) + self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype( + np.bool_) } diff --git a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py index 76fc5846534ac..e5fd042674204 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py @@ -72,7 +72,7 @@ class TestSizeOp4(TestSizeOp): def config(self): self.shape = [2**10] - self.dtype = np.bool + self.dtype = np.bool_ class TestSizeOp5(TestSizeOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py index b3d5fa9a6b5c9..bdc68d43a2241 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py @@ -199,7 +199,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def init_dtype(self): - self.dtype = np.bool + self.dtype = np.bool_ def initTestCase(self): self.real_op_type = np.random.choice(['triu', 'tril']) diff --git a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py index 21be9e295d2e1..48df4ad454aad 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py @@ -37,7 +37,7 @@ def setUp(self): self.init() fluid.core.globals()['FLAGS_min_loss_scaling'] = 1639 - found_inf = np.array([True], dtype=np.bool) + found_inf = np.array([True], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) i = np.random.randint(0, 1024, 1) j = np.random.randint(0, 1024, 1) diff --git a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py index 5299369ff1743..678e50247afc8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py @@ -34,7 +34,7 @@ def setUp(self): self.place = paddle.NPUPlace(0) self.init() - found_inf = np.array([False], dtype=np.bool) + found_inf = np.array([False], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) self.inputs = { @@ -82,7 +82,7 @@ def setUp(self): self.place = paddle.NPUPlace(0) self.init() - found_inf = np.array([True], dtype=np.bool) + found_inf = np.array([True], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) i = 
np.random.randint(0, 1024, 1) j = np.random.randint(0, 1024, 1) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index ded9f188472dd..ba694f5353083 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -471,7 +471,7 @@ def infer_dtype(numpy_dict, dtype_set): np.dtype(np.int16), np.dtype(np.int8), np.dtype(np.uint8), - np.dtype(np.bool) + np.dtype(np.bool_) ] # check the dtype in dtype_list in order, select the first dtype that in dtype_set for dtype in dtype_list: diff --git a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt index ab985d73d5387..cb566a41aaaab 100755 --- a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt @@ -8,4 +8,4 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) list(APPEND TEST_OPS ${TEST_OP}) set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50) -endforeach(TEST_OP) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt index 35a95749880bd..04773499b3591 100644 --- a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt @@ -6,7 +6,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() if(NOT WIN32) set_tests_properties(test_rnn_nets_static PROPERTIES TIMEOUT 120) set_tests_properties(test_rnn_nets PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt b/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt index 5c13f56d44646..a3f2059881bb8 100644 --- a/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt @@ -6,7 +6,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_sequence_conv PROPERTIES TIMEOUT 120) set_tests_properties(test_sequence_concat PROPERTIES TIMEOUT 120) set_tests_properties(test_sequence_pool PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py index 908af43e00825..e34da4c45a7c3 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py @@ -20,156 +20,11 @@ import paddle.fluid as fluid from test_dist_base import TestDistRunnerBase, runtime_main import paddle.distributed.fleet as fleet -import paddle.incubate.nn.functional as incubate_f - -from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype -from paddle.fluid.dygraph.layers import Layer -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid import core -from paddle.nn.initializer import Constant +from paddle.incubate.nn import FusedMultiHeadAttention paddle.enable_static() -def _set_var_distributed(var): - if var is None: - return - - var.is_distributed = True - - # NOTE: use current_block and find_var_recursive to support while_loop - startup_block = paddle.static.default_startup_program().current_block() - main_block = 
paddle.static.default_main_program().current_block() - startup_block._find_var_recursive(var.name).is_distributed = True - main_block._find_var_recursive(var.name).is_distributed = True - - -class ParallelFusedMultiHeadAttention(Layer): - - def __init__(self, - embed_dim, - num_heads, - dropout_rate=0.5, - attn_dropout_rate=0.5, - kdim=None, - vdim=None, - normalize_before=False, - need_weights=False, - qkv_weight_attr=None, - qkv_bias_attr=None, - linear_weight_attr=None, - linear_bias_attr=None, - pre_ln_scale_attr=None, - pre_ln_bias_attr=None, - ln_scale_attr=None, - ln_bias_attr=None, - epsilon=1e-5, - nranks=1, - ring_id=-1, - name=None): - super(ParallelFusedMultiHeadAttention, self).__init__() - - assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but received {}".format(embed_dim)) - assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but received {}".format(num_heads)) - - self.normalize_before = normalize_before - self._dtype = self._helper.get_default_dtype() - self._epsilon = epsilon - self._ring_id = ring_id - - self.embed_dim = embed_dim - self.num_heads = num_heads - self.head_dim = embed_dim // num_heads - self.kdim = kdim - self.vdim = vdim - self.need_weights = need_weights - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" - assert need_weights == False, "Only support need_weight is False now." - - # tensor model parallel - assert num_heads % nranks == 0 - num_heads = num_heads // nranks - - self.qkv_weight = self.create_parameter( - shape=[3, num_heads, self.head_dim, embed_dim], - attr=qkv_weight_attr, - dtype=self._dtype, - is_bias=False) - self.qkv_bias = self.create_parameter( - shape=[3, num_heads, self.head_dim], - attr=qkv_bias_attr, - dtype=self._dtype, - is_bias=True) - self.linear_weight = self.create_parameter( - shape=[num_heads * self.head_dim, embed_dim], - attr=linear_weight_attr, - dtype=self._dtype, - is_bias=False) - self.linear_bias = self.create_parameter(shape=[embed_dim], - attr=linear_bias_attr, - dtype=self._dtype, - is_bias=True) - - # tensor model parallel - if nranks > 1: - assert ring_id != -1 - # column parallel - _set_var_distributed(self.qkv_weight) - _set_var_distributed(self.qkv_bias) - # row parallel - _set_var_distributed(self.linear_weight) - - if normalize_before: - self.pre_ln_scale = self.create_parameter( - attr=pre_ln_scale_attr, - shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, - shape=[embed_dim], - is_bias=True) - self.ln_scale = None - self.ln_bias = None - else: - self.pre_ln_scale = None - self.pre_ln_bias = None - self.ln_scale = self.create_parameter( - attr=ln_scale_attr, - shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.ln_bias = self.create_parameter(attr=ln_bias_attr, - shape=[embed_dim], - is_bias=True) - - self.dropout_rate = dropout_rate - self.attn_dropout_rate = attn_dropout_rate - - self.name = name - - def forward(self, query, key=None, value=None, attn_mask=None, cache=None): - out = incubate_f.fused_multi_head_attention( - x=query, - qkv_weight=self.qkv_weight, - linear_weight=self.linear_weight, - pre_layer_norm=self.normalize_before, - pre_ln_scale=self.pre_ln_scale, - pre_ln_bias=self.pre_ln_bias, - ln_scale=self.ln_scale, - ln_bias=self.ln_bias, - pre_ln_epsilon=self._epsilon, - qkv_bias=self.qkv_bias, - linear_bias=self.linear_bias, - attn_mask=attn_mask, - dropout_rate=self.dropout_rate, - attn_dropout_rate=self.attn_dropout_rate, 
- ln_epsilon=self._epsilon, - training=self.training, - ring_id=self._ring_id, - name=self.name) - return out - - def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer(weight)) @@ -208,40 +63,40 @@ def create_model(data, rank): qkv_w_attr, qkv_b_attr = get_param_attr(col_qkv_w, col_qkv_b) linear_w_attr, linear_b_attr = get_param_attr(row_linear_w, linear_b) - attn = ParallelFusedMultiHeadAttention(hidden, - n_head, - dropout_rate=0.0, - attn_dropout_rate=0.0, - normalize_before=False, - qkv_weight_attr=qkv_w_attr, - qkv_bias_attr=qkv_b_attr, - linear_weight_attr=linear_w_attr, - linear_bias_attr=linear_b_attr, - pre_ln_scale_attr=pre_ln_w_attr, - pre_ln_bias_attr=pre_ln_b_attr, - ln_scale_attr=pre_ln_w_attr, - ln_bias_attr=pre_ln_b_attr, - nranks=MODEL_PARALLEL_SIZE, - ring_id=0) + attn = FusedMultiHeadAttention(hidden, + n_head, + dropout_rate=0.0, + attn_dropout_rate=0.0, + normalize_before=False, + qkv_weight_attr=qkv_w_attr, + qkv_bias_attr=qkv_b_attr, + linear_weight_attr=linear_w_attr, + linear_bias_attr=linear_b_attr, + pre_ln_scale_attr=pre_ln_w_attr, + pre_ln_bias_attr=pre_ln_b_attr, + ln_scale_attr=pre_ln_w_attr, + ln_bias_attr=pre_ln_b_attr, + nranks=MODEL_PARALLEL_SIZE, + ring_id=0) result = attn(data) else: pre_ln_w_attr, pre_ln_b_attr = get_param_attr(pre_ln_w, pre_ln_b) qkv_w_attr, qkv_b_attr = get_param_attr(qkv_w, qkv_b) linear_w_attr, linear_b_attr = get_param_attr(linear_w, linear_b) - attn = ParallelFusedMultiHeadAttention(hidden, - n_head, - dropout_rate=0.0, - attn_dropout_rate=0.0, - normalize_before=False, - qkv_weight_attr=qkv_w_attr, - qkv_bias_attr=qkv_b_attr, - linear_weight_attr=linear_w_attr, - linear_bias_attr=linear_b_attr, - pre_ln_scale_attr=pre_ln_w_attr, - pre_ln_bias_attr=pre_ln_b_attr, - ln_scale_attr=pre_ln_w_attr, - ln_bias_attr=pre_ln_b_attr) + attn = FusedMultiHeadAttention(hidden, + n_head, + dropout_rate=0.0, + attn_dropout_rate=0.0, + normalize_before=False, + qkv_weight_attr=qkv_w_attr, + qkv_bias_attr=qkv_b_attr, + linear_weight_attr=linear_w_attr, + linear_bias_attr=linear_b_attr, + pre_ln_scale_attr=pre_ln_w_attr, + pre_ln_bias_attr=pre_ln_b_attr, + ln_scale_attr=pre_ln_w_attr, + ln_bias_attr=pre_ln_b_attr) result = attn(data) predict = paddle.sum(result) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py index a5af3cd877c53..d2144e201dc22 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py @@ -20,11 +20,7 @@ import paddle.fluid as fluid from test_dist_base import TestDistRunnerBase, runtime_main import paddle.distributed.fleet as fleet - -from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype -from paddle.fluid.dygraph.layers import Layer -from paddle.fluid.layer_helper import LayerHelper -from paddle.nn.initializer import Constant +from paddle.incubate.nn import FusedFeedForward paddle.enable_static() @@ -34,239 +30,6 @@ OUT_SIZE = 2 * MODEL_PARALLEL_SIZE -def fused_feedforward(x, - linear1_weight, - linear2_weight, - linear1_bias=None, - linear2_bias=None, - ln1_scale=None, - ln1_bias=None, - ln2_scale=None, - ln2_bias=None, - dropout1_rate=0.5, - dropout2_rate=0.5, - activation="relu", - ln1_epsilon=1e-5, - ln2_epsilon=1e-5, - pre_layer_norm=False, - training=True, - mode='upscale_in_train', - ring_id=-1, - 
name=None): - seed = None - if mode not in ('downscale_in_infer', 'upscale_in_train'): - raise ValueError( - "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" - ) - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer - - helper = LayerHelper("fused_feedforward") - dtype = x.dtype - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'fused_feedforward') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], - 'fused_feedforward') - - out = helper.create_variable_for_type_inference(x.dtype) - dropout1_mask = helper.create_variable_for_type_inference( - 'uint8', stop_gradient=True) - dropout2_mask = helper.create_variable_for_type_inference( - 'uint8', stop_gradient=True) - ln1_mean = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln1_variance = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln2_mean = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln2_variance = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - linear1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - dropout1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - dropout2_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - - if (seed is None or seed == 0) and helper.main_program.random_seed != 0: - seed = helper.main_program.random_seed - - helper.append_op(type='fused_feedforward', - inputs={ - 'X': x, - 'Linear1Weight': linear1_weight, - 'Linear1Bias': linear1_bias, - 'Linear2Weight': linear2_weight, - 'Linear2Bias': linear2_bias, - 'Ln1Scale': ln1_scale, - 'Ln1Bias': ln1_bias, - 'Ln2Scale': ln2_scale, - 'Ln2Bias': ln2_bias, - }, - outputs={ - 'Out': out, - 'Dropout1Mask': dropout1_mask, - 'Dropout2Mask': dropout2_mask, - 'Ln1Mean': ln1_mean, - 'Ln1Variance': ln1_variance, - 'Ln2Mean': ln2_mean, - 'Ln2Variance': ln2_variance, - 'Linear1Out': linear1_out, - 'Ln1Out': ln1_out, - 'Dropout1Out': dropout1_out, - 'Dropout2Out': dropout2_out, - }, - attrs={ - 'dropout1_rate': dropout1_rate, - 'dropout2_rate': dropout2_rate, - 'act_method': activation, - 'pre_layer_norm': pre_layer_norm, - 'ln1_epsilon': ln1_epsilon, - 'ln2_epsilon': ln2_epsilon, - 'dropout1_is_test': not training, - 'dropout2_is_test': not training, - 'dropout1_fix_seed': seed is not None, - 'dropout2_fix_seed': seed is not None, - 'dropout1_seed': seed if seed is not None else 0, - 'dropout2_seed': seed if seed is not None else 0, - 'dropout1_implementation': mode, - 'dropout2_implementation': mode, - 'ring_id': ring_id, - }) - return out - - -def _set_var_distributed(var): - if var is None: - return - - var.is_distributed = True - - # NOTE: use current_block and find_var_recursive to support while_loop - startup_block = paddle.static.default_startup_program().current_block() - main_block = paddle.static.default_main_program().current_block() - startup_block._find_var_recursive(var.name).is_distributed = True - main_block._find_var_recursive(var.name).is_distributed = True - - -class ParallelFusedFeedForward(Layer): - - def __init__(self, - d_model, - dim_feedforward, - dropout_rate=0.1, - epsilon=1e-05, - activation="relu", - act_dropout_rate=None, - normalize_before=False, - linear1_weight_attr=None, - linear1_bias_attr=None, - linear2_weight_attr=None, - linear2_bias_attr=None, - ln1_scale_attr=None, 
- ln1_bias_attr=None, - ln2_scale_attr=None, - ln2_bias_attr=None, - nranks=1, - ring_id=-1, - name=None): - super(ParallelFusedFeedForward, self).__init__() - assert d_model > 0, ( - "Expected d_model to be greater than 0, but received {}".format( - d_model)) - assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but received {}". - format(dim_feedforward)) - - self._dtype = self._helper.get_default_dtype() - self._d_model = d_model - - assert dim_feedforward % nranks == 0 - dim_feedforward = dim_feedforward // nranks - self._dim_feedforward = dim_feedforward - self._dropout_rate = dropout_rate - self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate - self._act_method = activation - self._normalize_before = normalize_before - self._epsilon = epsilon - self._ring_id = ring_id - - self._linear1_weight = self.create_parameter( - shape=[d_model, dim_feedforward], - attr=linear1_weight_attr, - dtype=self._dtype, - is_bias=False) - self._linear1_bias = self.create_parameter(shape=[dim_feedforward], - attr=linear1_bias_attr, - dtype=self._dtype, - is_bias=True) - - self._linear2_weight = self.create_parameter( - shape=[dim_feedforward, d_model], - attr=linear2_weight_attr, - dtype=self._dtype, - is_bias=False) - - self._linear2_bias = self.create_parameter(shape=[d_model], - attr=linear2_bias_attr, - dtype=self._dtype, - is_bias=True) - - if nranks > 1: - assert ring_id != -1 - # column parallel - _set_var_distributed(self._linear1_weight) - _set_var_distributed(self._linear1_bias) - _set_var_distributed(self._linear2_weight) - - if normalize_before: - self._ln1_scale = self.create_parameter( - shape=[d_model], - attr=ln1_scale_attr, - is_bias=False, - default_initializer=Constant(1.0)) - self._ln1_bias = self.create_parameter(shape=[d_model], - attr=ln1_bias_attr, - is_bias=True) - self._ln2_scale = None - self._ln2_bias = None - else: - self._ln1_bias = None - self._ln2_bias = None - self._ln2_scale = self.create_parameter( - shape=[d_model], - attr=ln2_scale_attr, - is_bias=False, - default_initializer=Constant(1.0)) - self._ln2_bias = self.create_parameter(shape=[d_model], - attr=ln2_bias_attr, - is_bias=True) - - self.name = name - - def forward(self, src, cache=None): - out = fused_feedforward(src, - self._linear1_weight, - self._linear2_weight, - self._linear1_bias, - self._linear2_bias, - self._ln1_scale, - self._ln1_bias, - self._ln2_scale, - self._ln2_bias, - dropout1_rate=self._act_dropout_rate, - dropout2_rate=self._dropout_rate, - activation=self._act_method, - ln1_epsilon=self._epsilon, - ln2_epsilon=self._epsilon, - pre_layer_norm=self._normalize_before, - training=self.training, - ring_id=self._ring_id, - name=self.name) - return out - - def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer(weight)) @@ -295,19 +58,19 @@ def create_model(data, rank): w0_attr, b0_attr = get_param_attr(col_w0, col_b0) w1_attr, b1_attr = get_param_attr(row_w1, b1) - ffn = ParallelFusedFeedForward(IN_SIZE, - OUT_SIZE, - dropout_rate=0.0, - activation='gelu', - normalize_before=True, - linear1_weight_attr=w0_attr, - linear1_bias_attr=b0_attr, - linear2_weight_attr=w1_attr, - linear2_bias_attr=b1_attr, - ln1_scale_attr=ln_w_attr, - ln1_bias_attr=ln_b_attr, - nranks=MODEL_PARALLEL_SIZE, - ring_id=0) + ffn = FusedFeedForward(IN_SIZE, + OUT_SIZE, + dropout_rate=0.0, + activation='gelu', + normalize_before=True, + linear1_weight_attr=w0_attr, + linear1_bias_attr=b0_attr, + 
linear2_weight_attr=w1_attr, + linear2_bias_attr=b1_attr, + ln1_scale_attr=ln_w_attr, + ln1_bias_attr=ln_b_attr, + nranks=MODEL_PARALLEL_SIZE, + ring_id=0) #ffn.eval() result = ffn(data) else: @@ -315,17 +78,17 @@ def create_model(data, rank): w0_attr, b0_attr = get_param_attr(w0, b0) w1_attr, b1_attr = get_param_attr(w1, b1) - ffn = ParallelFusedFeedForward(IN_SIZE, - OUT_SIZE, - dropout_rate=0.0, - activation='gelu', - normalize_before=True, - linear1_weight_attr=w0_attr, - linear1_bias_attr=b0_attr, - linear2_weight_attr=w1_attr, - linear2_bias_attr=b1_attr, - ln1_scale_attr=ln_w_attr, - ln1_bias_attr=ln_b_attr) + ffn = FusedFeedForward(IN_SIZE, + OUT_SIZE, + dropout_rate=0.0, + activation='gelu', + normalize_before=True, + linear1_weight_attr=w0_attr, + linear1_bias_attr=b0_attr, + linear2_weight_attr=w1_attr, + linear2_bias_attr=b1_attr, + ln1_scale_attr=ln_w_attr, + ln1_bias_attr=ln_b_attr) #ffn.eval() result = ffn(data) diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index c35d7940a8a1c..31afb85750e8c 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -145,7 +145,7 @@ def test_assign_LoDTensorArray(self): def test_assign_NumpyArray(self): with fluid.dygraph.guard(): - array = np.random.random(size=(100, 10)).astype(np.bool) + array = np.random.random(size=(100, 10)).astype(np.bool_) result1 = paddle.zeros(shape=[3, 3], dtype='float32') paddle.assign(array, result1) self.assertTrue(np.allclose(result1.numpy(), array)) diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index b99892c65e19f..873684576326a 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -35,7 +35,7 @@ def bipartite_match(distance, match_indices, match_dist): match_sorted = sorted(match_pair, key=lambda tup: tup[2], reverse=True) - row_indices = -1 * np.ones((row, ), dtype=np.int) + row_indices = -1 * np.ones((row, ), dtype=np.int_) idx = 0 for i, j, dist in match_sorted: @@ -69,7 +69,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None): """ n = len(lod) m = distance.shape[1] - match_indices = -1 * np.ones((n, m), dtype=np.int) + match_indices = -1 * np.ones((n, m), dtype=np.int_) match_dist = np.zeros((n, m), dtype=np.float32) cur_offset = 0 for i in range(n): diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 63df37f912259..ee064963b2f22 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -235,7 +235,7 @@ def setUp(self): self.attrs = { 'code_type': 'decode_center_size', 'box_normalized': False, - 'variance': prior_box_var.astype(np.float).flatten(), + 'variance': prior_box_var.astype(np.float64).flatten(), 'axis': axis } self.outputs = {'OutputBox': output_box} diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index 06432e4b00720..a893b65f5a421 100755 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -249,8 +249,8 @@ def test_bool_api_4(self): op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) - 
input_x = np.array([True, False, True]).astype(np.bool) - input_y = np.array([True, True, False]).astype(np.bool) + input_x = np.array([True, False, True]).astype(np.bool_) + input_y = np.array([True, True, False]).astype(np.bool_) real_result = callback(input_x, input_y) res, = exe.run(feed={ "x": input_x, @@ -267,8 +267,8 @@ def test_bool_broadcast_api_4(self): op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool) - input_y = np.array([True]).astype(np.bool) + input_x = np.array([True, False, True]).astype(np.bool_) + input_y = np.array([True]).astype(np.bool_) real_result = callback(input_x, input_y) res, = exe.run(feed={ "x": input_x, diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py new file mode 100644 index 0000000000000..445211d35a1d9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import unittest +import numpy as np +from paddle.device.cuda.graphs import wrap_cuda_graph, is_cuda_graph_supported, cuda_graph_transform + +paddle.enable_static() + + +class SimpleModel(nn.Layer): + + def __init__(self, in_size, out_size): + super(SimpleModel, self).__init__() + self.linear = nn.Linear(in_size, out_size) + self.dropout_1 = paddle.nn.Dropout(0.1) + self.relu = nn.ReLU() + self.dropout_2 = paddle.nn.Dropout(0.5) + self.gelu = nn.GELU() + + def forward(self, x): + x = self.linear(x) + x = self.dropout_1(x) + x = self.relu(x) + x = self.dropout_2(x) + x = self.gelu(x) + return x + + +class TestCudaGraphAttrAll(unittest.TestCase): + + def setUp(self): + paddle.set_flags({'FLAGS_eager_delete_tensor_gb': 0.0}) + + def get_model(self, use_cuda_graph=False): + x = paddle.static.data(shape=[3, 10], dtype='float32', name='x') + + model_start = SimpleModel(10, 20) + if use_cuda_graph: + model_start = wrap_cuda_graph(model_start) + + model_inter = SimpleModel(20, 20) + + model_end = SimpleModel(20, 10) + if use_cuda_graph: + model_end = wrap_cuda_graph(model_end, memory_pool='new') + + start_out = model_start(x) + inter_out = model_inter(start_out) + end_out = model_end(inter_out) + loss = paddle.mean(end_out) + + opt = paddle.optimizer.SGD() + opt.minimize(loss) + + return loss + + def run_with_cuda_graph(self, x_data): + # run with cuda graph + paddle.seed(1024) + + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + + with paddle.static.program_guard(main_prog, start_prog): + loss = self.get_model(use_cuda_graph=True) + + section_programs = cuda_graph_transform(main_prog) + assert len(section_programs) == 4 + + block = main_prog.global_block() + run_program_op_num = 0 + for op in block.ops: + if op.type == 'run_program': + 
run_program_op_num += 1 + assert run_program_op_num == 4 + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + exe.run(start_prog) + + for i in range(10): + rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss]) + + return rst + + def normal_run(self, x_data): + # run without cuda graph + paddle.seed(1024) + + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + + with paddle.static.program_guard(main_prog, start_prog): + loss = self.get_model() + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + exe.run(start_prog) + + for i in range(10): + rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss]) + + return rst + + def test_static_mode_cuda_graph(self): + if not is_cuda_graph_supported(): + return + x_data = np.random.random((3, 10)).astype('float32') + cuda_graph_rst = self.run_with_cuda_graph(x_data) + normal_run_rst = self.normal_run(x_data) + assert np.array_equal(cuda_graph_rst, normal_run_rst) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py index c00e75882943f..4b5d283aa512a 100644 --- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py @@ -70,7 +70,7 @@ def init_test_params(self): self.batch_size = 10 self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, dtype=np.float64).flatten() self.clip = True self.num_priors = 0 diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 727fcb28cc211..4cafd19d913b3 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -17,14 +17,22 @@ import os import sys import time +import tempfile import subprocess import unittest + import numpy as np import paddle class TestDirectory(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def get_import_command(self, module): paths = module.split('.') if len(paths) == 1: @@ -77,7 +85,7 @@ def test_new_directory(self): 'paddle.static.nn.spectral_norm', 'paddle.static.nn.embedding' ] - import_file = 'run_import_modules.py' + import_file = os.path.join(self.temp_dir.name, 'run_import_modules.py') with open(import_file, "w") as wb: for module in new_directory: @@ -137,7 +145,8 @@ def test_old_directory(self): 'paddle.declarative.spectral_norm', 'paddle.declarative.embedding' ] - import_file = 'run_old_import_modules.py' + import_file = os.path.join(self.temp_dir.name, + 'run_old_import_modules.py') with open(import_file, "w") as wb: cmd_context_count = """ diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py index c36950b6922fe..e34d04be927cc 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py @@ -39,7 +39,9 @@ def setUp(self): 'Out': out, "InnerCache": [('cache_' + str(i), np.array([1.0])) - for i in range(len(self.operands))] + for i in range(len(self.operands))], + "XShape": [('xshape_' + str(i), np.array([1.0])) + for i in range(len(self.operands))], } def init_input(self): @@ -48,14 +50,13 @@ def init_input(self): 
self.inputs.append(np.random.random(s).astype(t)) def set_mandatory(self): - self.disable = False self.shapes = [(10, 10, 20), (20, 6)] self.types = [np.float64, np.float64] self.equation = "mij,jk->ki" def test_check_output(self): if not self.disable: - self.check_output(no_check_set=["InnerCache"]) + self.check_output(no_check_set=["InnerCache", "XShape"]) def test_grad(self): if not self.disable: diff --git a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py index 29bfca4dd786b..2482ab0c549db 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py @@ -27,7 +27,7 @@ def compare(ref, res, atol, rtol): ref = np.array(ref).flatten() res = np.array(res).flatten() - tmp_ref = ref.astype(np.float) + tmp_ref = ref.astype(np.float64) tol = atol + rtol * abs(tmp_ref) diff = abs(res - ref) diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py index 89689942a0274..bf8983eee842f 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py @@ -83,7 +83,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias, if ln_bias is None: has_bias = False - if (pre_layer_norm): + if pre_layer_norm: ln_out = layer_norm(query, True, has_bias, ln_scale, ln_bias) num_head = qkv_weight.shape[1] @@ -97,7 +97,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias, if qkv_bias is not None: qkv_bias = qkv_bias.reshape(qkv_bias.shape[0] * qkv_bias.shape[1] * qkv_bias.shape[2]) - if (pre_layer_norm): + if pre_layer_norm: ln_out = ln_out.reshape(batch_size * seq_len, embed_dim) qkv = fc(ln_out, qkv_weight) if qkv_bias is not None: @@ -239,12 +239,12 @@ def run_imperative(self): attn_mask_tensor = paddle.to_tensor(self.attn_mask) else: attn_mask_tensor = None - fused_attn = FusedMultiHeadAttention(self.embed_dim, self.num_heads, - self.dropout_prob, - self.attn_dropout_prob, self.kdim, - self.vdim, self.pre_layer_norm, - self.need_weight, self.weight_attr, - self.bias_attr) + fused_attn = FusedMultiHeadAttention( + self.embed_dim, self.num_heads, self.dropout_prob, + self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm, + self.need_weight, self.weight_attr, self.bias_attr, + self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr, + self.weight_attr, self.bias_attr) if self.bias_attr is not False: qkv_bias = np.random.random( fused_attn.qkv_bias.shape).astype('float32') @@ -260,13 +260,19 @@ def run_imperative(self): if self.bias_attr is not False: fused_attn_qkv_bias = fused_attn.qkv_bias.numpy() fused_attn_linear_bias = fused_attn.linear_bias.numpy() - fused_attn_pre_ln_bias = fused_attn.pre_ln_bias.numpy() - fused_attn_ln_bias = fused_attn.ln_bias.numpy() + if self.pre_layer_norm: + fused_attn_pre_ln_bias = fused_attn.pre_ln_bias.numpy() + fused_attn_ln_bias = None + else: + fused_attn_pre_ln_bias = None + fused_attn_ln_bias = fused_attn.ln_bias.numpy() ref_out = compute_reference( self.pre_layer_norm, self.query, self.attn_mask, - fused_attn.pre_ln_scale.numpy(), fused_attn_pre_ln_bias, - fused_attn.ln_scale.numpy(), fused_attn_ln_bias, + fused_attn.pre_ln_scale.numpy() if self.pre_layer_norm else None, + fused_attn_pre_ln_bias, + fused_attn.ln_scale.numpy() if not 
self.pre_layer_norm else None, + fused_attn_ln_bias, fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias, fused_attn.linear_weight.numpy(), fused_attn_linear_bias) np.testing.assert_allclose(ref_out, @@ -275,12 +281,12 @@ def run_imperative(self): atol=self.atol) def run_static(self): - fused_attn = FusedMultiHeadAttention(self.embed_dim, self.num_heads, - self.dropout_prob, - self.attn_dropout_prob, self.kdim, - self.vdim, self.pre_layer_norm, - self.need_weight, self.weight_attr, - self.bias_attr) + fused_attn = FusedMultiHeadAttention( + self.embed_dim, self.num_heads, self.dropout_prob, + self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm, + self.need_weight, self.weight_attr, self.bias_attr, + self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr, + self.weight_attr, self.bias_attr) x = paddle.static.data( name='X', @@ -304,58 +310,118 @@ def run_static(self): qkv_bias = None linear_bias = None + ln_scale = None + ln_2_scale = None ln_bias = None ln_2_bias = None if self.has_attn_mask: if self.bias_attr is False: - out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run( - paddle.static.default_main_program(), - feed={ - "X": self.query, - "SrcMask": self.attn_mask - }, - fetch_list=[ - final_out, fused_attn.qkv_weight, - fused_attn.linear_weight, fused_attn.pre_ln_scale, - fused_attn.ln_scale - ]) + if self.pre_layer_norm: + out, qkv_weight, out_linear_weight, ln_scale = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + "SrcMask": self.attn_mask + }, + fetch_list=[ + final_out, + fused_attn.qkv_weight, + fused_attn.linear_weight, + fused_attn.pre_ln_scale, + ]) + else: + out, qkv_weight, out_linear_weight, ln_2_scale = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + "SrcMask": self.attn_mask + }, + fetch_list=[ + final_out, fused_attn.qkv_weight, + fused_attn.linear_weight, fused_attn.ln_scale + ]) else: - out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run( - paddle.static.default_main_program(), - feed={ - "X": self.query, - "SrcMask": self.attn_mask - }, - fetch_list=[ - final_out, fused_attn.qkv_weight, fused_attn.qkv_bias, - fused_attn.linear_weight, fused_attn.linear_bias, - fused_attn.pre_ln_scale, fused_attn.pre_ln_bias, - fused_attn.ln_scale, fused_attn.ln_bias - ]) + if self.pre_layer_norm: + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + "SrcMask": self.attn_mask + }, + fetch_list=[ + final_out, + fused_attn.qkv_weight, + fused_attn.qkv_bias, + fused_attn.linear_weight, + fused_attn.linear_bias, + fused_attn.pre_ln_scale, + fused_attn.pre_ln_bias, + ]) + else: + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + "SrcMask": self.attn_mask + }, + fetch_list=[ + final_out, fused_attn.qkv_weight, + fused_attn.qkv_bias, fused_attn.linear_weight, + fused_attn.linear_bias, fused_attn.ln_scale, + fused_attn.ln_bias + ]) else: if self.bias_attr is False: - out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run( - paddle.static.default_main_program(), - feed={ - "X": self.query, - }, - fetch_list=[ - final_out, fused_attn.qkv_weight, - fused_attn.linear_weight, fused_attn.pre_ln_scale, - fused_attn.ln_scale - ]) + if self.pre_layer_norm: + out, qkv_weight, out_linear_weight, ln_scale = exe.run( + 
paddle.static.default_main_program(), + feed={ + "X": self.query, + }, + fetch_list=[ + final_out, + fused_attn.qkv_weight, + fused_attn.linear_weight, + fused_attn.pre_ln_scale, + ]) + else: + out, qkv_weight, out_linear_weight, ln_2_scale = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + }, + fetch_list=[ + final_out, fused_attn.qkv_weight, + fused_attn.linear_weight, fused_attn.ln_scale + ]) else: - out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run( - paddle.static.default_main_program(), - feed={ - "X": self.query, - }, - fetch_list=[ - final_out, fused_attn.qkv_weight, fused_attn.qkv_bias, - fused_attn.linear_weight, fused_attn.linear_bias, - fused_attn.pre_ln_scale, fused_attn.pre_ln_bias, - fused_attn.ln_scale, fused_attn.ln_bias - ]) + if self.pre_layer_norm: + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + }, + fetch_list=[ + final_out, + fused_attn.qkv_weight, + fused_attn.qkv_bias, + fused_attn.linear_weight, + fused_attn.linear_bias, + fused_attn.pre_ln_scale, + fused_attn.pre_ln_bias, + ]) + else: + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + }, + fetch_list=[ + final_out, fused_attn.qkv_weight, + fused_attn.qkv_bias, fused_attn.linear_weight, + fused_attn.linear_bias, fused_attn.ln_scale, + fused_attn.ln_bias + ]) return out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias def test_static_api(self): diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index ffe6fa8d41aa0..ecfc8a5bc292c 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -164,7 +164,7 @@ def generate_input_data(self): self.attn_mask = (self.attn_mask - 1.0) * 1e4 else: self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e4 - elif self.attn_mask_type == np.bool: + elif self.attn_mask_type == np.bool_: if self.has_cache_kv and not self.gen_cache_kv: self.attn_mask[:, :, :, -2] = 0 else: @@ -395,7 +395,7 @@ def GetFusedMultiTransformerOut(self): epsilon = 1e-05 ln2_epsilon = 1e-05 - if attn_mask is not None and self.attn_mask_type != np.bool: + if attn_mask is not None and self.attn_mask_type != np.bool_: attn_mask = _convert_attention_mask(attn_mask, x.dtype) qkv_weights, qkv_biases = [], [] diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py index 8414cd941c207..a86fb0fc4596c 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py @@ -79,8 +79,8 @@ def poly2mask(xy, k, h, w): u.extend([int(xs + s * t + .5) for t in ts]) k = len(u) - x = np.zeros((k), np.int) - y = np.zeros((k), np.int) + x = np.zeros((k), np.int_) + y = np.zeros((k), np.int_) m = 0 for j in six.moves.xrange(1, k): if u[j] != u[j - 1]: @@ -116,7 +116,7 @@ def poly2mask(xy, k, h, w): b[m - 1] += a[j] j += 1 mask = decode(b, m) - mask = np.array(mask, dtype=np.int).reshape((w, h)) + mask = np.array(mask, dtype=np.int_).reshape((w, h)) mask = mask.transpose((1, 0)) return mask diff 
--git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index 822a0fcc449dd..3da576045c587 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -26,13 +26,6 @@ from paddle.fluid.dygraph import Linear from paddle.fluid.framework import _test_eager_guard -# Can use Amusic dataset as the DeepCF describes. -DATA_PATH = os.environ.get('DATA_PATH', '') - -BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 128)) -NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5)) -NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1)) - class DMF(fluid.Layer): @@ -129,84 +122,90 @@ def forward(self, users, items): return prediction -def get_data(): - user_ids = [] - item_ids = [] - labels = [] - NUM_USERS = 100 - NUM_ITEMS = 1000 - matrix = np.zeros([NUM_USERS, NUM_ITEMS], dtype=np.float32) +class TestDygraphDeepCF(unittest.TestCase): - for uid in range(NUM_USERS): - for iid in range(NUM_ITEMS): - label = float(random.randint(1, 6) == 1) + def setUp(self): + # Can use Amusic dataset as the DeepCF describes. + self.data_path = os.environ.get('DATA_PATH', '') + + self.batch_size = int(os.environ.get('BATCH_SIZE', 128)) + self.num_batches = int(os.environ.get('NUM_BATCHES', 5)) + self.num_epoches = int(os.environ.get('NUM_EPOCHES', 1)) + + def get_data(self): + user_ids = [] + item_ids = [] + labels = [] + NUM_USERS = 100 + NUM_ITEMS = 1000 + matrix = np.zeros([NUM_USERS, NUM_ITEMS], dtype=np.float32) + + for uid in range(NUM_USERS): + for iid in range(NUM_ITEMS): + label = float(random.randint(1, 6) == 1) + user_ids.append(uid) + item_ids.append(iid) + labels.append(label) + matrix[uid, iid] = label + indices = np.arange(len(user_ids)) + np.random.shuffle(indices) + users_np = np.array(user_ids, dtype=np.int32)[indices] + items_np = np.array(item_ids, dtype=np.int32)[indices] + labels_np = np.array(labels, dtype=np.float32)[indices] + return np.expand_dims(users_np, -1), \ + np.expand_dims(items_np, -1), \ + np.expand_dims(labels_np, -1), NUM_USERS, NUM_ITEMS, matrix + + def load_data(self): + sys.stderr.write('loading from %s\n' % self.data_path) + likes = dict() + num_users = -1 + num_items = -1 + with open(self.data_path, 'r') as f: + for l in f.readlines(): + uid, iid, rating = [int(v) for v in l.split('\t')] + num_users = max(num_users, uid + 1) + num_items = max(num_items, iid + 1) + if float(rating) > 0.0: + likes[(uid, iid)] = 1.0 + + user_ids = [] + item_ids = [] + labels = [] + matrix = np.zeros([num_users, num_items], dtype=np.float32) + for uid, iid in likes.keys(): user_ids.append(uid) item_ids.append(iid) - labels.append(label) - matrix[uid, iid] = label - indices = np.arange(len(user_ids)) - np.random.shuffle(indices) - users_np = np.array(user_ids, dtype=np.int32)[indices] - items_np = np.array(item_ids, dtype=np.int32)[indices] - labels_np = np.array(labels, dtype=np.float32)[indices] - return np.expand_dims(users_np, -1), \ - np.expand_dims(items_np, -1), \ - np.expand_dims(labels_np, -1), NUM_USERS, NUM_ITEMS, matrix - - -def load_data(DATA_PATH): - sys.stderr.write('loading from %s\n' % DATA_PATH) - likes = dict() - num_users = -1 - num_items = -1 - with open(DATA_PATH, 'r') as f: - for l in f.readlines(): - uid, iid, rating = [int(v) for v in l.split('\t')] - num_users = max(num_users, uid + 1) - num_items = max(num_items, iid + 1) - if float(rating) > 0.0: - likes[(uid, iid)] = 1.0 - - user_ids = [] - item_ids = [] - labels = [] - 
matrix = np.zeros([num_users, num_items], dtype=np.float32) - for uid, iid in likes.keys(): - user_ids.append(uid) - item_ids.append(iid) - labels.append(1.0) - matrix[uid, iid] = 1.0 - - negative = 0 - while negative < 3: - nuid = random.randint(0, num_users - 1) - niid = random.randint(0, num_items - 1) - if (nuid, niid) not in likes: - negative += 1 - user_ids.append(nuid) - item_ids.append(niid) - labels.append(0.0) - - indices = np.arange(len(user_ids)) - np.random.shuffle(indices) - users_np = np.array(user_ids, dtype=np.int32)[indices] - items_np = np.array(item_ids, dtype=np.int32)[indices] - labels_np = np.array(labels, dtype=np.float32)[indices] - return np.expand_dims(users_np, -1), \ - np.expand_dims(items_np, -1), \ - np.expand_dims(labels_np, -1), num_users, num_items, matrix - - -class TestDygraphDeepCF(unittest.TestCase): + labels.append(1.0) + matrix[uid, iid] = 1.0 + + negative = 0 + while negative < 3: + nuid = random.randint(0, num_users - 1) + niid = random.randint(0, num_items - 1) + if (nuid, niid) not in likes: + negative += 1 + user_ids.append(nuid) + item_ids.append(niid) + labels.append(0.0) + + indices = np.arange(len(user_ids)) + np.random.shuffle(indices) + users_np = np.array(user_ids, dtype=np.int32)[indices] + items_np = np.array(item_ids, dtype=np.int32)[indices] + labels_np = np.array(labels, dtype=np.float32)[indices] + return np.expand_dims(users_np, -1), \ + np.expand_dims(items_np, -1), \ + np.expand_dims(labels_np, -1), num_users, num_items, matrix def test_deefcf(self): seed = 90 - if DATA_PATH: + if self.data_path: (users_np, items_np, labels_np, num_users, num_items, - matrix) = load_data(DATA_PATH) + matrix) = self.load_data() else: (users_np, items_np, labels_np, num_users, num_items, - matrix) = get_data() + matrix) = self.get_data() paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) startup = fluid.Program() @@ -228,17 +227,19 @@ def test_deefcf(self): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) exe.run(startup) - for e in range(NUM_EPOCHES): + for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) - for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): - if slice + BATCH_SIZE >= users_np.shape[0]: + for slice in range(0, self.batch_size * self.num_batches, + self.batch_size): + if slice + self.batch_size >= users_np.shape[0]: break static_loss = exe.run( main, feed={ - users.name: users_np[slice:slice + BATCH_SIZE], - items.name: items_np[slice:slice + BATCH_SIZE], - labels.name: labels_np[slice:slice + BATCH_SIZE] + users.name: users_np[slice:slice + self.batch_size], + items.name: items_np[slice:slice + self.batch_size], + labels.name: + labels_np[slice:slice + self.batch_size] }, fetch_list=[loss])[0] sys.stderr.write('static loss %s\n' % static_loss) @@ -250,18 +251,20 @@ def test_deefcf(self): deepcf = DeepCF(num_users, num_items, matrix) adam = fluid.optimizer.AdamOptimizer( 0.01, parameter_list=deepcf.parameters()) - for e in range(NUM_EPOCHES): + for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) - for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): - if slice + BATCH_SIZE >= users_np.shape[0]: + for slice in range(0, self.batch_size * self.num_batches, + self.batch_size): + if slice + self.batch_size >= users_np.shape[0]: break prediction = deepcf( - to_variable(users_np[slice:slice + BATCH_SIZE]), - to_variable(items_np[slice:slice + BATCH_SIZE])) + to_variable(users_np[slice:slice + self.batch_size]), + 
to_variable(items_np[slice:slice + self.batch_size])) loss = fluid.layers.reduce_sum( fluid.layers.log_loss( prediction, - to_variable(labels_np[slice:slice + BATCH_SIZE]))) + to_variable(labels_np[slice:slice + + self.batch_size]))) loss.backward() adam.minimize(loss) deepcf.clear_gradients() @@ -276,18 +279,20 @@ def test_deefcf(self): adam2 = fluid.optimizer.AdamOptimizer( 0.01, parameter_list=deepcf2.parameters()) fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - for e in range(NUM_EPOCHES): + for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) - for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): - if slice + BATCH_SIZE >= users_np.shape[0]: + for slice in range(0, self.batch_size * self.num_batches, + self.batch_size): + if slice + self.batch_size >= users_np.shape[0]: break prediction2 = deepcf2( - to_variable(users_np[slice:slice + BATCH_SIZE]), - to_variable(items_np[slice:slice + BATCH_SIZE])) + to_variable(users_np[slice:slice + self.batch_size]), + to_variable(items_np[slice:slice + self.batch_size])) loss2 = fluid.layers.reduce_sum( fluid.layers.log_loss( prediction2, - to_variable(labels_np[slice:slice + BATCH_SIZE]))) + to_variable(labels_np[slice:slice + + self.batch_size]))) loss2.backward() adam2.minimize(loss2) deepcf2.clear_gradients() @@ -306,19 +311,22 @@ def test_deefcf(self): adam = fluid.optimizer.AdamOptimizer( 0.01, parameter_list=deepcf.parameters()) - for e in range(NUM_EPOCHES): + for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) - for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): - if slice + BATCH_SIZE >= users_np.shape[0]: + for slice in range(0, self.batch_size * self.num_batches, + self.batch_size): + if slice + self.batch_size >= users_np.shape[0]: break prediction = deepcf( - to_variable(users_np[slice:slice + BATCH_SIZE]), - to_variable(items_np[slice:slice + BATCH_SIZE])) + to_variable(users_np[slice:slice + + self.batch_size]), + to_variable(items_np[slice:slice + + self.batch_size])) loss = fluid.layers.reduce_sum( fluid.layers.log_loss( prediction, to_variable(labels_np[slice:slice + - BATCH_SIZE]))) + self.batch_size]))) loss.backward() adam.minimize(loss) deepcf.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index 6acab36221fa2..28d24f4b5b703 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -177,4 +177,5 @@ def test_gnn_float32(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py index f8f04229a4de8..a076b69cc0020 100644 --- a/python/paddle/fluid/tests/unittests/test_input_spec.py +++ b/python/paddle/fluid/tests/unittests/test_input_spec.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
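Several tests touched by this diff (test_directory_migration above, test_input_spec below) stop writing scratch files into the source checkout and route them through a per-test temporary directory instead. A minimal sketch of that pattern, with hypothetical names rather than code from the diff:

```python
import os
import tempfile
import unittest


class ExampleTempDirTest(unittest.TestCase):
    """Hypothetical test showing the TemporaryDirectory setUp/tearDown pattern."""

    def setUp(self):
        self.temp_dir = tempfile.TemporaryDirectory()

    def tearDown(self):
        # Deletes the directory and every file the test wrote into it.
        self.temp_dir.cleanup()

    def test_write_scratch_file(self):
        path = os.path.join(self.temp_dir.name, "net_example.txt")
        with open(path, "w") as f:
            f.write("scratch output stays out of the repository checkout")
        self.assertTrue(os.path.exists(path))
```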
+import os import unittest +import tempfile import numpy as np + import paddle import paddle.fluid as fluid from paddle.static import InputSpec @@ -160,6 +163,10 @@ def setUp(self): self.out_num = 16 self.x_spec = paddle.static.InputSpec([-1, 16], name='x') self.x = paddle.randn([4, 16]) + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() @classmethod def setUpClass(cls): @@ -182,7 +189,7 @@ def test_non_tensor_list(self): self.check_result(specs, 'list') def check_result(self, specs, path): - path = './net_non_tensor_' + path + path = os.path.join(self.temp_dir.name, './net_non_tensor_', path) net = NetWithNonTensorSpec(self.in_num, self.out_num) net.eval() @@ -218,7 +225,7 @@ def test_spec_compatible(self): net = paddle.jit.to_static(net, input_spec=specs) net.eval() - path = './net_twice' + path = os.path.join(self.temp_dir.name, './net_twice') # NOTE: check input_specs_compatible new_specs = [self.x_spec, True, "bn", 10] @@ -264,6 +271,7 @@ def setUp(self): self.y_spec = paddle.static.InputSpec([16], name='y') self.x = paddle.randn([4, 16]) self.y = paddle.randn([16]) + self.temp_dir = tempfile.TemporaryDirectory() @classmethod def setUpClass(cls): @@ -271,7 +279,7 @@ def setUpClass(cls): def test_non_tensor_with_prune(self): specs = [self.x_spec, self.y_spec, True] - path = './net_non_tensor_prune_' + path = os.path.join(self.temp_dir.name, './net_non_tensor_prune_') net = NetWithNonTensorSpecPrune(self.in_num, self.out_num) net.eval() diff --git a/python/paddle/fluid/tests/unittests/test_kron_op.py b/python/paddle/fluid/tests/unittests/test_kron_op.py index 61b5b92c007e9..51a9cbc63d60f 100644 --- a/python/paddle/fluid/tests/unittests/test_kron_op.py +++ b/python/paddle/fluid/tests/unittests/test_kron_op.py @@ -151,7 +151,7 @@ def init_grad_input_output(self): self.grad_y = self.get_grad_y_by_numpy() def get_grad_x_by_numpy(self): - grad_x = np.zeros(self.x_shape, np.complex) + grad_x = np.zeros(self.x_shape, np.complex128) for x_i in range(self.x_shape[0]): for x_j in range(self.x_shape[1]): for i in range(self.y_shape[0]): @@ -163,7 +163,7 @@ def get_grad_x_by_numpy(self): return grad_x def get_grad_y_by_numpy(self): - grad_y = np.zeros(self.y_shape, np.complex) + grad_y = np.zeros(self.y_shape, np.complex128) for y_i in range(self.y_shape[0]): for y_j in range(self.y_shape[1]): for x_i in range(self.x_shape[0]): diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index d29f47c8ab11d..f17bffe3b86ee 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
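Note: the test_input_spec.py hunks above follow a pattern applied throughout this patch: files written by a test go into a per-test temporary directory created in setUp and removed in tearDown, so nothing is left behind in the working directory. A self-contained sketch of the pattern, with a plain file write standing in for paddle.jit.save:

.. code-block:: python

    import os
    import tempfile
    import unittest

    class TestWithTempDir(unittest.TestCase):

        def setUp(self):
            self.temp_dir = tempfile.TemporaryDirectory()

        def tearDown(self):
            self.temp_dir.cleanup()

        def test_save(self):
            # Stand-in for paddle.jit.save(net, path) used by the real tests.
            path = os.path.join(self.temp_dir.name, 'net_non_tensor_')
            with open(path, 'w') as f:
                f.write('dummy')
            self.assertTrue(os.path.exists(path))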
-import paddle +import os +import json +import tempfile import unittest +import warnings import numpy + +import paddle import paddle.nn.functional as F -import tempfile -import warnings -import json -import os from paddle.fluid.framework import _enable_legacy_dygraph _enable_legacy_dygraph() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 2d6cdac4854f7..e2ed2d8003a46 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py @@ -181,10 +181,11 @@ def _collate_fn(sample_list): for i in range(10): indices_queue.put([i, i + 10]) indices_queue.put(None) + base_seed = 1234 _worker_loop(loader._dataset, 0, indices_queue, loader._data_queue, loader._workers_done_event, True, _collate_fn, True, _init_fn, 0, 1, - loader._use_shared_memory) + loader._use_shared_memory, base_seed) self.assertTrue(False) except AssertionError: pass @@ -223,10 +224,11 @@ def _collate_fn(sample_list): indices_queue.put([i, i + 10]) indices_queue.put(None) loader._workers_done_event.set() + base_seed = 1234 _worker_loop(loader._dataset, 0, indices_queue, loader._data_queue, loader._workers_done_event, True, _collate_fn, True, _init_fn, 0, 1, - loader._use_shared_memory) + loader._use_shared_memory, base_seed) self.assertTrue(True) except AssertionError: pass diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py index 0c6e2476be324..31e28fe478707 100644 --- a/python/paddle/fluid/tests/unittests/test_ones_like.py +++ b/python/paddle/fluid/tests/unittests/test_ones_like.py @@ -41,7 +41,7 @@ def test_api(self): # 'bool', 'float32', 'float64', 'int32', 'int64' out1 = ones_like(x) - out2 = ones_like(x, np.bool) + out2 = ones_like(x, np.bool_) out3 = ones_like(x, 'float64') out4 = ones_like(x, 'int32') out5 = ones_like(x, 'int64') @@ -54,7 +54,7 @@ def test_api(self): fetch_list=[out1, out2, out3, out4, out5]) for i, dtype in enumerate( - [np.float32, np.bool, np.float64, np.int32, np.int64]): + [np.float32, np.bool_, np.float64, np.int32, np.int64]): self.assertEqual(outs[i].dtype, dtype) self.assertEqual((outs[i] == np.ones(shape, dtype)).all(), True) @@ -67,7 +67,7 @@ def test_out(self): 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() paddle.disable_static(place) x = paddle.to_tensor(np.ones(shape)) - for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]: + for dtype in [np.bool_, np.float32, np.float64, np.int32, np.int64]: out = ones_like(x, dtype) self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True) diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index b70b69ca97c3d..490167a8ff796 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -14,6 +14,8 @@ from __future__ import print_function +import os +import tempfile import unittest import paddle.fluid as fluid @@ -29,8 +31,6 @@ from paddle.io import Dataset import numpy -paddle.enable_static() - class TestOptimizer(unittest.TestCase): @@ -1279,6 +1279,12 @@ class TestMasterWeightSaveForFP16(unittest.TestCase): Master weights will be saved by optimizer::state_dict. 
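Note: most dtype edits in this patch replace NumPy's deprecated Python-builtin aliases (np.bool, np.float, np.int, np.complex, np.object), which have emitted DeprecationWarning since NumPy 1.20 and are removed in NumPy 1.24, with the explicit dtype names used in the added lines:

.. code-block:: python

    import numpy as np

    found_inf = np.array([False], dtype=np.bool_)        # was: np.bool
    variances = np.array([0.1, 0.2], dtype=np.float64)   # was: np.float
    grad_x = np.zeros((2, 3), dtype=np.complex128)       # was: np.complex
    is_object = np.array([1, 2]).dtype == np.object_     # was: np.object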
''' + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def check_with_opt_state_dict(self, use_save_load=True): paddle.seed(100) numpy.random.seed(100) @@ -1340,10 +1346,12 @@ def __len__(self): optimizer.clear_grad(set_to_zero=False) if use_save_load and i == 5: - paddle.save(model.state_dict(), "model.pdparams") - paddle.save(optimizer.state_dict(), "opt.pdopt") - model.set_state_dict(paddle.load("model.pdparams")) - optimizer.set_state_dict(paddle.load("opt.pdopt")) + model_path = os.path.join(self.temp_dir.name, "model.pdparams") + optimizer_path = os.path.join(self.temp_dir.name, "opt.pdopt") + paddle.save(model.state_dict(), model_path) + paddle.save(optimizer.state_dict(), optimizer_path) + model.set_state_dict(paddle.load(model_path)) + optimizer.set_state_dict(paddle.load(optimizer_path)) return loss.numpy() @@ -1359,4 +1367,5 @@ def test_with_state_dict(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py index 2e18f8b748efd..b0aaaec246f67 100644 --- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py @@ -81,9 +81,9 @@ def init_test_params(self): self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.aspect_ratios = np.array(self.aspect_ratios, - dtype=np.float).flatten() + dtype=np.float64).flatten() self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, dtype=np.float64).flatten() self.clip = True self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index d6fabb44b4fe2..8d0fcc7ae22ab 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -965,7 +965,7 @@ def test_dygraph(self): paddle.disable_static() for place in self.places: with fluid.dygraph.guard(place): - np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool) + np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool_) x = fluid.layers.assign(np_x) x = fluid.layers.cast(x, 'bool') @@ -1021,7 +1021,7 @@ def test_dygraph(self): paddle.disable_static() for place in self.places: with fluid.dygraph.guard(place): - np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool) + np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool_) x = fluid.layers.assign(np_x) x = fluid.layers.cast(x, 'bool') diff --git a/python/paddle/fluid/tests/unittests/test_signal.py b/python/paddle/fluid/tests/unittests/test_signal.py index 8257630cf2071..670b3aa40df4d 100644 --- a/python/paddle/fluid/tests/unittests/test_signal.py +++ b/python/paddle/fluid/tests/unittests/test_signal.py @@ -81,7 +81,7 @@ def normalize(S, norm=np.inf, axis=0, threshold=None, fill=None): raise Exception("Input must be finite") # All norms only depend on magnitude, let's do that first - mag = np.abs(S).astype(np.float) + mag = np.abs(S).astype(np.float64) # For max/min norms, filling with 1 works fill_norm = 1 @@ -598,8 +598,8 @@ def rand_x(dims=1, np.random.randint(min_dim_len, max_dim_len) for i in range(dims) ] if complex: - return np.random.randn(*shape).astype( - dtype) + 1.j * np.random.randn(*shape).astype(dtype) + return 
np.random.randn( + *shape).astype(dtype) + 1.j * np.random.randn(*shape).astype(dtype) else: return np.random.randn(*shape).astype(dtype) diff --git a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py index a2ccfa925ed81..59b652b1d3d67 100644 --- a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py +++ b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import numpy as np +import tempfile +import unittest + import paddle import paddle.fluid as fluid -import unittest import paddle.nn as nn -import os class SimpleFCLayer(nn.Layer): @@ -54,6 +56,10 @@ def setUp(self): self.fc_size = 2 self.layer = self._train_simple_net() self.type_str = 'class' + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_trace_err(self): if fluid.framework.in_dygraph_mode(): @@ -122,7 +128,7 @@ def test_save_inference_model_err(self): dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace( self.layer, [in_x]) - path = './traced_layer_err_msg' + path = os.path.join(self.temp_dir.name, './traced_layer_err_msg') with self.assertRaises(TypeError) as e: traced_layer.save_inference_model([0]) self.assertEqual( @@ -193,11 +199,15 @@ class TestTracedLayerSaveInferenceModel(unittest.TestCase): """test save_inference_model will automaticlly create non-exist dir""" def setUp(self): - self.save_path = "./nonexist_dir/fc" + self.temp_dir = tempfile.TemporaryDirectory() + self.save_path = os.path.join(self.temp_dir.name, "./nonexist_dir/fc") import shutil if os.path.exists(os.path.dirname(self.save_path)): shutil.rmtree(os.path.dirname(self.save_path)) + def tearDown(self): + self.temp_dir.cleanup() + def test_mkdir_when_input_path_non_exist(self): if fluid.framework.in_dygraph_mode(): return diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py index c1294628a4e71..0d31dad81997e 100644 --- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py +++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py @@ -24,7 +24,7 @@ class TestUpdateLossScalingOp(OpTest): def setUp(self): self.op_type = "update_loss_scaling" self.init() - found_inf = np.array([False], dtype=np.bool) + found_inf = np.array([False], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) self.inputs = { @@ -66,7 +66,7 @@ class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): def setUp(self): self.op_type = "update_loss_scaling" self.init() - found_inf = np.array([True], dtype=np.bool) + found_inf = np.array([True], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) i = np.random.randint(0, 1024, 1) j = np.random.randint(0, 1024, 1) diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py index 3be1fb85565f7..13911dff01601 100644 --- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py @@ -43,7 +43,7 @@ def test_api(self): with program_guard(train_program, startup_program): x = paddle.fluid.data('X', shape) out1 = zeros_like(x) - out2 = zeros_like(x, np.bool) + out2 = zeros_like(x, np.bool_) out3 = zeros_like(x, 'float64') out4 = zeros_like(x, 'int32') out5 = 
zeros_like(x, 'int64') @@ -54,7 +54,7 @@ def test_api(self): feed={'X': np.ones(shape).astype('float32')}, fetch_list=[out1, out2, out3, out4, out5]) for (i, dtype) in enumerate( - [np.float32, np.bool, np.float64, np.int32, np.int64]): + [np.float32, np.bool_, np.float64, np.int32, np.int64]): self.assertEqual(outs[i].dtype, dtype) self.assertEqual((outs[i] == np.zeros(shape, dtype)).all(), True) @@ -71,7 +71,7 @@ def test_out(self): if core.is_compiled_with_cuda() else fluid.CPUPlace()) paddle.disable_static(place) x = paddle.to_tensor(np.ones(shape)) - for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]: + for dtype in [np.bool_, np.float32, np.float64, np.int32, np.int64]: out = zeros_like(x, dtype) self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), True) diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt index 233c4e6143615..6267526f33c12 100644 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt @@ -21,11 +21,11 @@ list(REMOVE_ITEM TEST_OPS test_mean_op_xpu) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() foreach(TEST_OP ${DIST_TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py index a4175ec25cf1b..f33da83bae7a1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py @@ -240,8 +240,8 @@ def test_bool_api_4(self): op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool) - input_y = np.array([True, True, False]).astype(np.bool) + input_x = np.array([True, False, True]).astype(np.bool_) + input_y = np.array([True, True, False]).astype(np.bool_) real_result = callback(input_x, input_y) res, = exe.run(feed={ "x": input_x, @@ -258,8 +258,8 @@ def test_bool_broadcast_api_4(self): op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool) - input_y = np.array([True]).astype(np.bool) + input_x = np.array([True, False, True]).astype(np.bool_) + input_y = np.array([True]).astype(np.bool_) real_result = callback(input_x, input_y) res, = exe.run(feed={ "x": input_x, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py index c8fcffbd3d33d..32dd28f73851d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py @@ -98,9 +98,10 @@ def init_test_params(self): self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.aspect_ratios = np.array(self.aspect_ratios, - dtype=np.float).flatten() + dtype=np.float64).flatten() self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, + dtype=np.float64).flatten() self.clip = True self.num_priors = len(self.real_aspect_ratios) * 
len(self.min_sizes) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py index 0aecc48fe3506..5ed10d159ae05 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py @@ -31,7 +31,7 @@ class TestUpdateLossScalingOp(XPUOpTest): def setUp(self): self.op_type = "update_loss_scaling" self.init() - found_inf = np.array([False], dtype=np.bool) + found_inf = np.array([False], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) self.inputs = { @@ -75,7 +75,7 @@ class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): def setUp(self): self.op_type = "update_loss_scaling" self.init() - found_inf = np.array([True], dtype=np.bool) + found_inf = np.array([True], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) i = np.random.randint(0, 1024, 1) j = np.random.randint(0, 1024, 1) diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index ab7e135adc6c4..87c38a46692de 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -45,6 +45,7 @@ def fused_feedforward(x, pre_layer_norm=False, training=True, mode='upscale_in_train', + ring_id=-1, name=None): r""" This is a fusion operator to compute feed forward layer in transformer model architecture. @@ -88,6 +89,7 @@ def fused_feedforward(x, - train: out = input * mask - inference: out = input * (1.0 - p) + ring_id (int, optional): For distributed forward in tensor model parallel, only support NCCL. Default is -1, means not using tensor parallel. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -132,7 +134,8 @@ def fused_feedforward(x, "dropout1_fix_seed", seed is not None, "dropout2_fix_seed", seed is not None, "dropout1_seed", seed if seed is not None else 0, "dropout2_seed", seed if seed is not None else 0, - 'dropout1_implementation', mode, 'dropout2_implementation', mode) + 'dropout1_implementation', mode, 'dropout2_implementation', mode, + 'ring_id', ring_id) return out helper = LayerHelper("fused_feedforward") @@ -206,7 +209,8 @@ def fused_feedforward(x, 'dropout1_seed': seed if seed is not None else 0, 'dropout2_seed': seed if seed is not None else 0, 'dropout1_implementation': mode, - 'dropout2_implementation': mode + 'dropout2_implementation': mode, + 'ring_id': ring_id, }) return out diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index 595b1d27fea8b..f52cbd2cd3ef4 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -101,12 +101,12 @@ def forward(self, x, residual): Applies fused_bias_dropout_residual_layer_norm operation. Parameters: - x (Tensor): The input tensor. It is a tensor with shape - `[batch_size, seq_len, embed_dim]`. The data type should be - float32 or float64. - residual (Tensor, optional): The residual tensor. It is a tensor - with shape `[batch_size, value_length, vdim]`. The data type - should be float32 or float64. + x (Tensor): The input tensor. It is a tensor with shape + `[batch_size, seq_len, embed_dim]`. The data type should be + float32 or float64. 
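Note: the new ring_id argument to fused_feedforward selects the NCCL ring of the tensor-model-parallel group; the default of -1 keeps single-card behaviour. A rough usage sketch, assuming a CUDA build of Paddle and abbreviating the argument list to the input and the two linear weights (the bias, layer-norm, and dropout arguments keep their defaults):

.. code-block:: python

    import paddle
    import paddle.incubate.nn.functional as incubate_f

    x = paddle.randn([2, 8, 64])    # [batch_size, seq_len, d_model]
    w1 = paddle.randn([64, 256])    # d_model -> dim_feedforward
    w2 = paddle.randn([256, 64])    # dim_feedforward -> d_model

    # ring_id=-1 (the default) means no tensor parallelism; a model-parallel
    # caller passes the ring id of its communication group instead.
    out = incubate_f.fused_feedforward(x, w1, w2, ring_id=-1)
    print(out.shape)  # [2, 8, 64]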
+ residual (Tensor, optional): The residual tensor. It is a tensor + with shape `[batch_size, value_length, vdim]`. The data type + should be float32 or float64. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ @@ -158,15 +158,39 @@ class FusedMultiHeadAttention(Layer): (True) or post_layer_norm architecture (False). Default False. need_weights (bool, optional): Indicate whether to return the attention weights. Now, only False is supported. Default False. - weight_attr(ParamAttr, optional): To specify the weight parameter property. - Default: None, which means the default weight parameter property is used. - See usage for details in :code:`ParamAttr`. - bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. - Default: None, which means the default bias parameter property is used. - If it is set to False, this layer will not have trainable bias parameter. - See usage for details in :code:`ParamAttr`. + qkv_weight_attr(ParamAttr, optional): To specify the weight parameter property + for QKV projection computation. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + qkv_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for QKV projection computation. The `False` value means the corresponding layer + would not have trainable bias parameter. Default: None, which means the + default bias parameter property is used. See usage for details in :code:`ParamAttr`. + linear_weight_attr(ParamAttr, optional): To specify the weight parameter property + for linear projection computation. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + linear_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for linear projection computation. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + pre_ln_scale_attr(ParamAttr, optional): To specify the weight parameter property + for pre_layer_norm computation. Otherwise, all layers both use it as + `attr` to create parameters. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + pre_ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for pre_layer_norm computation. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + ln_scale_attr(ParamAttr, optional): To specify the weight parameter property + for post_layer_norm computation. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for post_layer_norm computation. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. epsilon (float, optional): The small value added to the variance to prevent division by zero. Default: 1e-05. + nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel. 
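Note: with the attribute split documented above, every internal parameter of FusedMultiHeadAttention can be given its own ParamAttr, and tensor model parallelism is configured through nranks and ring_id. A construction sketch (unspecified attributes fall back to their defaults; running the layer requires a CUDA build):

.. code-block:: python

    import paddle
    from paddle.incubate.nn import FusedMultiHeadAttention

    init = paddle.nn.initializer.XavierUniform()
    attn = FusedMultiHeadAttention(
        embed_dim=128,
        num_heads=8,
        qkv_weight_attr=paddle.ParamAttr(initializer=init),
        linear_weight_attr=paddle.ParamAttr(initializer=init),
        nranks=1,    # >1 splits num_heads across the tensor-parallel group
        ring_id=-1)  # NCCL ring id of that group; -1 means no parallelism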
+ ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel. Examples: @@ -191,9 +215,17 @@ def __init__(self, vdim=None, normalize_before=False, need_weights=False, - weight_attr=None, - bias_attr=None, + qkv_weight_attr=None, + qkv_bias_attr=None, + linear_weight_attr=None, + linear_bias_attr=None, + pre_ln_scale_attr=None, + pre_ln_bias_attr=None, + ln_scale_attr=None, + ln_bias_attr=None, epsilon=1e-5, + nranks=1, + ring_id=-1, name=None): super(FusedMultiHeadAttention, self).__init__() @@ -204,9 +236,8 @@ def __init__(self, self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() - self._weight_attr = weight_attr - self._bias_attr = bias_attr self._epsilon = epsilon + self._ring_id = ring_id self.embed_dim = embed_dim self.num_heads = num_heads @@ -215,41 +246,61 @@ def __init__(self, self.vdim = vdim self.need_weights = need_weights assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" - assert need_weights == False, "Only support need_weight is False now." + assert need_weights is False, "Only support need_weight is False now." + + # tensor model parallel + assert num_heads % nranks == 0 + num_heads = num_heads // nranks self.qkv_weight = self.create_parameter( shape=[3, num_heads, self.head_dim, embed_dim], - attr=self._weight_attr, + attr=qkv_weight_attr, dtype=self._dtype, is_bias=False) self.qkv_bias = self.create_parameter( shape=[3, num_heads, self.head_dim], - attr=self._bias_attr, + attr=qkv_bias_attr, dtype=self._dtype, is_bias=True) - self.linear_weight = self.create_parameter(shape=[embed_dim, embed_dim], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) + self.linear_weight = self.create_parameter( + shape=[num_heads * self.head_dim, embed_dim], + attr=linear_weight_attr, + dtype=self._dtype, + is_bias=False) self.linear_bias = self.create_parameter(shape=[embed_dim], - attr=self._bias_attr, + attr=linear_bias_attr, dtype=self._dtype, is_bias=True) - self.pre_ln_scale = self.create_parameter( - attr=self._weight_attr, - shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.pre_ln_bias = self.create_parameter(attr=self._bias_attr, + # tensor model parallel + if nranks > 1: + assert ring_id != -1 + # column parallel + _set_var_distributed(self.qkv_weight) + _set_var_distributed(self.qkv_bias) + # row parallel + _set_var_distributed(self.linear_weight) + + if normalize_before: + self.pre_ln_scale = self.create_parameter( + attr=pre_ln_scale_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, + shape=[embed_dim], + is_bias=True) + self.ln_scale = None + self.ln_bias = None + else: + self.pre_ln_scale = None + self.pre_ln_bias = None + self.ln_scale = self.create_parameter( + attr=ln_scale_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + self.ln_bias = self.create_parameter(attr=ln_bias_attr, shape=[embed_dim], is_bias=True) - self.ln_scale = self.create_parameter( - attr=self._weight_attr, - shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.ln_bias = self.create_parameter(attr=self._bias_attr, - shape=[embed_dim], - is_bias=True) self.dropout_rate = dropout_rate self.attn_dropout_rate = attn_dropout_rate @@ -294,8 +345,6 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): # Support bool or int mask attn_mask = _convert_attention_mask(attn_mask, query.dtype) - assert cache == 
None, "Only support cache is None now." - out = incubate_f.fused_multi_head_attention( x=query, qkv_weight=self.qkv_weight, @@ -308,11 +357,13 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): pre_ln_epsilon=self._epsilon, qkv_bias=self.qkv_bias, linear_bias=self.linear_bias, + cache_kv=cache, attn_mask=attn_mask, dropout_rate=self.dropout_rate, attn_dropout_rate=self.attn_dropout_rate, ln_epsilon=self._epsilon, training=self.training, + ring_id=self._ring_id, name=self.name) return out @@ -338,14 +389,38 @@ class FusedFeedForward(Layer): If None, use the value of `dropout_rate`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into, preprocessing or postprocessing. Default False - weight_attr (ParamAttr, optional): The attribute for the learnable weight of this layer. - The default value is None and the weight will be initialized to zero. For detailed - information, please refer to paddle.ParamAttr. - bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias of thi layer. - If it is set to False, no bias will be added to the output. If it is set to None or one - kind of ParamAttr, a bias parameter will be created according to ParamAttr. For detailed - information, please refer to paddle.ParamAttr. The default value is None and the bias - will be initialized to zero. + linear1_weight_attr(ParamAttr, optional): To specify the weight parameter property + for FFN first linear. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + linear1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for FFN first linear. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + linear2_weight_attr(ParamAttr, optional): To specify the weight parameter property + for FFN second linear. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + linear2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for FFN second linear. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + ln1_scale_attr(ParamAttr, optional): To specify the weight parameter property + for FFN pre_layer_norm. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ln1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for FFN pre_layer_norm. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + ln2_scale_attr(ParamAttr, optional): To specify the weight parameter property + for FFN post_layer_norm. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ln2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for FFN layer_norm. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. 
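Note: FusedFeedForward adopts the same scheme as the attention layer: per-parameter attributes plus nranks and ring_id for tensor model parallelism (when nranks > 1, dim_feedforward is divided across ranks, as the __init__ further below shows). A construction sketch under the same assumptions:

.. code-block:: python

    import paddle
    from paddle.incubate.nn import FusedFeedForward

    ffn = FusedFeedForward(
        d_model=128,
        dim_feedforward=512,
        linear1_weight_attr=paddle.ParamAttr(),
        linear2_weight_attr=paddle.ParamAttr(),
        nranks=1,    # >1 splits dim_feedforward across the tensor-parallel group
        ring_id=-1)  # NCCL ring id of that group; -1 means no parallelism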
See usage for details in :code:`ParamAttr`. + nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel. + ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. Examples: .. code-block:: python @@ -369,8 +444,16 @@ def __init__(self, activation="relu", act_dropout_rate=None, normalize_before=False, - weight_attr=None, - bias_attr=None, + linear1_weight_attr=None, + linear1_bias_attr=None, + linear2_weight_attr=None, + linear2_bias_attr=None, + ln1_scale_attr=None, + ln1_bias_attr=None, + ln2_scale_attr=None, + ln2_bias_attr=None, + nranks=1, + ring_id=-1, name=None): super(FusedFeedForward, self).__init__() @@ -383,51 +466,68 @@ def __init__(self, self._dtype = self._helper.get_default_dtype() self._d_model = d_model + + assert dim_feedforward % nranks == 0 + dim_feedforward = dim_feedforward // nranks self._dim_feedforward = dim_feedforward self._dropout_rate = dropout_rate self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self._act_method = activation self._normalize_before = normalize_before self._epsilon = epsilon + self._ring_id = ring_id self._linear1_weight = self.create_parameter( shape=[d_model, dim_feedforward], - attr=weight_attr, + attr=linear1_weight_attr, dtype=self._dtype, is_bias=False) self._linear1_bias = self.create_parameter(shape=[dim_feedforward], - attr=bias_attr, + attr=linear1_bias_attr, dtype=self._dtype, is_bias=True) self._linear2_weight = self.create_parameter( shape=[dim_feedforward, d_model], - attr=weight_attr, + attr=linear2_weight_attr, dtype=self._dtype, is_bias=False) self._linear2_bias = self.create_parameter(shape=[d_model], - attr=bias_attr, + attr=linear2_bias_attr, dtype=self._dtype, is_bias=True) - self._ln1_scale = self.create_parameter( - shape=[d_model], - attr=None, - is_bias=False, - default_initializer=Constant(1.0)) - self._ln1_bias = self.create_parameter(shape=[d_model], - attr=None, - is_bias=True) - - self._ln2_scale = self.create_parameter( - shape=[d_model], - attr=None, - is_bias=False, - default_initializer=Constant(1.0)) - self._ln2_bias = self.create_parameter(shape=[d_model], - attr=None, - is_bias=True) + if nranks > 1: + assert ring_id != -1 + # column parallel + _set_var_distributed(self._linear1_weight) + _set_var_distributed(self._linear1_bias) + _set_var_distributed(self._linear2_weight) + + if normalize_before: + self._ln1_scale = self.create_parameter( + shape=[d_model], + attr=ln1_scale_attr, + is_bias=False, + default_initializer=Constant(1.0)) + self._ln1_bias = self.create_parameter(shape=[d_model], + attr=ln1_bias_attr, + is_bias=True) + self._ln2_scale = None + self._ln2_bias = None + else: + self._ln1_scale = None + self._ln1_bias = None + self._ln2_scale = self.create_parameter( + shape=[d_model], + attr=ln2_scale_attr, + is_bias=False, + default_initializer=Constant(1.0)) + self._ln2_bias = self.create_parameter(shape=[d_model], + attr=ln2_bias_attr, + is_bias=True) + self.name = name def forward(self, src, cache=None): @@ -448,6 +548,7 @@ def forward(self, src, cache=None): ln2_epsilon=self._epsilon, pre_layer_norm=self._normalize_before, training=self.training, + ring_id=self._ring_id, name=self.name) return out @@ -553,8 +654,14 @@ def __init__(self, dropout_rate=dropout_rate, 
attn_dropout_rate=attn_dropout_rate, normalize_before=self.normalize_before, - weight_attr=weight_attrs[0], - bias_attr=bias_attrs[0]) + qkv_weight_attr=weight_attrs[0], + qkv_bias_attr=bias_attrs[0], + linear_weight_attr=weight_attrs[0], + linear_bias_attr=bias_attrs[0], + pre_ln_scale_attr=weight_attrs[0], + pre_ln_bias_attr=bias_attrs[0], + ln_scale_attr=weight_attrs[0], + ln_bias_attr=bias_attrs[0]) self.ffn = FusedFeedForward(d_model, dim_feedforward, @@ -562,8 +669,10 @@ def __init__(self, activation=activation, act_dropout_rate=act_dropout_rate, normalize_before=self.normalize_before, - weight_attr=weight_attrs[1], - bias_attr=bias_attrs[1]) + linear1_weight_attr=weight_attrs[1], + linear1_bias_attr=bias_attrs[1], + linear2_weight_attr=weight_attrs[1], + linear2_bias_attr=bias_attrs[1]) def forward(self, src, src_mask=None, cache=None): """ diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index 292eaff362b40..66818dab451a3 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -22,11 +22,11 @@ class Constant(ConstantInitializer): """Implement the constant initializer. Args: - value (float32): constant value to initialize the parameter + value (float32|float64, optional): constant value to initialize the parameter. Default: 0.0. Examples: .. code-block:: python - + :name: code-example1 import paddle import paddle.nn as nn diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index b8ed7febb6bc7..456496571924e 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -36,7 +36,7 @@ class KaimingNormal(MSRAInitializer): \sqrt{\frac{2.0}{fan\_in}} Args: - fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\ + fan_in (float32|None, optional): fan_in for Kaiming normal Initializer. If None, it is inferred from the variable. default is None. Note: @@ -44,7 +44,7 @@ class KaimingNormal(MSRAInitializer): Examples: .. code-block:: python - + :name: code-example1 import paddle import paddle.nn as nn @@ -79,7 +79,7 @@ class KaimingUniform(MSRAInitializer): x = \sqrt{\frac{6.0}{fan\_in}} Args: - fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\ + fan_in (float32|None, optional): fan_in for Kaiming uniform Initializer. If None, it is inferred from the variable. default is None. Note: @@ -87,7 +87,7 @@ class KaimingUniform(MSRAInitializer): Examples: .. 
code-block:: python - + :name: code-example1 import paddle import paddle.nn as nn diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index f724f7cfee52c..4ef987eccf2a4 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -76,7 +76,7 @@ def __init__(self, format(valid_padding_modes, padding_mode)) if padding_mode in {'reflect', 'replicate', 'circular' - } and not isinstance(padding, np.int): + } and not isinstance(padding, int): raise TypeError( "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int" ) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 521839af902b5..9971a4d5a3e18 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -352,7 +352,7 @@ def _handle_dtype(data, dtype): data = np.array([data]) elif isinstance(data, (list, tuple)): data = np.array(data) - if data.dtype == np.object: + if data.dtype == np.object_: raise ValueError( "\n\tFaild to convert input data to a regular ndarray :\n\t - Usually " "this means the input data contains nested lists with different lengths. " @@ -962,7 +962,7 @@ def tril(x, diagonal=0, name=None): def triu(x, diagonal=0, name=None): r""" - This op returns the upper triangular part of a matrix (2-D tensor) or batch of matrices + Return the upper triangular part of a matrix (2-D tensor) or batch of matrices :attr:`x`, the other elements of the result tensor are set to 0. The upper triangular part of the matrix is defined as the elements on and above the diagonal. diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 0cdced2cf9b84..34a1ead2cb497 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -807,9 +807,9 @@ def gen_einsum_op(equation, *operands): if _in_legacy_dygraph(): # dygraph - return _C_ops.einsum(operands, len(operands), 'equation', equation)[0] + return _C_ops.einsum(operands, len(operands), len(operands), 'equation', + equation)[0] - # static graph for inp in operands: check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') check_type(equation, 'equation', str, 'einsum') @@ -821,11 +821,16 @@ def gen_einsum_op(equation, *operands): helper.create_variable_for_type_inference(dtype=operands[0].dtype) for i in range(len(operands)) ] + xshape = [ + helper.create_variable_for_type_inference(dtype=operands[0].dtype) + for i in range(len(operands)) + ] helper.append_op(type='einsum', inputs={'Operands': operands}, outputs={ 'Out': out, - "InnerCache": caches + "InnerCache": caches, + "XShape": xshape }, attrs=attrs) return out diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 0089ef21dc98a..137c85ac98938 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -430,7 +430,7 @@ def inf_norm(input, reduce_all = True if axis == None or axis == [] or asvector == True else False axis = axis if axis != None and axis != [] else [0] - reduce_type = 'reduce_max' if porder == np.float( + reduce_type = 'reduce_max' if porder == np.float64( 'inf') else 'reduce_min' helper.append_op(type=reduce_type, inputs={'X': out}, diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index c4b4c552c670d..8834ae1d400f1 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -146,8 +146,8 @@ def logical_or(x, y, out=None, name=None): import paddle import numpy as np - x_data = np.array([True, False], dtype=np.bool).reshape(2, 1) - 
y_data = np.array([True, False, True, False], dtype=np.bool).reshape(2, 2) + x_data = np.array([True, False], dtype=np.bool_).reshape(2, 1) + y_data = np.array([True, False, True, False], dtype=np.bool_).reshape(2, 2) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_or(x, y) @@ -191,8 +191,8 @@ def logical_xor(x, y, out=None, name=None): import paddle import numpy as np - x_data = np.array([True, False], dtype=np.bool).reshape([2, 1]) - y_data = np.array([True, False, True, False], dtype=np.bool).reshape([2, 2]) + x_data = np.array([True, False], dtype=np.bool_).reshape([2, 1]) + y_data = np.array([True, False, True, False], dtype=np.bool_).reshape([2, 2]) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_xor(x, y) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 96d24a7f915ee..c445402412e16 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1887,7 +1887,7 @@ def _get_SectionsTensorList(one_list): def squeeze(x, axis=None, name=None): """ - This OP will squeeze the dimension(s) of size 1 of input tensor x's shape. + Squeeze the dimension(s) of size 1 of input tensor x's shape. Note that the output Tensor will share data with origin Tensor and doesn't have a Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, @@ -1944,7 +1944,7 @@ def squeeze(x, axis=None, name=None): Examples: .. code-block:: python - + :name: code-example1 import paddle x = paddle.rand([5, 1, 10]) @@ -2139,13 +2139,13 @@ def unique(x, :ref:`api_guide_Name`. Default: None. Returns: - tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \ + tuple (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \ provided only if `return_index` is True. `inverse` is provided only if `return_inverse` \ is True. `counts` is provided only if `return_counts` is True. Examples: .. code-block:: python - + :name: code-example1 import paddle x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index ffca233ff16bf..1cb350f4d7288 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1319,7 +1319,7 @@ def nanmean(x, axis=None, keepdim=False, name=None): @templatedoc(op_type="sum") def add_n(inputs, name=None): """ - This OP is used to sum one or more Tensor of the input. + Sum one or more Tensor of the input. For example: @@ -1365,7 +1365,7 @@ def add_n(inputs, name=None): Examples: .. code-block:: python - + :name: code-example1 import paddle input0 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype='float32') diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index f43bda1129589..990b20a26772c 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -631,13 +631,13 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): If ``high`` is None (the default), the range is [0, ``low``). Args: - low (int): The lower bound on the range of random values to generate. + low (int, optional): The lower bound on the range of random values to generate. The ``low`` is included in the range. If ``high`` is None, the range is [0, ``low``). Default is 0. high (int, optional): The upper bound on the range of random values to generate, the ``high`` is excluded in the range. Default is None (see above for behavior if high = None). Default is None. 
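Note: the reworded randint docstring marks low and shape as optional without changing the semantics: when high is None, the single bound is treated as the exclusive upper limit. For example:

.. code-block:: python

    import paddle

    a = paddle.randint(10, shape=[2, 3])           # values drawn from [0, 10)
    b = paddle.randint(low=2, high=6, shape=[4])   # values drawn from [2, 6)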
- shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor, optional): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 94a05294aaa63..f46b53a3b71f4 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -207,7 +207,7 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): """ - This OP computes the indices of the min elements of the input tensor's + Computing the indices of the min elements of the input tensor's element along the provided axis. Args: @@ -217,7 +217,7 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): is [-R, R), where R is x.ndim. when axis < 0, it works the same way as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. keepdim(bool, optional): Whether to keep the given axis in output. If it is True, the dimensions will be same as input x and with size one in the axis. Otherwise the output dimentions is one fewer than x since the axis is squeezed. Default is False. - dtype(str): Data type of the output tensor which can + dtype(str, optional): Data type of the output tensor which can be int32, int64. The default value is 'int64', and it will return the int64 indices. name(str, optional): The default value is None. Normally there is no @@ -225,11 +225,11 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): refer to :ref:`api_guide_Name`. Returns: - Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64` + Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64`. Examples: .. code-block:: python - + :name: code-example1 import paddle x = paddle.to_tensor([[5,8,9,5], @@ -834,7 +834,7 @@ def masked_select(x, mask, name=None): def topk(x, k, axis=None, largest=True, sorted=True, name=None): """ - This OP is used to find values and indices of the k largest or smallest at the optional axis. + Return values and indices of the k largest or smallest at the optional axis. If the input is a 1-D Tensor, finds the k largest or smallest values and indices. If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`. @@ -856,35 +856,27 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): Examples: .. 
code-block:: python + :name: code-example1 + import paddle - import paddle + data_1 = paddle.to_tensor([1, 4, 5, 7]) + value_1, indices_1 = paddle.topk(data_1, k=1) + print(value_1) # [7] + print(indices_1) # [3] + + data_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) + value_2, indices_2 = paddle.topk(data_2, k=1) + print(value_2) # [[7], [6]] + print(indices_2) # [[3], [1]] + + value_3, indices_3 = paddle.topk(data_2, k=1, axis=-1) + print(value_3) # [[7], [6]] + print(indices_3) # [[3], [1]] + + value_4, indices_4 = paddle.topk(data_2, k=1, axis=0) + print(value_4) # [[2, 6, 5, 7]] + print(indices_4) # [[1, 1, 0, 0]] - tensor_1 = paddle.to_tensor([1, 4, 5, 7]) - value_1, indices_1 = paddle.topk(tensor_1, k=1) - print(value_1) - # [7] - print(indices_1) - # [3] - tensor_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) - value_2, indices_2 = paddle.topk(tensor_2, k=1) - print(value_2) - # [[7] - # [6]] - print(indices_2) - # [[3] - # [1]] - value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1) - print(value_3) - # [[7] - # [6]] - print(indices_3) - # [[3] - # [1]] - value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0) - print(value_4) - # [[2 6 5 7]] - print(indices_4) - # [[1 1 0 0]] """ diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 2b8cff3543e76..58c9ea6e5d2e8 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -603,7 +603,7 @@ - api : einsum args : (Tensor[] x, str equation) - output : Tensor, Tensor[]{x.size()} + output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()} infer_meta : func : EinsumInferMeta param : [x, equation] diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 8e20b05110e71..2cdf22beeed96 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,3 +1,14 @@ +#- backward_api : einsum_grad + + #forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache) + #args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation) + #output : Tensor[](x_grad){x.size()} + #infer_meta : + #func : UnchangedMultiInferMeta + #param : [x] + #kernel : + #func : einsum_grad + - backward_api : abs_double_grad forward : abs_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) args : (Tensor x, Tensor grad_x_grad) @@ -616,12 +627,12 @@ skip_transform : out_w, out_w_grad - backward_api : einsum_grad - forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache) - args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation) + forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache), Tensor[](x_shape) + args : (Tensor[] x_shape, Tensor[] inner_cache, Tensor out_grad, str equation) output : Tensor[](x_grad){x.size()} infer_meta : func : UnchangedMultiInferMeta - param : [x] + param : [x_shape] kernel : func : einsum_grad diff --git a/tools/codestyle/pre_commit.sh b/tools/codestyle/pre_commit.sh new file mode 100755 index 0000000000000..7ea8a1658da26 --- /dev/null +++ b/tools/codestyle/pre_commit.sh @@ -0,0 +1,44 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
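Note: the api.yaml and backward.yaml changes above give einsum an extra XShape output, and einsum_grad now takes x_shape in place of the original operands; the user-facing Python API is unchanged. For reference:

.. code-block:: python

    import paddle

    x = paddle.randn([2, 3])
    y = paddle.randn([3, 4])
    out = paddle.einsum('ij,jk->ik', x, y)   # equivalent to paddle.matmul(x, y)
    print(out.shape)  # [2, 4]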
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set +x + +# use pre-commit 2.17 +if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then + pip install pre-commit==2.17.0 1>nul +fi + +diff_files=$(git diff --numstat ${BRANCH} | awk '{print $NF}') +echo -e "diff files between pr and ${BRANCH}:\n${diff_files}" + +echo "Checking code style by pre-commit ..." +pre-commit run --files ${diff_files};check_error=$? + +if test ! -z "$(git diff)"; then + echo -e '\n************************************************************************************' + echo -e "These files have been formated by code format hook. You should use pre-commit to \ +format them before git push." + echo -e '************************************************************************************\n' + git diff 2>&1 +fi + +echo -e '\n***********************************' +if [ ${check_error} != 0 ];then + echo "Your PR code style check failed." +else + echo "Your PR code style check passed." +fi +echo -e '***********************************\n' + +exit ${check_error} diff --git a/tools/infer_prune_patches/analysis_predictor.cc.patch b/tools/infer_prune_patches/analysis_predictor.cc.patch new file mode 100644 index 0000000000000..21fa24dd3d957 --- /dev/null +++ b/tools/infer_prune_patches/analysis_predictor.cc.patch @@ -0,0 +1,31 @@ +diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc +index 0645af611b..6b05a7fffb 100644 +--- a/paddle/fluid/inference/api/analysis_predictor.cc ++++ b/paddle/fluid/inference/api/analysis_predictor.cc +@@ -1923,7 +1923,7 @@ USE_TRT_CONVERTER(shuffle_channel); + USE_TRT_CONVERTER(swish); + USE_TRT_CONVERTER(group_norm); + USE_TRT_CONVERTER(instance_norm); +-USE_TRT_CONVERTER(layer_norm); ++//USE_TRT_CONVERTER(layer_norm); + USE_TRT_CONVERTER(gelu); + USE_TRT_CONVERTER(multihead_matmul); + USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm); +@@ -1933,13 +1933,13 @@ USE_TRT_CONVERTER(scale); + USE_TRT_CONVERTER(stack); + USE_TRT_CONVERTER(clip); + USE_TRT_CONVERTER(gather); +-USE_TRT_CONVERTER(anchor_generator); ++//USE_TRT_CONVERTER(anchor_generator); + USE_TRT_CONVERTER(yolo_box); + USE_TRT_CONVERTER(yolo_box_head); + USE_TRT_CONVERTER(arg_max); +-USE_TRT_CONVERTER(roi_align); +-USE_TRT_CONVERTER(affine_channel); +-USE_TRT_CONVERTER(multiclass_nms); ++//USE_TRT_CONVERTER(roi_align); ++//USE_TRT_CONVERTER(affine_channel); ++//USE_TRT_CONVERTER(multiclass_nms); + USE_TRT_CONVERTER(multiclass_nms3); + USE_TRT_CONVERTER(nearest_interp); + USE_TRT_CONVERTER(nearest_interp_v2); diff --git a/tools/infer_prune_patches/analyzer.cc.patch b/tools/infer_prune_patches/analyzer.cc.patch new file mode 100644 index 0000000000000..59a7b4d6b8c2e --- /dev/null +++ b/tools/infer_prune_patches/analyzer.cc.patch @@ -0,0 +1,14 @@ +diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc +index be7d6ab868..498e09cb4d 100644 +--- a/paddle/fluid/inference/analysis/analyzer.cc ++++ b/paddle/fluid/inference/analysis/analyzer.cc +@@ -32,6 +32,9 @@ void Analyzer::RunAnalysis(Argument *argument) { + "analsis_passes is not valid in the argument.")); + const bool 
disable_logs = argument->disable_logs(); + for (auto &pass : argument->analysis_passes()) { ++ if (pass == "ir_params_sync_among_devices_pass") { ++ continue; ++ } + if (!disable_logs) { + string::PrettyLogH1("--- Running analysis [%s]", pass); + } diff --git a/tools/infer_prune_patches/device_context.cc.patch b/tools/infer_prune_patches/device_context.cc.patch new file mode 100644 index 0000000000000..75be9a0d1d997 --- /dev/null +++ b/tools/infer_prune_patches/device_context.cc.patch @@ -0,0 +1,46 @@ +diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc +index 904e4854ba..4f8c955d8c 100644 +--- a/paddle/fluid/platform/device_context.cc ++++ b/paddle/fluid/platform/device_context.cc +@@ -466,15 +466,15 @@ CUDAContext::CUDAContext(const CUDAPlace& place, + place_ = place; + CUDADeviceGuard guard(place_.device); + stream_.reset(new stream::CUDAStream(place, priority, flag)); +- InitEigenContext(); +- InitCuBlasContext(); +- InitCuDNNContext(); ++ //InitEigenContext(); ++ //InitCuBlasContext(); ++ //InitCuDNNContext(); + #ifndef PADDLE_WITH_HIP + #if CUDA_VERSION >= 11060 +- InitCuBlasLtContext(); ++ //InitCuBlasLtContext(); + #endif +- InitCuSparseContext(); +- InitCuSolverContext(); ++ //InitCuSparseContext(); ++ //InitCuSolverContext(); + #endif + } + +@@ -506,14 +506,14 @@ void CUDAContext::SetStream(gpuStream_t stream) { + + CUDAContext::~CUDAContext() { + CUDADeviceGuard guard(place_.device); +- DestoryCuDNNContext(); +- DestoryCuBlasContext(); ++ //DestoryCuDNNContext(); ++ //DestoryCuBlasContext(); + #ifndef PADDLE_WITH_HIP + #if CUDA_VERSION >= 11060 +- InitCuBlasLtContext(); ++ //InitCuBlasLtContext(); + #endif +- DestoryCuSparseContext(); +- DestoryCuSolverContext(); ++ //DestoryCuSparseContext(); ++ //DestoryCuSolverContext(); + #endif + } + diff --git a/tools/infer_prune_patches/jitcode.h.patch b/tools/infer_prune_patches/jitcode.h.patch new file mode 100644 index 0000000000000..9022b459db51c --- /dev/null +++ b/tools/infer_prune_patches/jitcode.h.patch @@ -0,0 +1,15 @@ +diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h +index 23650c8efc..24466e4327 100644 +--- a/paddle/fluid/operators/jit/gen/jitcode.h ++++ b/paddle/fluid/operators/jit/gen/jitcode.h +@@ -97,8 +97,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { + } + ret(); + } +- void L(const char* label) { Xbyak::CodeGenerator::L(label); } +- void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } // NOLINT ++ void L(const char* label) { } ++ void L(Xbyak::Label& label) { } // NOLINT + // Enhanced vector extension + Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt, + bool bcast = false) { diff --git a/tools/infer_prune_patches/op_registry.h.patch b/tools/infer_prune_patches/op_registry.h.patch new file mode 100644 index 0000000000000..a1d2a66347cc4 --- /dev/null +++ b/tools/infer_prune_patches/op_registry.h.patch @@ -0,0 +1,215 @@ +diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h +index a1f07f9f25..179df3b981 100644 +--- a/paddle/fluid/framework/op_registry.h ++++ b/paddle/fluid/framework/op_registry.h +@@ -178,9 +178,8 @@ struct OpKernelRegistrarFunctor { + RegisterKernelClass( + op_type, library_type, customized_type_value, + +- [op_type](const framework::ExecutionContext& ctx) { ++ [](const framework::ExecutionContext& ctx) { + KERNEL_TYPE().Compute(ctx); +- CheckKernelLaunch(op_type); + }); + constexpr auto size = std::tuple_size>::value; + 
OpKernelRegistrarFunctor +@@ -240,13 +239,8 @@ struct OpKernelRegistrarFunctorEx( +- op_type, library_type, customized_type_value, +- +- [op_type](const framework::ExecutionContext& ctx) { +- Functor()(ctx); +- CheckKernelLaunch(op_type); +- }); ++ RegisterKernelClass(op_type, library_type, ++ customized_type_value, Functor()); + + constexpr auto size = + std::tuple_size>::value; +@@ -275,7 +269,7 @@ struct OpKernelRegistrarFunctorEx, \ + paddle::framework::EmptyGradOpMaker) + ++#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, ...) + /** + * Macro to register OperatorKernel. + */ + #define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \ ++ place_class, customized_name, \ ++ customized_type_value, ...) ++ ++#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE__(op_type, library_type, \ + place_class, customized_name, \ + customized_type_value, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ +@@ -311,18 +312,22 @@ struct OpKernelRegistrarFunctorEx CreateTensorRTPredictor( ++ const AnalysisConfig& config); ++} ++ + namespace paddle_infer { + + using PrecisionType = paddle::AnalysisConfig::Precision; diff --git a/tools/infer_prune_patches/phi_cmake.patch b/tools/infer_prune_patches/phi_cmake.patch new file mode 100644 index 0000000000000..2eba0e0c14c6a --- /dev/null +++ b/tools/infer_prune_patches/phi_cmake.patch @@ -0,0 +1,13 @@ +diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt +index 58ad42ddd1..8ffdafcf0d 100644 +--- a/paddle/phi/CMakeLists.txt ++++ b/paddle/phi/CMakeLists.txt +@@ -18,7 +18,7 @@ add_subdirectory(infermeta) + # phi operator definitions + add_subdirectory(ops) + # phi tools +-add_subdirectory(tools) ++#add_subdirectory(tools) + # phi tests + add_subdirectory(tests) + diff --git a/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch new file mode 100644 index 0000000000000..307f12ee97182 --- /dev/null +++ b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch @@ -0,0 +1,68 @@ +diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +index 394ce7799e..8edbef50be 100644 +--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc ++++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +@@ -390,6 +390,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)); + ++ std::unordered_set param_set(params.begin(), params.end()); + if (use_static_engine) { + trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); +@@ -399,6 +400,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + LOG(INFO) << "Load TRT Optimized Info from " + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); ++ const auto* root_scope{param_scope()}; ++ for (;root_scope->parent();) { ++ root_scope = root_scope->parent(); ++ } ++ for (const auto& name: param_set) { ++ LOG(INFO) << " ===== Clear param: " << name; ++ root_scope->FindLocalVar(name)->Clear(); ++ } ++ for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount(); ++ ++dev_id) { ++ memory::Release(platform::CUDAPlace(dev_id)); ++ } ++ memory::Release(platform::CPUPlace()); + return; + } + } +@@ -411,12 +425,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); +- 
std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine); ++ const auto* root_scope{scope}; ++ for (;root_scope->parent();) { ++ root_scope = root_scope->parent(); ++ } ++ VLOG(4) << "root_scope->LocalVarNames().size: " << root_scope->LocalVarNames().size(); ++ for (const auto& name: param_set) { ++ VLOG(4) << " ===== Clear param: " << name; ++ root_scope->FindLocalVar(name)->Clear(); ++ } ++ for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount(); ++ ++dev_id) { ++ memory::Release(platform::CUDAPlace(dev_id)); ++ } ++ memory::Release(platform::CPUPlace()); + + if (use_static_engine) { + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); +@@ -431,6 +458,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); + } ++ trt_engine_serialized_data.clear(); ++ trt_engine_serialized_data.shrink_to_fit(); + } + + } // namespace analysis diff --git a/tools/infer_prune_patches/thread_local_allocator.cc.patch b/tools/infer_prune_patches/thread_local_allocator.cc.patch new file mode 100644 index 0000000000000..6a4486aae9457 --- /dev/null +++ b/tools/infer_prune_patches/thread_local_allocator.cc.patch @@ -0,0 +1,95 @@ +diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc +index f125670a59..f858a30301 100644 +--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc ++++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc +@@ -13,18 +13,62 @@ + // limitations under the License. + + #include "paddle/fluid/memory/allocation/thread_local_allocator.h" ++#include "paddle/fluid/platform/cuda_device_guard.h" + + namespace paddle { + namespace memory { + namespace allocation { + ++const int MALLOC_ALIGN = 64; ++ ++#define CUDA_CALL(func) \ ++ { \ ++ auto e = (func); \ ++ CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ ++ << "CUDA: " << cudaGetErrorString(e); \ ++ } ++ ++void* DirectAllocator::Alloc(size_t unaligned_size) { ++ if (platform::is_cpu_place(place_)) { ++ size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; ++ char* p = static_cast(std::malloc(offset + unaligned_size)); ++ // Memory checking ++ CHECK(p) << "Error occurred in malloc period: available space is not enough " ++ "for mallocing " ++ << unaligned_size << " bytes."; ++ // Byte alignment ++ void* r = reinterpret_cast(reinterpret_cast(p + offset) & ++ (~(MALLOC_ALIGN - 1))); ++ static_cast(r)[-1] = p; ++ return r; ++ } else if (platform::is_gpu_place(place_)) { ++ int dev_id = place_.GetDeviceId(); ++ platform::CUDADeviceGuard guard(dev_id); ++ void* ptr{}; ++ CUDA_CALL(cudaMalloc(&ptr, unaligned_size)); ++ return ptr; ++ } ++ return nullptr; ++} ++ ++void DirectAllocator::Free(void* ptr) { ++ if (platform::is_cpu_place(place_)) { ++ if (ptr) { ++ std::free(static_cast(ptr)[-1]); ++ } ++ } else if (platform::is_gpu_place(place_)) { ++ int dev_id = place_.GetDeviceId(); ++ platform::CUDADeviceGuard guard(dev_id); ++ CUDA_CALL(cudaFree(ptr)); ++ } ++} ++ ++ ++ + ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p) + : place_(p) { + if (platform::is_gpu_place(place_)) { +- buddy_allocator_.reset(new memory::detail::BuddyAllocator( +- std::unique_ptr( +- new memory::detail::GPUAllocator(place_.device)), +- platform::GpuMinChunkSize(), 
platform::GpuMaxChunkSize())); ++ direct_allocator_.reset(new DirectAllocator{place_}); + } else { + PADDLE_THROW(platform::errors::Unavailable( + "Thread local allocator only supports CUDAPlace now.")); +@@ -59,7 +103,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool() + + ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) { + VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size; +- void* ptr = buddy_allocator_->Alloc(size); ++ void* ptr = direct_allocator_->Alloc(size); + auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_); + tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this()); + return tl_allocation; +@@ -67,12 +111,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) { + + void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) { + VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation; +- buddy_allocator_->Free(allocation->ptr()); ++ direct_allocator_->Free(allocation->ptr()); + delete allocation; + } + + uint64_t ThreadLocalAllocatorImpl::ReleaseImpl() { +- return buddy_allocator_->Release(); ++ return direct_allocator_->Release(); + } + + } // namespace allocation diff --git a/tools/infer_prune_patches/thread_local_allocator.h.patch b/tools/infer_prune_patches/thread_local_allocator.h.patch new file mode 100644 index 0000000000000..a3c24178d2093 --- /dev/null +++ b/tools/infer_prune_patches/thread_local_allocator.h.patch @@ -0,0 +1,30 @@ +diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h +index 654fb3fe7b..44c5dbf87f 100644 +--- a/paddle/fluid/memory/allocation/thread_local_allocator.h ++++ b/paddle/fluid/memory/allocation/thread_local_allocator.h +@@ -26,6 +26,16 @@ namespace paddle { + namespace memory { + namespace allocation { + ++class DirectAllocator { ++public: ++ DirectAllocator(const platform::Place& place) : place_{place} {} ++ void* Alloc(size_t unaligned_size); ++ void Free(void* ptr); ++ uint64_t Release() { return 0;} ++private: ++ platform::Place place_; ++}; ++ + class ThreadLocalAllocatorImpl; + + class ThreadLocalAllocation : public Allocation { +@@ -55,7 +65,7 @@ class ThreadLocalAllocatorImpl + uint64_t ReleaseImpl(); + + private: +- std::unique_ptr buddy_allocator_; ++ std::unique_ptr direct_allocator_; + platform::Place place_; + }; + diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py new file mode 100644 index 0000000000000..d53b21d6c3723 --- /dev/null +++ b/tools/prune_for_jetson.py @@ -0,0 +1,190 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script simply removes all grad ops and kernels. You should use this script +when cmake ON_INFER=ON, which can greatly reduce the volume of the prediction library. 
+""" + +import os +import sys +import re +import glob +import io + + +def find_type_files(cur_dir, file_type, file_list=[]): + next_level_dirs = os.listdir(cur_dir) + for next_level_name in next_level_dirs: + next_level_dir = os.path.join(cur_dir, next_level_name) + if os.path.isfile(next_level_dir): + if os.path.splitext(next_level_dir)[1] == file_type: + file_list.append(next_level_dir) + elif os.path.isdir(next_level_dir): + find_type_files(next_level_dir, file_type, file_list) + return file_list + + +def find_kernel(content, pattern): + res = re.findall(pattern, content, flags=re.DOTALL) + ret = [] + for p in res: + left, right = 0, 0 + for c in p: + if c == '{': + left += 1 + elif c == '}': + right += 1 + + if left == right: + ret.append(p) + + return ret, len(ret) + + +def prune_phi_kernels(): + tool_dir = os.path.dirname(os.path.abspath(__file__)) + if sys.version_info[0] == 3: + all_op = glob.glob(os.path.join(tool_dir, + '../paddle/phi/kernels/**/*.cc'), + recursive=True) + all_op += glob.glob(os.path.join(tool_dir, + '../paddle/phi/kernels/**/*.cu'), + recursive=True) + elif sys.version_info[0] == 2: + all_op = find_type_files( + os.path.join(tool_dir, '../paddle/phi/kernels/'), '.cc') + all_op = find_type_files( + os.path.join(tool_dir, '../paddle/phi/kernels/'), '.cu', all_op) + + register_op_count = 0 + for op_file in all_op: + need_continue = False + file_blacklist = [ + "kernels/empty_kernel.cc", "/cast_kernel.c", "/batch_norm_kernel.c" + ] + for bname in file_blacklist: + if op_file.find(bname) >= 0: + need_continue = True + break + + if need_continue: + print("continue:", op_file) + continue + + op_name = os.path.split(op_file)[1] + all_matches = [] + with io.open(op_file, 'r', encoding='utf-8') as f: + content = ''.join(f.readlines()) + op_pattern = 'PD_REGISTER_KERNEL\(.*?\).*?\{.*?\}' + op, op_count = find_kernel(content, op_pattern) + register_op_count += op_count + all_matches.extend(op) + + for p in all_matches: + content = content.replace(p, '') + + with io.open(op_file, 'w', encoding='utf-8') as f: + f.write(u'{}'.format(content)) + + print('We erase all grad op and kernel for Paddle-Inference lib.') + print('%50s%10s' % ('type', 'count')) + print('%50s%10s' % ('REGISTER_OPERATOR', register_op_count)) + return True + + +def apply_patches(): + work_path = os.path.dirname(os.path.abspath(__file__)) + "/../" + ret = os.system( + "cd %s && rm -f paddle/fluid/inference/api/tensorrt_predictor.* " + " && rm -f paddle/fluid/inference/api/paddle_tensorrt_predictor.h " + " && git apply tools/infer_prune_patches/*.patch && cd -" % work_path) + return ret == 0 + + +def append_fluid_kernels(): + op_white_list = ["load", "load_combine"] + + #1. 
add to makefile + file_name = os.path.dirname(os.path.abspath(__file__)) \ + + "/../paddle/fluid/inference/tensorrt/CMakeLists.txt" + append_str = "\nfile(APPEND ${pybind_file} \"USE_NO_KERNEL_OP__(tensorrt_engine);\\n\")\n" + for op in op_white_list: + append_str = append_str + "file(APPEND ${pybind_file} \"USE_OP__(%s);\\n\")\n" % op + + with io.open(file_name, 'r', encoding='utf-8') as f: + content = ''.join(f.readlines()) + + location_str = "nv_library(\n tensorrt_op_teller\n SRCS op_teller.cc\n DEPS framework_proto device_context boost)" + new_content = content.replace(location_str, location_str + append_str) + + if new_content == content: + print("ERROR: can not find \"%s\" in file \"%s\"" % + (location_str, file_name)) + return False + + with io.open(file_name, 'w', encoding='utf-8') as f: + f.write(u'{}'.format(new_content)) + + #2. add op and kernel register + op_white_list.append("tensorrt_engine") + tool_dir = os.path.dirname(os.path.abspath(__file__)) + if sys.version_info[0] == 3: + all_op = glob.glob(os.path.join(tool_dir, + '../paddle/fluid/operators/**/*.cc'), + recursive=True) + all_op += glob.glob(os.path.join(tool_dir, + '../paddle/fluid/operators/**/*.cu'), + recursive=True) + elif sys.version_info[0] == 2: + all_op = find_type_files( + os.path.join(tool_dir, '../paddle/fluid/operators/'), '.cc') + all_op = find_type_files( + os.path.join(tool_dir, '../paddle/fluid/operators/'), '.cu', all_op) + + for op_file in all_op: + with io.open(op_file, 'r', encoding='utf-8') as f: + content = ''.join(f.readlines()) + + for op in op_white_list: + patterns = { + "REGISTER_OPERATOR": "REGISTER_OPERATOR\(\s*%s\s*," % op, + "REGISTER_OP_CPU_KERNEL": + "REGISTER_OP_CPU_KERNEL\(\s*%s\s*," % op, + "REGISTER_OP_CUDA_KERNEL": + "REGISTER_OP_CUDA_KERNEL\(\s*%s\s*," % op + } + for k, p in patterns.items(): + matches = re.findall(p, content, flags=re.DOTALL) + if len(matches) > 0: + content = content.replace(matches[0], + matches[0].replace(k, k + "__")) + with io.open(op_file, 'w', encoding='utf-8') as f: + f.write(u'{}'.format(content)) + + return True + + +if __name__ == '__main__': + + print("================ step 1: apply patches =======================") + assert (apply_patches()) + print("==============================================================\n") + + print("================ step 2: append fluid op/kernels==============") + assert (append_fluid_kernels()) + print("==============================================================\n") + + print("================ step 3:prune phi kernels ====================") + assert (prune_phi_kernels()) + print("==============================================================\n") diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 44dc4eac26118..bedd44c06d506 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -85,6 +85,7 @@ disable_win_inference_api_test="^trt_quant_int8_yolov3_r50_test$|\ ^lite_resnet50_test$|\ ^test_trt_dynamic_shape_transformer_prune$|\ ^lite_mul_model_test$|\ +^trt_split_converter_test$|\ ^paddle_infer_api_copy_tensor_tester$" @@ -191,10 +192,6 @@ if [ -f "$PADDLE_ROOT/added_ut" ];then echo "========================================" exit 8; fi - if nvcc --version | grep 11.2; then - echo "Only test added_ut temporarily when running in CI-Windows-inference of CUDA 11.2." - exit 0; - fi fi set -e
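
Note (editor's sketch, not part of the patches above): the brace-balancing filter that prune_for_jetson.py applies in find_kernel() before erasing PD_REGISTER_KERNEL blocks can be exercised standalone; the toy_add registration string below is invented purely for illustration.

    import re

    def balanced_matches(content, pattern):
        # Mirror find_kernel(): keep only matches whose '{' and '}' counts agree,
        # so a half-captured kernel body is never deleted from the source file.
        return [m for m in re.findall(pattern, content, flags=re.DOTALL)
                if m.count('{') == m.count('}')]

    sample = "PD_REGISTER_KERNEL(toy_add, CPU, ALL_LAYOUT, phi::ToyAddKernel, float) {}"
    kernels = balanced_matches(sample, r'PD_REGISTER_KERNEL\(.*?\).*?\{.*?\}')
    print(len(kernels))  # 1 -> prune_phi_kernels() would strip this block from the .cc/.cu file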
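
Likewise, a small sketch of the macro renaming that append_fluid_kernels() performs for whitelisted ops such as load; the one-line REGISTER_OPERATOR string is made up, and the double-underscore variant is assumed (per op_registry.h.patch) to be the form that still expands to a real registration after pruning.

    import re

    content = "REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);"
    match = re.findall(r'REGISTER_OPERATOR\(\s*load\s*,', content, flags=re.DOTALL)[0]
    # Suffix the macro name with "__" for the whitelisted op, leaving all other ops pruned.
    content = content.replace(match, match.replace("REGISTER_OPERATOR", "REGISTER_OPERATOR__"))
    print(content)  # REGISTER_OPERATOR__(load, ...); paired with the USE_OP__(load) line appended to CMake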