diff --git a/.gitignore b/.gitignore
index b8a2e8fbce933..74cf6b8ab0230 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ paddle/fluid/API_PR.spec
 paddle/fluid/eager/api/generated/*
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
+paddle/fluid/operators/ops_extra_info.h
 paddle/phi/api/backward/backward_api.h
 paddle/phi/api/backward/sparse_bw_api.h
 paddle/phi/api/include/api.h
@@ -64,10 +65,9 @@ paddle/infrt/dialect/pd/common/pd_ops_info.h
 paddle/infrt/tests/dialect/Output
 paddle/infrt/tests/lit.cfg.py
 paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc
-paddle/fluid/pybind/eager_final_state_op_function_impl.h
-paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h
+paddle/fluid/pybind/eager_final_state_op_function.cc
 # these files (directories) are generated before build system generation
 paddle/fluid/operators/generated_op.cc
 paddle/phi/ops/compat/generated_sig.cc
-python/paddle/utils/code_gen/parsed_apis/
+paddle/phi/api/yaml/parsed_apis/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5c6bded87ce4a..77bf882a312f9 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:
     files: (?!.*third_party)^.*$ | (?!.*book)^.*$
   - id: end-of-file-fixer
   - id: sort-simple-yaml
-    files: (api|backward)\.yaml$
+    files: (api|backward|api_[a-z_]+)\.yaml$
 - repo: local
   hooks:
   - id: clang-format
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea4bc8a2d6c3e..78ebbccfb2e7a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -336,7 +336,12 @@ endif()
 if(LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER)
-  set(WITH_CUSTOM_DEVICE ON)
+  set(WITH_CUSTOM_DEVICE
+      ON
+      CACHE BOOL "Enable Custom Device when compiling for Linux" FORCE)
+  message(
+    "Enable Custom Device when compiling for Linux. Force WITH_CUSTOM_DEVICE=ON."
+  )
 endif()
 if(WIN32)
diff --git a/SECURITY.md b/SECURITY.md
index 79bf3353ad4f9..04ccdd8062f51 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -24,7 +24,7 @@ PaddlePaddle security team attaches great importance to the security of the fram
 ### Reporting vulnerabilities
-We encourage responsible disclosure of security issues to PaddlePaddle and please email reports about any security issues you find to security@paddlepaddle.org.
+We encourage responsible disclosure of security issues to PaddlePaddle and please email reports about any security issues you find to paddle-security@baidu.com.
diff --git a/SECURITY_cn.md b/SECURITY_cn.md
index 00b222912d277..68ad6b32176b8 100644
--- a/SECURITY_cn.md
+++ b/SECURITY_cn.md
@@ -20,7 +20,7 @@
 ### 报告安全问题
-我们鼓励向飞桨负责任地披露安全问题,请将所发现的安全问题发送电子邮件到 security@paddlepaddle.org。
+我们鼓励向飞桨负责任地披露安全问题,请将所发现的安全问题发送电子邮件到 paddle-security@baidu.com。
 在安全团队收到邮件后将会及时与您沟通并反馈问题修复进度。
diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake
index 0f0793a8ee32b..bcbfaacad1240 100644
--- a/cmake/experiments/cuda_module_loading_lazy.cmake
+++ b/cmake/experiments/cuda_module_loading_lazy.cmake
@@ -13,8 +13,8 @@
 # limitations under the License.
 #
 # this file contains experimental build options for lazy cuda module loading
-# cuda moduel lazy loading is supported by CUDA 11.6+
-# this experiment option makes Paddle supports lazy loading before CUDA 11.6.
+# cuda module lazy loading is supported by CUDA 11.7+
+# this experimental option makes Paddle support lazy loading before CUDA 11.7.
 option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF)
 if(${EXP_CUDA_MODULE_LOADING_LAZY})
@@ -28,13 +28,13 @@ if(${EXP_CUDA_MODULE_LOADING_LAZY})
     message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA")
     return()
   endif()
-  if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6")
-    message("cuda 11.6+ already support lazy module loading")
+  if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.7")
+    message("cuda 11.7+ already supports lazy module loading")
     return()
   endif()
   message(
-    "for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a"
+    "for cuda before 11.7, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a"
   )
   set(CUDA_USE_STATIC_CUDA_RUNTIME
      OFF
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
deleted file mode 100644
index 810796831e23e..0000000000000
--- a/cmake/external/boost.cmake
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-include(ExternalProject)
-
-set(BOOST_PROJECT "extern_boost")
-# To release PaddlePaddle as a pip package, we have to follow the
-# manylinux1 standard, which features as old Linux kernels and
-# compilers as possible and recommends CentOS 5. Indeed, the earliest
-# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new
-# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
-# checked that the devtools package of CentOS 6 installs boost 1.41.0.
-# So we use 1.41.0 here.
-set(BOOST_VER "1.41.0")
-# boost_1_41_0_2021_10.tar.gz is almost the same with boost_1_41_0.tar.gz,
-# except in visualc.hpp i comment a warning of "unknown compiler version",
-# so if you need to change boost, you may need to block the warning similarly.
-set(BOOST_TAR
-    "boost_1_41_0_2021_10"
-    CACHE STRING "" FORCE)
-set(BOOST_URL
-    "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz"
-    CACHE STRING "" FORCE)
-
-message(STATUS "BOOST_VERSION: ${BOOST_VER}, BOOST_URL: ${BOOST_URL}")
-
-set(BOOST_PREFIX_DIR ${THIRD_PARTY_PATH}/boost)
-set(BOOST_INCLUDE_DIR
-    "${THIRD_PARTY_PATH}/boost/src/extern_boost"
-    CACHE PATH "boost include directory."
-    FORCE)
-set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
-include_directories(${BOOST_INCLUDE_DIR})
-
-if(WIN32 AND MSVC_VERSION GREATER_EQUAL 1600)
-  add_definitions(-DBOOST_HAS_STATIC_ASSERT)
-endif()
-
-ExternalProject_Add(
-  ${BOOST_PROJECT}
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  URL ${BOOST_URL}
-  URL_MD5 51be7cc203628dc0848e97eee32d79e3
-  PREFIX ${BOOST_PREFIX_DIR}
-  DOWNLOAD_NO_PROGRESS 1
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND ""
-  INSTALL_COMMAND ""
-  UPDATE_COMMAND "")
-
-add_library(boost INTERFACE)
-
-add_dependencies(boost ${BOOST_PROJECT})
-set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
index c6b435288e37e..de66e8d63d069 100644
--- a/cmake/external/cub.cmake
+++ b/cmake/external/cub.cmake
@@ -24,7 +24,7 @@ set(CUB_PREFIX_DIR ${CUB_PATH})
 set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git)
-if(WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6)
+if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6)
   # cuda_11.6.2_511.65's own cub is 1.15.0, which will cause compiling error in windows.
   set(CUB_TAG 1.16.0)
   # cub 1.16.0 is not compatible with current thrust version
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index df1b827ed1824..456c651a197f9 100755
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -45,7 +45,7 @@ if(WITH_ARM_BRPC)
   file(
     WRITE ${GLOG_SOURCE_DIR}/CMakeLists.txt
     "PROJECT(ARM_GLOGS)\n" "cmake_minimum_required(VERSION 3.0)\n"
-    "install(DIRECTORY arm_glog/include arm_glog/lib \n"
+    "install(DIRECTORY arm_glog/include arm_glog/lib\n"
     "  DESTINATION . USE_SOURCE_PERMISSIONS)\n")
   ExternalProject_Add(
     extern_glog
diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake
index b52b2c00d9cce..15901568ae1cd 100644
--- a/cmake/external/onnxruntime.cmake
+++ b/cmake/external/onnxruntime.cmake
@@ -134,3 +134,15 @@ endif()
 add_library(onnxruntime STATIC IMPORTED GLOBAL)
 set_property(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB})
 add_dependencies(onnxruntime ${ONNXRUNTIME_PROJECT})
+
+function(copy_onnx TARGET_NAME)
+  # If an Exitcode0xc000007b error happens when running an .exe, copy onnxruntime.dll
+  # to the .exe folder.
+ if(TARGET ${TARGET_NAME}) + add_custom_command( + TARGET ${TARGET_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} + ${CMAKE_CURRENT_BINARY_DIR} DEPENDS onnxruntime) + endif() +endfunction() diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index cbb622f5cb952..b8a1b4548b822 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -24,7 +24,7 @@ endif() include(ExternalProject) set(PADDLE2ONNX_PROJECT "extern_paddle2onnx") -set(PADDLE2ONNX_VERSION "0.9.9") +set(PADDLE2ONNX_VERSION "1.0.0rc") set(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) set(PADDLE2ONNX_SOURCE_DIR ${THIRD_PARTY_PATH}/paddle2onnx/src/${PADDLE2ONNX_PROJECT}) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 7d1cca4feb6a6..81128ccf3b6a0 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220601") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220718") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,14 +19,17 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220601") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220718") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() +set(XPU_XCCL_BASE_URL + "https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.0") + if(WITH_AARCH64) set(XPU_XRE_DIR_NAME "xre-kylin_aarch64") - set(XPU_XDNN_DIR_NAME "XDNN-kylin_aarch64") + set(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") set(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") set(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" @@ -40,7 +43,7 @@ elseif(WITH_SUNWAY) CACHE STRING "" FORCE) elseif(WITH_BDCENTOS) set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") - set(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team set(XPU_XDNN_URL @@ -48,7 +51,7 @@ elseif(WITH_BDCENTOS) CACHE STRING "" FORCE) elseif(WITH_UBUNTU) set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - set(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team set(XPU_XDNN_URL @@ -56,7 +59,7 @@ elseif(WITH_UBUNTU) CACHE STRING "" FORCE) elseif(WITH_CENTOS) set(XPU_XRE_DIR_NAME "xre-centos7_x86_64") - set(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team set(XPU_XDNN_URL @@ -64,7 +67,7 @@ elseif(WITH_CENTOS) CACHE STRING "" FORCE) else() set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - set(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # default: use output by XDNN API team set(XPU_XDNN_URL @@ -76,7 +79,7 @@ set(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) set(XPU_XCCL_URL - "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz" + "${XPU_XCCL_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) 
set(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 56345373dbe8c..2fc1be2545ddc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -427,10 +427,8 @@ copy( set(module "memory") copy( fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h - ${src_dir}/${module}/allocation/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail - ${dst_dir}/${module}/allocation) + SRCS ${src_dir}/${module}/allocation/*.h + DSTS ${dst_dir}/${module}/allocation) set(module "platform") set(platform_lib_deps profiler_proto errors) @@ -473,12 +471,6 @@ copy( ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported) -set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/boost") -copy( - inference_lib_dist - SRCS ${BOOST_INCLUDE_DIR}/boost - DSTS ${dst_dir}) - set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack") copy( inference_lib_dist diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 9f716969dcdec..e320473d9be2f 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -103,6 +103,9 @@ function(kernel_declare TARGET_LIST) elseif(${kernel_path} MATCHES "./kps\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") + elseif(${kernel_path} MATCHES "./onednn\/") + file(APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, OneDNN, ALL_LAYOUT);\n") else() # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index b96656778d60c..dd8013d807b39 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -225,7 +225,7 @@ if(NOT DEFINED WITH_MKLDNN) if(WITH_MKL AND AVX2_FOUND) set(WITH_MKLDNN ON) else() - message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") + message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN.") set(WITH_MKLDNN OFF) endif() endif() @@ -246,7 +246,6 @@ endif() include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog -include(external/boost) # download boost include(external/eigen) # download eigen3 include(external/threadpool) # download threadpool include(external/dlpack) # download dlpack @@ -254,14 +253,8 @@ include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc include(external/utf8proc) # download, build, install utf8proc -list( - APPEND - third_party_deps - extern_eigen3 - extern_gflags - extern_glog - extern_boost - extern_xxhash) +list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog + extern_xxhash) list( APPEND third_party_deps @@ -272,14 +265,8 @@ list( extern_utf8proc) include(external/lapack) # download, build, install lapack -list( - APPEND - third_party_deps - extern_eigen3 - extern_gflags - extern_glog - extern_boost - extern_xxhash) +list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog + extern_xxhash) list( APPEND third_party_deps diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 0b5f608122683..24e0a8c7a5d9f 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -33,11 +33,6 @@ if(NOT WITH_PSCORE) endif() proto_library(ps_framework_proto SRCS the_one_ps.proto) 
-add_custom_command( - TARGET ps_framework_proto - POST_BUILD - COMMAND mv the_one_ps.pb.h ps.pb.h - COMMAND mv the_one_ps.pb.cc ps.pb.cc) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 48dd6d8285699..be3bfc0dc0029 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -46,6 +46,7 @@ enum class CommType : std::uint8_t { SEND = 9, RECV = 10, BARRIER = 11, + ALLTOALL_SINGLE = 12, UNKNOWN = 100, }; @@ -143,6 +144,15 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll", GetBackendName())); } + virtual std::shared_ptr AllToAll_Single( + std::vector&, // NOLINT + std::vector&, // NOLINT + std::vector&, + std::vector&) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllToAll_Single", GetBackendName())); + } + virtual std::shared_ptr Reduce( std::vector&, // NOLINT std::vector&, // NOLINT @@ -159,6 +169,14 @@ class ProcessGroup { "ProcessGroup%s does not support Scatter", GetBackendName())); } + virtual std::shared_ptr _ReduceScatterBase( + phi::DenseTensor&, // NOLINT + phi::DenseTensor&, // NOLINT + const ReduceScatterOptions&) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support ReduceScatter", GetBackendName())); + } + protected: const int rank_; const int size_; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index e6e69f0be3ae5..1beca8022e9f9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -85,6 +85,34 @@ bool ProcessGroupNCCL::NCCLTask::IsCompleted() { return true; } +void ProcessGroupNCCL::CheckSplitSizes(std::vector& split_sizes, + std::vector tensor_shape) { + int64_t len_size = split_sizes.size(); + if (len_size == 0) { + PADDLE_ENFORCE_EQ(tensor_shape[0] % size_ == 0, + true, + platform::errors::InvalidArgument( + "Tensor's dim[0] must be divisible by group size " + "when split_sizes not given.")); + split_sizes.insert(split_sizes.end(), + size_, + static_cast(tensor_shape[0] / size_)); + } else { + PADDLE_ENFORCE_EQ( + len_size == size_, + true, + platform::errors::InvalidArgument( + "The length of split_sizes must be equal to group size.")); + auto sum_size = std::accumulate( + split_sizes.begin(), split_sizes.end(), static_cast(0)); + PADDLE_ENFORCE_EQ( + sum_size == tensor_shape[0], + true, + platform::errors::InvalidArgument( + "The sum of split_sizes must be equal to tensor's dim[0].")); + } +} + // TODO(sheniang03): Add timeout for wait, now timeout unused bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { SynchronizeStreams(); @@ -637,7 +665,69 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); }, - CommType::ALLREDUCE); + CommType::ALLTOALL); +} + +std::shared_ptr ProcessGroupNCCL::AllToAll_Single( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + 
platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(), + true, + platform::errors::InvalidArgument( + "The dtypes of input and output must be equal.")); + + std::vector in_dims = phi::vectorize(input.dims()); + std::vector out_dims = phi::vectorize(output.dims()); + CheckSplitSizes(in_sizes, in_dims); + CheckSplitSizes(out_sizes, out_dims); + + size_t in_offset = 0, out_offset = 0; + size_t in_length = 0, out_length = 0; + size_t in_row_size = input.numel() / in_dims[0]; + size_t out_row_size = output.numel() / out_dims[0]; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + in_length = in_sizes[i] * in_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), in_offset, input.dtype()), + in_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + in_offset += in_length; + + out_length = out_sizes[i] * out_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output.data(), out_offset, input.dtype()), + out_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + out_offset += out_length; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL_SINGLE); } std::shared_ptr ProcessGroupNCCL::Reduce( @@ -721,5 +811,57 @@ std::shared_ptr ProcessGroupNCCL::Scatter( CommType::SCATTER); } +std::shared_ptr ProcessGroupNCCL::_ReduceScatterBase( + phi::DenseTensor& out_tensor, + phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts) { + // auto tensor = out_tensors.back(); + PADDLE_ENFORCE_EQ( + out_tensor.dtype(), + in_tensor.dtype(), + platform::errors::InvalidArgument( + "Input tensor and output tensor should be same dtype.")); + + PADDLE_ENFORCE_EQ( + out_tensor.numel() * size_, + in_tensor.numel(), + platform::errors::InvalidArgument("input tensor must be the same size as " + "output tensor size times world_size")); + + auto inputs = std::vector{in_tensor}; + auto outputs = std::vector{out_tensor}; + + return Collective( + inputs, + outputs, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator) { + platform::CUDADeviceGuard cuda_guard; + cuda_guard.SetDevice(output.place()); + memory::RecordStream(output.Holder(), stream); + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( + input.data(), + output.data(), + output.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + comm, + stream)); + }, + CommType::REDUCE_SCATTER); +} + +void ProcessGroupNCCL::GroupStart() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); +} + +void ProcessGroupNCCL::GroupEnd() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index a26f5947ce2b8..a8adffe64e70d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ 
b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -129,6 +129,12 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& in, std::vector& out) override; + std::shared_ptr AllToAll_Single( + std::vector& in, + std::vector& out, + std::vector& in_sizes, + std::vector& out_sizes) override; + std::shared_ptr Reduce( std::vector& tensors, std::vector& out_tensors, @@ -139,6 +145,15 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& out_tensors, const ScatterOptions&) override; + std::shared_ptr _ReduceScatterBase( + phi::DenseTensor&, // NOLINT + phi::DenseTensor&, // NOLINT + const ReduceScatterOptions&) override; + + static void GroupStart(); + + static void GroupEnd(); + protected: virtual std::shared_ptr CreateTask( std::vector places, @@ -162,8 +177,8 @@ class ProcessGroupNCCL : public ProcessGroup { std::set used_place_ids_; private: - void BcastNCCLId(std::vector& nccl_ids, - int root, // NOLINT + void BcastNCCLId(std::vector& nccl_ids, // NOLINT + int root, // NOLINT int server_fd); void BroadcastUniqueNCCLID(std::vector& nccl_ids); // NOLINT @@ -190,6 +205,9 @@ class ProcessGroupNCCL : public ProcessGroup { void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); + + void CheckSplitSizes(std::vector& split_sizes, + std::vector tensor_shape); }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 973f7c6435427..0ce92111f6a13 100644 --- a/paddle/fluid/distributed/collective/Types.h +++ b/paddle/fluid/distributed/collective/Types.h @@ -45,5 +45,9 @@ struct ScatterOptions { int root_rank = 0; }; +struct ReduceScatterOptions { + ReduceOp reduce_op = ReduceOp::SUM; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 4262161b1bc45..dda5f2eee6e8f 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -251,7 +251,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { "Please recompile or reinstall Paddle with NCCL support.")); #endif } else if (platform::is_cpu_place(place)) { - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); ConcatTensorsWithType( *default_ctx, dense_tensors_, &dense_contents_, dtype_); @@ -274,7 +274,7 @@ void EagerGroup::SplitTensors(const platform::Place &place) { "Please recompile or reinstall Paddle with NCCL support.")); #endif } else if (platform::is_cpu_place(place)) { - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); SplitTensorsWithType( *default_ctx, &dense_contents_, &dense_tensors_, dtype_); @@ -891,7 +891,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, "Please recompile or reinstall Paddle with NCCL support.")); #endif } else if (platform::is_cpu_place(inner_place_)) { - dev_ctx = static_cast( + dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(inner_place_)); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/distributed/common/afs_warpper.h b/paddle/fluid/distributed/common/afs_warpper.h index f9e8118514670..542d65d7a649f 100644 --- a/paddle/fluid/distributed/common/afs_warpper.h +++ b/paddle/fluid/distributed/common/afs_warpper.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/fluid/distributed/ps.pb.h" +#include 
"paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index 847e85a13415d..1abfb57b99dab 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -31,9 +31,9 @@ namespace paddle { namespace distributed { template -inline phi::funcs::BlasT GetBlas() { - paddle::platform::CPUDeviceContext cpu_ctx; - return phi::funcs::GetBlas(cpu_ctx); +inline phi::funcs::BlasT GetBlas() { + phi::CPUContext cpu_ctx; + return phi::funcs::GetBlas(cpu_ctx); } template diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt index ad49b651e2e71..709d11f7fbb84 100755 --- a/paddle/fluid/distributed/ps/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -86,35 +86,29 @@ cc_library( cc_library( downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc - DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) + DEPS eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library( downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc ps_local_client.cc - DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) + DEPS eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library( client SRCS ps_client.cc - DEPS downpour_client boost ${RPC_DEPS}) + DEPS downpour_client ${RPC_DEPS}) cc_library( server SRCS server.cc - DEPS downpour_server boost ${RPC_DEPS}) + DEPS downpour_server ${RPC_DEPS}) cc_library( communicator SRCS communicator/communicator.cc - DEPS scope - client - boost - table - math_function - selected_rows_functor - ${RPC_DEPS}) + DEPS scope client table math_function selected_rows_functor ${RPC_DEPS}) cc_library( ps_service SRCS ps_service/service.cc - DEPS communicator client server boost ${RPC_DEPS}) + DEPS communicator client server ${RPC_DEPS}) cc_library( heter_client diff --git a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt index 612358c71a6fb..03244ecba7b4a 100644 --- a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt @@ -6,10 +6,4 @@ set_source_files_properties( cc_library( communicator SRCS communicator.cc - DEPS scope - client - boost - table - math_function - selected_rows_functor - ${RPC_DEPS}) + DEPS scope client table math_function selected_rows_functor ${RPC_DEPS}) diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index 990dbc845f0ad..0856c81121f89 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -353,11 +353,12 @@ void Communicator::RpcRecvSparse(const std::string &varname, bool training = true; - auto status = _worker_ptr->PullSparseParam((float **)push_g_vec.data(), - table_id, // NOLINT - sparse_push_keys.data(), - sparse_push_keys.size(), - training); + auto status = + _worker_ptr->PullSparseParam(static_cast(push_g_vec.data()), + table_id, + sparse_push_keys.data(), + sparse_push_keys.size(), + training); status.wait(); return; } @@ -1184,12 +1185,12 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) { auto &t_latest = var_latest->Get(); auto t_timestamp = var_timestamp->GetMutable(); - 
paddle::platform::CPUDeviceContext cpu_ctx; + phi::CPUContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - auto blas = phi::funcs::GetBlas(cpu_ctx); + auto blas = phi::funcs::GetBlas(cpu_ctx); blas.VSUB(t_latest.numel(), t_latest.data(), t_timestamp->data(), @@ -1218,7 +1219,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) { RpcRecvDense(varnames, table_id, pserver_scope_.get()); // 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver - paddle::platform::CPUDeviceContext cpu_ctx; + phi::CPUContext cpu_ctx; for (auto &varname : varnames) { auto *var_latest = recv_scope_->FindVar(varname); auto t_latest = var_latest->GetMutable(); @@ -1233,7 +1234,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) { auto *t_delta = var_delta->GetMutable(); t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - auto blas = phi::funcs::GetBlas(cpu_ctx); + auto blas = phi::funcs::GetBlas(cpu_ctx); blas.VSUB(t_latest->numel(), t_pserver.data(), t_old->data(), @@ -1334,7 +1335,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, auto *t_old = var_old->GetMutable(); auto dims1 = t_latest.dims()[1]; - paddle::platform::CPUDeviceContext cpu_ctx; + phi::CPUContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); @@ -1345,7 +1346,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, t_delta->set_rows(sparse_ids); t_delta->set_height(t_latest.dims()[0]); - auto blas = phi::funcs::GetBlas(cpu_ctx); + auto blas = phi::funcs::GetBlas(cpu_ctx); float coefficient = 1.0 / static_cast(trainers_); std::vector push_g_vec; @@ -1419,8 +1420,8 @@ void GeoCommunicator::RecvSparse(const std::string &varname, std::vector v_delta; v_delta.resize(numel); - paddle::platform::CPUDeviceContext cpu_ctx; - auto blas = phi::funcs::GetBlas(cpu_ctx); + phi::CPUContext cpu_ctx; + auto blas = phi::funcs::GetBlas(cpu_ctx); for (auto j = 0; j < static_cast(keys.size()); ++j) { VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j] diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h index 69589da8b3031..f08208ed02d70 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -185,9 +185,8 @@ inline void MergeVars(const std::string &var_name, } // set output tensor to 0. 
- paddle::platform::CPUDeviceContext cpu_ctx; - phi::funcs::SetConstant - constant_functor; + phi::CPUContext cpu_ctx; + phi::funcs::SetConstant constant_functor; constant_functor(cpu_ctx, out_t, static_cast(0)); // sum all vars to out auto result = EigenVector::Flatten(*out_t); @@ -210,16 +209,13 @@ inline void MergeVars(const std::string &var_name, for (auto &var : vars) { inputs.push_back(&var->Get()); } - paddle::platform::CPUDeviceContext dev_ctx; + phi::CPUContext dev_ctx; if (merge_add) { - paddle::operators::math::scatter:: - MergeAdd - merge_add; + paddle::operators::math::scatter::MergeAdd merge_add; merge_add(dev_ctx, inputs, out_slr); } else { - paddle::operators::math::scatter:: - MergeAverage - merge_average; + paddle::operators::math::scatter::MergeAverage + merge_average; merge_average(dev_ctx, inputs, out_slr); } diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 1e680345b7b49..01bf29b429191 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -22,11 +22,11 @@ #include #include "paddle/fluid/distributed/common/cost_timer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/platform/timer.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index b09ee358af010..4e915ab50fe86 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -27,12 +27,12 @@ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/ps/service/ps_service/service.h b/paddle/fluid/distributed/ps/service/ps_service/service.h index 69e40da54f44f..1b9f9249c3bbe 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/service.h @@ -19,10 +19,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/ps_client.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/service/server.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index cd4e39ae450d1..32c989826811f 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -24,9 +24,9 @@ #include "butil/endpoint.h" #include "google/protobuf/service.h" #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index fdda59420f03c..3a9933cabdd7c 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -119,7 +119,6 @@ cc_library( string_helper device_context gflags - glog - boost) + glog) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 3261fb9f2ea01..b55c77bf52d84 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -21,7 +21,7 @@ #include "paddle/fluid/distributed/common/afs_warpper.h" #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 46d991ef1d787..c9283d478feb4 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -19,9 +19,9 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index e766f4c767c43..4b69054e555c5 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -19,9 +19,9 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index 38b3e6ecae68d..a360030cb7d3d 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -19,9 +19,9 @@ #include #include 
"paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 60f012441c65c..115f8bcf58eaf 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -18,7 +18,6 @@ #include -#include "boost/lexical_cast.hpp" #include "glog/logging.h" #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/framework/io/fs.h" @@ -530,7 +529,7 @@ int32_t MemorySparseTable::PullSparsePtr(char** pull_values, mf_value_size]() -> int { auto& keys = task_keys[shard_id]; auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_size]; + float data_buffer[value_size]; // NOLINT float* data_buffer_ptr = data_buffer; for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; @@ -549,7 +548,7 @@ int32_t MemorySparseTable::PullSparsePtr(char** pull_values, ret = itr.value_ptr(); } int pull_data_idx = keys[i].second; - pull_values[pull_data_idx] = (char*)ret; + pull_values[pull_data_idx] = (char*)ret; // NOLINT } return 0; }); diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.h b/paddle/fluid/distributed/ps/table/sparse_accessor.h index d4fbddc934862..9a58476d8e373 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.h +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.h @@ -19,9 +19,9 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h index 215a15a7d31eb..f62cffdf232e7 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h @@ -21,7 +21,7 @@ #include "glog/logging.h" // for CHECK #include "paddle/fluid/distributed/common/local_random.h" // for local_uniform_real_distribution #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 071a1703e2a6d..3e0f631ed41bc 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -39,6 +39,33 @@ int32_t SSDSparseTable::Initialize() { int32_t SSDSparseTable::InitializeShard() { return 0; } +int32_t SSDSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return PullSparsePtr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return PullSparse(pull_values, pull_value.feasigns_, 
pull_value.numel_); + } +} + +int32_t SSDSparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + return PushSparse(context.push_context.keys, + context.push_context.ptr_values, + context.num); + } else { + const uint64_t* keys = context.push_context.keys; + const float* values = context.push_context.values; + size_t num = context.num; + return PushSparse(keys, values, num); + } +} + int32_t SSDSparseTable::PullSparse(float* pull_values, const uint64_t* keys, size_t num) { @@ -73,7 +100,7 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, &missed_keys]() -> int { auto& keys = task_keys[shard_id]; auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_size]; + float data_buffer[value_size]; // NOLINT float* data_buffer_ptr = data_buffer; for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; @@ -83,7 +110,7 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, // pull rocksdb std::string tmp_string(""); if (_db->get(shard_id, - (char*)&key, + reinterpret_cast(&key), sizeof(uint64_t), tmp_string) > 0) { ++missed_keys; @@ -110,7 +137,9 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, memcpy(const_cast(feature_value.data()), data_buffer_ptr, data_size * sizeof(float)); - _db->del_data(shard_id, (char*)&key, sizeof(uint64_t)); + _db->del_data(shard_id, + reinterpret_cast(&key), + sizeof(uint64_t)); } } else { data_size = itr.value().size(); @@ -142,6 +171,95 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, return 0; } +int32_t SSDSparseTable::PullSparsePtr(char** pull_values, + const uint64_t* keys, + size_t num) { + CostTimer timer("pserver_ssd_sparse_select_all"); + size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); + size_t mf_value_size = + _value_accesor->GetAccessorInfo().mf_size / sizeof(float); + + { // 从table取值 or create + std::vector> tasks(_real_local_shard_num); + std::vector>> task_keys( + _real_local_shard_num); + for (size_t i = 0; i < num; ++i) { + int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; + task_keys[shard_id].push_back({keys[i], i}); + } + + std::atomic missed_keys{0}; + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + tasks[shard_id] = + _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, + shard_id, + &task_keys, + value_size, + mf_value_size, + pull_values, + &missed_keys]() -> int { + auto& keys = task_keys[shard_id]; + auto& local_shard = _local_shards[shard_id]; + float data_buffer[value_size]; // NOLINT + float* data_buffer_ptr = data_buffer; + for (size_t i = 0; i < keys.size(); ++i) { + uint64_t key = keys[i].first; + auto itr = local_shard.find(key); + size_t data_size = value_size - mf_value_size; + FixedFeatureValue* ret = NULL; + if (itr == local_shard.end()) { + // pull rocksdb + std::string tmp_string(""); + if (_db->get(shard_id, + reinterpret_cast(&key), + sizeof(uint64_t), + tmp_string) > 0) { + ++missed_keys; + auto& feature_value = local_shard[key]; + feature_value.resize(data_size); + float* data_ptr = + const_cast(feature_value.data()); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy( + data_ptr, data_buffer_ptr, data_size * sizeof(float)); + ret = &feature_value; + } else { + data_size = tmp_string.size() / sizeof(float); + memcpy(data_buffer_ptr, + paddle::string::str_to_float(tmp_string), + data_size * sizeof(float)); + // from rocksdb to mem + auto& feature_value = local_shard[key]; + feature_value.resize(data_size); + 
memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + data_size * sizeof(float)); + _db->del_data(shard_id, + reinterpret_cast(&key), + sizeof(uint64_t)); + ret = &feature_value; + } + } else { + ret = itr.value_ptr(); + } + int pull_data_idx = keys[i].second; + pull_values[pull_data_idx] = reinterpret_cast(ret); + } + return 0; + }); + } + for (int i = 0; i < _real_local_shard_num; ++i) { + tasks[i].wait(); + } + if (FLAGS_pserver_print_missed_key_num_every_push) { + LOG(WARNING) << "total pull keys:" << num + << " missed_keys:" << missed_keys.load(); + } + } + return 0; +} + int32_t SSDSparseTable::PushSparse(const uint64_t* keys, const float* values, size_t num) { @@ -172,7 +290,7 @@ int32_t SSDSparseTable::PushSparse(const uint64_t* keys, &task_keys]() -> int { auto& keys = task_keys[shard_id]; auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_col]; + float data_buffer[value_col]; // NOLINT float* data_buffer_ptr = data_buffer; for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; @@ -201,7 +319,8 @@ int32_t SSDSparseTable::PushSparse(const uint64_t* keys, if (value_size == value_col) { // 已拓展到最大size, 则就地update _value_accesor->Update(&value_data, &update_data, 1); - } else { // 拷入buffer区进行update,然后再回填,不需要的mf则回填时抛弃了 + } else { + // 拷入buffer区进行update,然后再回填,不需要的mf则回填时抛弃了 memcpy(data_buffer_ptr, value_data, value_size * sizeof(float)); @@ -247,6 +366,90 @@ int32_t SSDSparseTable::PushSparse(const uint64_t* keys, return 0; } +int32_t SSDSparseTable::PushSparse(const uint64_t* keys, + const float** values, + size_t num) { + CostTimer timer("pserver_downpour_sparse_update_all"); + // 构造value push_value的数据指针 + size_t value_col = _value_accesor->GetAccessorInfo().size / sizeof(float); + size_t mf_value_col = + _value_accesor->GetAccessorInfo().mf_size / sizeof(float); + size_t update_value_col = + _value_accesor->GetAccessorInfo().update_size / sizeof(float); + { + std::vector> tasks(_real_local_shard_num); + std::vector>> task_keys( + _real_local_shard_num); + for (size_t i = 0; i < num; ++i) { + int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; + task_keys[shard_id].push_back({keys[i], i}); + } + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + tasks[shard_id] = + _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, + shard_id, + value_col, + mf_value_col, + update_value_col, + values, + &task_keys]() -> int { + auto& keys = task_keys[shard_id]; + auto& local_shard = _local_shards[shard_id]; + float data_buffer[value_col]; // NOLINT + float* data_buffer_ptr = data_buffer; + for (size_t i = 0; i < keys.size(); ++i) { + uint64_t key = keys[i].first; + uint64_t push_data_idx = keys[i].second; + const float* update_data = values[push_data_idx]; + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + if (FLAGS_pserver_enable_create_feasign_randomly && + !_value_accesor->CreateValue(1, update_data)) { + continue; + } + auto value_size = value_col - mf_value_col; + auto& feature_value = local_shard[key]; + feature_value.resize(value_size); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + value_size * sizeof(float)); + itr = local_shard.find(key); + } + auto& feature_value = itr.value(); + float* value_data = const_cast(feature_value.data()); + size_t value_size = feature_value.size(); + + if (value_size == + value_col) { // 已拓展到最大size, 则就地update + _value_accesor->Update(&value_data, &update_data, 1); + } 
else { + // 拷入buffer区进行update,然后再回填,不需要的mf则回填时抛弃了 + memcpy(data_buffer_ptr, + value_data, + value_size * sizeof(float)); + _value_accesor->Update(&data_buffer_ptr, &update_data, 1); + if (_value_accesor->NeedExtendMF(data_buffer)) { + feature_value.resize(value_col); + value_data = const_cast(feature_value.data()); + _value_accesor->Create(&value_data, 1); + } + memcpy(value_data, + data_buffer_ptr, + value_size * sizeof(float)); + } + } + return 0; + }); + } + for (int i = 0; i < _real_local_shard_num; ++i) { + tasks[i].wait(); + } + } + return 0; +} + int32_t SSDSparseTable::Shrink(const std::string& param) { int thread_num = _real_local_shard_num < 20 ? _real_local_shard_num : 20; omp_set_num_threads(thread_num); @@ -282,7 +485,7 @@ int32_t SSDSparseTable::Shrink(const std::string& param) { delete it; LOG(INFO) << "SSDSparseTable shrink success. shard:" << i << " delete MEM[" << mem_count << "] SSD[" << ssd_count << "]"; - //_db->flush(i); + // _db->flush(i); } return 0; } diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index 5b38e4b3d73f7..55a05bbab5ec2 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -33,26 +33,14 @@ class SSDSparseTable : public MemorySparseTable { // exchange data int32_t UpdateTable(); - int32_t Pull(TableContext& context) override { - CHECK(context.value_type == Sparse); - float* pull_values = context.pull_context.values; - const PullSparseValue& pull_value = context.pull_context.pull_value; - return PullSparse(pull_values, pull_value.feasigns_, pull_value.numel_); - } + int32_t Pull(TableContext& context) override; - int32_t Push(TableContext& context) override { - const uint64_t* keys = context.push_context.keys; - const float* values = context.push_context.values; - size_t num = context.num; - return PushSparse(keys, values, num); - } + int32_t Push(TableContext& context) override; - virtual int32_t PullSparse(float* pull_values, - const uint64_t* keys, - size_t num); - virtual int32_t PushSparse(const uint64_t* keys, - const float* values, - size_t num); + int32_t PullSparse(float* pull_values, const uint64_t* keys, size_t num); + int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, size_t num); + int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); + int32_t PushSparse(const uint64_t* keys, const float** values, size_t num); int32_t Flush() override { return 0; } virtual int32_t Shrink(const std::string& param) override; diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index 3abf8156d9331..8401746a1e887 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -20,8 +20,8 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index a67ca29a543ab..e4228e4428d89 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -125,7 +125,10 @@ void MasterDaemon::CloseControlFd() { void MasterDaemon::StopByControlFd() { VLOG(4) << ("begin to run StopByControlFd"); if (_control_fd[1] != -1) { - 
::write(_control_fd[1], "\0", 1); + PADDLE_ENFORCE_NE(::write(_control_fd[1], "\0", 1), + -1, + platform::errors::Fatal( + "failed to write control pipe errno:%d", errno)); // close the write end of the pipe ::close(_control_fd[1]); _control_fd[1] = -1; diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 9b7a304b0a92a..16681ea77bbea 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -32,7 +32,6 @@ cc_test( client communicator ps_service - boost table ps_framework_proto ${COMMON_DEPS}) @@ -48,7 +47,6 @@ cc_test( client communicator ps_service - boost table ps_framework_proto ${COMMON_DEPS}) @@ -71,7 +69,6 @@ cc_test( client communicator ps_service - boost table ps_framework_proto ${COMMON_DEPS}) @@ -87,7 +84,6 @@ cc_test( client communicator ps_service - boost table ps_framework_proto ${COMMON_DEPS}) @@ -105,28 +101,28 @@ set_source_files_properties( cc_test( feature_value_test SRCS feature_value_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test( sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test( ctr_accessor_test SRCS ctr_accessor_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test( ctr_dymf_accessor_test SRCS ctr_dymf_accessor_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS @@ -134,11 +130,11 @@ set_source_files_properties( cc_test( memory_sparse_table_test SRCS memory_sparse_table_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test( memory_sparse_geo_table_test SRCS memory_geo_table_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) diff --git a/paddle/fluid/distributed/test/barrier_table_test.cc b/paddle/fluid/distributed/test/barrier_table_test.cc index 12c389e9766b5..31f0f0844345c 100644 --- a/paddle/fluid/distributed/test/barrier_table_test.cc +++ b/paddle/fluid/distributed/test/barrier_table_test.cc @@ -18,9 +18,9 @@ limitations under the License. */ #include #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/common_table.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index bed37e6036a5c..d10a34ddfe324 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -18,10 +18,10 @@ limitations under the License. 
*/ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc index 3f2ac69bd9a74..9b71e4524625c 100644 --- a/paddle/fluid/distributed/test/ctr_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc index fbf179dbeeef0..39bff554ff9d2 100644 --- a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/dense_table_test.cc b/paddle/fluid/distributed/test/dense_table_test.cc index 185d9d3aed1d4..8b021e2c9624e 100644 --- a/paddle/fluid/distributed/test/dense_table_test.cc +++ b/paddle/fluid/distributed/test/dense_table_test.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/memory_dense_table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index dd085d7510b60..96769cff83bb8 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -21,7 +21,6 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/distributed/ps/service/env.h" @@ -32,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 3439ffe8a0c25..de12b715deb54 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/distributed/ps/service/env.h" @@ -33,6 +32,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc index 5c05a3a70f49f..15c86e2fdd378 100644 --- a/paddle/fluid/distributed/test/graph_table_sample_test.cc +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -25,8 +25,8 @@ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace operators = paddle::operators; diff --git a/paddle/fluid/distributed/test/memory_geo_table_test.cc b/paddle/fluid/distributed/test/memory_geo_table_test.cc index 507211e69fa0f..f01c40f7043b4 100644 --- a/paddle/fluid/distributed/test/memory_geo_table_test.cc +++ b/paddle/fluid/distributed/test/memory_geo_table_test.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc index 485d81a7d6856..da311a7691fc8 100644 --- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc +++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc @@ -22,8 +22,8 @@ limitations under the License. 
*/ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc index 2dfc2961f39d1..e12e2757504a5 100644 --- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/table_test.cc b/paddle/fluid/distributed/test/table_test.cc index 56809abad0c7c..afeaf273174f4 100644 --- a/paddle/fluid/distributed/test/table_test.cc +++ b/paddle/fluid/distributed/test/table_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/memory_dense_table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" //#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" namespace paddle { diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h index ed05a6e69c026..5110f6f883e67 100644 --- a/paddle/fluid/eager/amp_auto_cast.h +++ b/paddle/fluid/eager/amp_auto_cast.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" #include "paddle/fluid/framework/convert_utils.h" namespace egr { diff --git a/paddle/fluid/eager/api/CMakeLists.txt b/paddle/fluid/eager/api/CMakeLists.txt index 4525a58a44d48..0da46bbbfbbd6 100644 --- a/paddle/fluid/eager/api/CMakeLists.txt +++ b/paddle/fluid/eager/api/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(manual) add_subdirectory(utils) add_subdirectory(generated) diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index f704d2a49184b..1f2b30853c6bf 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -6,7 +6,7 @@ cc_library( if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( final_dygraph_node - SRCS nodes.cc + SRCS nodes.cc ${eager_manual_nodes} DEPS ${eager_deps}) add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 4ee33ad100f16..1409119daf1d3 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -117,20 +117,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x, paddle::platform::DeviceContextPool::Instance(); if (expected_kernel_place == paddle::platform::CPUPlace()) { - auto* dev_ctx = dynamic_cast( - pool.Get(expected_kernel_place)); + auto* dev_ctx = + dynamic_cast(pool.Get(expected_kernel_place)); if (!dev_ctx) { PADDLE_THROW(paddle::platform::errors::Fatal( - "Cannot 
convert device_context to CPUDeviceContext." + "Cannot convert device_context to phi::CPUContext." "This indicates backend mismatch." "Pleas double check your expected place")); } - ScaleDeviceDispatch(*dense_tensor.get(), - *dev_ctx, - scale, - bias, - bias_after_scale, - dense_out.get()); + ScaleDeviceDispatch(*dense_tensor.get(), + *dev_ctx, + scale, + bias, + bias_after_scale, + dense_out.get()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 8d6df647999bd..9baf8956fe2e4 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -6,7 +6,7 @@ cc_library( if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( final_dygraph_function - SRCS dygraph_functions.cc + SRCS dygraph_functions.cc ${eager_manual_functions} DEPS ${eager_deps}) add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/manual/CMakeLists.txt b/paddle/fluid/eager/api/manual/CMakeLists.txt new file mode 100644 index 0000000000000..8c4ce6d2bdbf8 --- /dev/null +++ b/paddle/fluid/eager/api/manual/CMakeLists.txt @@ -0,0 +1,17 @@ +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) + add_subdirectory(fluid_manual) + set(fluid_manual_functions + ${fluid_manual_functions} + PARENT_SCOPE) + set(fluid_manual_nodes + ${fluid_manual_nodes} + PARENT_SCOPE) + + add_subdirectory(eager_manual) + set(eager_manual_functions + ${eager_manual_functions} + PARENT_SCOPE) + set(eager_manual_nodes + ${eager_manual_nodes} + PARENT_SCOPE) +endif() diff --git a/paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt new file mode 100644 index 0000000000000..09420f368507d --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt @@ -0,0 +1,8 @@ +add_subdirectory(forwards) +add_subdirectory(nodes) +set(eager_manual_functions + ${eager_manual_functions} + PARENT_SCOPE) +set(eager_manual_nodes + ${eager_manual_nodes} + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h new file mode 100644 index 0000000000000..f9d10600a9a00 --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
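The scale_node.cc hunk above retargets the downcast from the removed CPUDeviceContext to phi::CPUContext: the context is looked up by place from the device-context pool, dynamically cast to the concrete backend type, and a Fatal error is raised on a backend mismatch. A simplified, self-contained model of that checked-dispatch pattern follows; DeviceContext, CpuContext, GpuContext and Place are toy stand-ins, not the real phi/platform classes.

#include <map>
#include <memory>
#include <stdexcept>

enum class Place { kCPU, kGPU };

struct DeviceContext { virtual ~DeviceContext() = default; };
struct CpuContext : DeviceContext {};
struct GpuContext : DeviceContext {};

DeviceContext* GetFromPool(Place place) {
  static std::map<Place, std::unique_ptr<DeviceContext>> pool = [] {
    std::map<Place, std::unique_ptr<DeviceContext>> p;
    p[Place::kCPU] = std::make_unique<CpuContext>();
    p[Place::kGPU] = std::make_unique<GpuContext>();
    return p;
  }();
  return pool.at(place).get();
}

void ScaleOnCpu(Place expected_place) {
  auto* cpu_ctx = dynamic_cast<CpuContext*>(GetFromPool(expected_place));
  if (cpu_ctx == nullptr) {
    // Mirrors the Fatal error in the hunk: the place and the context backend
    // disagree, which is a configuration problem, not a kernel bug.
    throw std::runtime_error("cannot convert device_context to CpuContext");
  }
  (void)cpu_ctx;  // the CPU kernel would be dispatched with *cpu_ctx here
}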
+ +#pragma once + +#include "paddle/phi/api/include/tensor.h" + +paddle::experimental::Tensor add_n_final_state_dygraph_function( + const std::vector& x); + +paddle::experimental::Tensor conv2d_final_state_dygraph_function( + const paddle::experimental::Tensor& input, + const paddle::experimental::Tensor& filter, + std::vector strides, + std::vector paddings, + std::string paddding_algorithm, + int groups, + std::vector dilations, + std::string data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search); diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt new file mode 100644 index 0000000000000..d25b3ba08b5a6 --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt @@ -0,0 +1,4 @@ +set(eager_manual_functions + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc new file mode 100644 index 0000000000000..226197b0f84ad --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
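The angle-bracket template arguments in the new dygraph_forward_api.h above appear to have been stripped when the patch was rendered (e.g. "const std::vector& x", "std::vector strides"). Judging from the call sites later in this patch, a plausible reconstruction of the two prototypes is the following; the element types are an inference from those call sites, not a verbatim quote of the source file.

#include <string>
#include <vector>
#include "paddle/phi/api/include/tensor.h"

paddle::experimental::Tensor add_n_final_state_dygraph_function(
    const std::vector<paddle::experimental::Tensor>& x);

paddle::experimental::Tensor conv2d_final_state_dygraph_function(
    const paddle::experimental::Tensor& input,
    const paddle::experimental::Tensor& filter,
    std::vector<int> strides,
    std::vector<int> paddings,
    std::string paddding_algorithm,
    int groups,
    std::vector<int> dilations,
    std::string data_format,
    bool use_addto,
    int workspace_size_MB,
    bool exhaustive_search);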
+ +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/eager_amp_auto_cast.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" +DECLARE_bool(check_nan_inf); + +paddle::experimental::Tensor add_n_final_state_dygraph_function( + const std::vector& x) { + // Dygraph Record Event + paddle::platform::RecordEvent dygraph_entrance_record_event( + "add_n dygraph", paddle::platform::TracerEventType::Operator, 1); + + // AMP Logic + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + auto op_name = phi::TransToFluidOpName("add_n"); + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {x}; + + auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector); + + auto NEW_x = egr::EagerAmpAutoCasts("x", x, amp_dst_dtype, op_name); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return add_n_final_state_dygraph_function(NEW_x); + } + } + + // Get Input AutoGradMeta + std::vector x_autograd_meta_vec = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector* x_autograd_meta = &x_autograd_meta_vec; + // Forward API Call + VLOG(3) << "Final State Running: " + << "add_n_final_state_dygraph_function"; + auto api_result = paddle::experimental::add_n(x); + // Check NaN and Inf if needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("add_n", api_result); + } + + // Get Outputs + auto& out = api_result; + + // Get Output AutoGradMeta + egr::AutogradMeta* out_autograd_meta = egr::EagerUtils::autograd_meta(&out); + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = + egr::EagerUtils::ComputeRequireGrad(trace_backward, x_autograd_meta); + + // Check Inplace if needed + + // Node Creation + if (require_any_grad) { + paddle::platform::RecordEvent node_creation_record_event( + "add_n node_creation", + paddle::platform::TracerEventType::OperatorInner, + 1); + + egr::EagerUtils::PassStopGradient(false, out_autograd_meta); + + // Node Construction + auto grad_node = + std::shared_ptr(new AddNGradNodeFinal(1, 1)); + // SetAttributes if needed + + // Set TensorWrappers for Forward Inputs if needed + grad_node->SetTensorWrapperx(x); + // SetGradOutMeta & SetEdges + grad_node->SetGradOutMeta(x, 0); + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad + if (out_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(out_autograd_meta, 0); + } + if (out_autograd_meta) { + egr::EagerUtils::SetHistory(out_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(out, 0); + egr::EagerUtils::CheckAndRetainGrad(out); + // Set TensorWrappers for Forward Outputs if needed + } + + // Returns + return out; +} diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc new file mode 100644 index 0000000000000..f7bff6fb88997 --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/eager_amp_auto_cast.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" +DECLARE_bool(check_nan_inf); + +paddle::experimental::Tensor conv2d_final_state_dygraph_function( + const paddle::experimental::Tensor& input, + const paddle::experimental::Tensor& filter, + std::vector strides, + std::vector paddings, + std::string paddding_algorithm, + int groups, + std::vector dilations, + std::string data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search) { + // Dygraph Record Event + paddle::platform::RecordEvent dygraph_entrance_record_event( + "conv2d dygraph", paddle::platform::TracerEventType::Operator, 1); + + // AMP Logic + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + auto op_name = phi::TransToFluidOpName("conv2d"); + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {{input}, {filter}}; + + auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector); + + auto NEW_input = + egr::EagerAmpAutoCast("input", input, amp_dst_dtype, op_name); + auto NEW_filter = + egr::EagerAmpAutoCast("filter", filter, amp_dst_dtype, op_name); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return conv2d_final_state_dygraph_function(NEW_input, + NEW_filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search); + } + } + + // Get Input AutoGradMeta + egr::AutogradMeta* input_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(input); + egr::AutogradMeta* filter_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(filter); + // Forward API Call + VLOG(3) << "Final State Running: " + << "conv2d_final_state_dygraph_function"; + auto api_result = paddle::experimental::conv2d(input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search); + // Check NaN and Inf if needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("conv2d", api_result); + } + + // Get Outputs + auto& out = api_result; + + // Get Output AutoGradMeta + egr::AutogradMeta* out_autograd_meta = egr::EagerUtils::autograd_meta(&out); + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, input_autograd_meta, filter_autograd_meta); + + // Check Inplace if needed + + // Node Creation + if (require_any_grad) 
{ + paddle::platform::RecordEvent node_creation_record_event( + "conv2d node_creation", + paddle::platform::TracerEventType::OperatorInner, + 1); + + egr::EagerUtils::PassStopGradient(false, out_autograd_meta); + + // Node Construction + auto grad_node = + std::shared_ptr(new Conv2dGradNodeFinal(1, 2)); + // SetAttributes if needed + grad_node->SetAttributestrides(strides); + grad_node->SetAttributepaddings(paddings); + grad_node->SetAttributepaddding_algorithm(paddding_algorithm); + grad_node->SetAttributegroups(groups); + grad_node->SetAttributedilations(dilations); + grad_node->SetAttributedata_format(data_format); + grad_node->SetAttributeuse_addto(use_addto); + grad_node->SetAttributeworkspace_size_MB(workspace_size_MB); + grad_node->SetAttributeexhaustive_search(exhaustive_search); + // Set TensorWrappers for Forward Inputs if needed + grad_node->SetTensorWrapperinput(input); + grad_node->SetTensorWrapperfilter(filter); + // SetGradOutMeta & SetEdges + grad_node->SetGradOutMeta(input, 0); + grad_node->SetGradOutMeta(filter, 1); + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad + if (out_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(out_autograd_meta, 0); + } + if (out_autograd_meta) { + egr::EagerUtils::SetHistory(out_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(out, 0); + egr::EagerUtils::CheckAndRetainGrad(out); + // Set TensorWrappers for Forward Outputs if needed + } + + // Returns + return out; +} diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt new file mode 100644 index 0000000000000..ac5ce176f4e37 --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt @@ -0,0 +1,4 @@ +set(eager_manual_nodes + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc new file mode 100644 index 0000000000000..e314c0c2b5b4e --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
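Both manual forward functions above (add_n and conv2d) share the same AMP shape: if the global AMP level is not O0, cast the inputs to the AMP destination dtype, then re-enter the same function under an AutoCastGuard that temporarily forces O0, so the second pass skips the AMP branch and runs the real forward plus node creation. A toy, compilable model of that re-entry pattern, with AmpLevel, AutoCastGuard and CastToAmpDtype as simplified stand-ins for the egr/imperative machinery:

#include <vector>

enum class AmpLevel { O0, O1 };
thread_local AmpLevel g_amp_level = AmpLevel::O1;

struct AutoCastGuard {
  AmpLevel saved;
  explicit AutoCastGuard(AmpLevel level) : saved(g_amp_level) { g_amp_level = level; }
  ~AutoCastGuard() { g_amp_level = saved; }  // restore on scope exit
};

std::vector<float> CastToAmpDtype(const std::vector<float>& x) { return x; }

std::vector<float> add_n_like_forward(const std::vector<float>& x) {
  if (g_amp_level != AmpLevel::O0) {
    auto casted = CastToAmpDtype(x);
    AutoCastGuard guard(AmpLevel::O0);  // second pass will skip this branch
    return add_n_like_forward(casted);
  }
  // ... the real forward computation and grad-node creation happen here ...
  return x;
}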
+ +#include "glog/logging.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/api_custom_impl.h" +DECLARE_bool(check_nan_inf); + +paddle::small_vector, + egr::kSlotSmallVectorSize> +AddNGradNodeFinal::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + // Fill Zero For GradIn Tensors + + // Apply Gradient Hooks + auto hooked_grads = ApplyGradientHooks(grads); + + // Collect GradIn Tensors, Attrs and Recovered TensorWrappers + auto x = egr::EagerUtils::RecoverTensorWrapper(&this->x_); + auto& out_grad = hooked_grads[0][0]; + // Prepare Grad function call + + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + returns(1); + for (int i = 0; i < 1; ++i) { + out_metas[i].size() == 0 ? returns[i].resize(1) + : returns[i].resize(out_metas[i].size()); + } + + std::vector api_output_0; + api_output_0.reserve(returns[0].size()); + for (size_t i = 0; i < returns[0].size(); ++i) { + if (out_metas[0].empty() || out_metas[0][i].IsStopGradient()) { + api_output_0.push_back(nullptr); + } else { + api_output_0.push_back(&returns[0][i]); + } + } + // Call grad_api function + VLOG(3) << "Final State Running: AddNGradNodeFinal"; + + // dygraph function + for (size_t i = 0; i < returns[0].size(); i++) { + returns[0][i] = ::scale_final_state_dygraph_function( + out_grad, phi::Scalar(1.0), 0.0, true); + } + + // Check NaN and Inf id needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("add_n_grad", returns); + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns); + return returns; +} diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc new file mode 100644 index 0000000000000..ce8d647cb9ece --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "glog/logging.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/backward/sparse_bw_api.h" + +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" +DECLARE_bool(check_nan_inf); + +paddle::small_vector, + egr::kSlotSmallVectorSize> +Conv2dGradNodeFinal::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + // Fill Zero For GradIn Tensors + VLOG(3) << " Running Conv2dGradNodeFinal: " << this; + // Apply Gradient Hooks + auto hooked_grads = ApplyGradientHooks(grads); + + // Collect GradIn Tensors, Attrs and Recovered TensorWrappers + auto input = egr::EagerUtils::RecoverTensorWrapper(&this->input_); + auto filter = egr::EagerUtils::RecoverTensorWrapper(&this->filter_); + auto& grad_out = hooked_grads[0][0]; + auto& strides = this->strides_; + auto& paddings = this->paddings_; + auto& paddding_algorithm = this->paddding_algorithm_; + auto& groups = this->groups_; + auto& dilations = this->dilations_; + auto& data_format = this->data_format_; + auto& use_addto = this->use_addto_; + auto& workspace_size_MB = this->workspace_size_MB_; + auto& exhaustive_search = this->exhaustive_search_; + // Prepare Grad function call + + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + returns(2); + for (int i = 0; i < 2; ++i) { + out_metas[i].size() == 0 ? returns[i].resize(1) + : returns[i].resize(out_metas[i].size()); + } + + auto* api_output_0 = + (out_metas[0].empty() || out_metas[0][0].IsStopGradient()) + ? nullptr + : &returns[0][0]; + auto* api_output_1 = + (out_metas[1].empty() || out_metas[1][0].IsStopGradient()) + ? nullptr + : &returns[1][0]; + // Runtime check if we need next grad + bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph; + + // Inplace Check + + // Inplace Strategy + + // Call grad_api function + VLOG(3) << "Final State Running: Conv2dGradNodeFinal"; + + paddle::experimental::conv2d_grad(input, + filter, + grad_out, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + api_output_0, + api_output_1); + // Check NaN and Inf id needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("conv2d_grad", returns); + } + + // Get GradOut autograd_meta + + auto& grad_input = returns[0][0]; + egr::AutogradMeta* grad_input_autograd_meta = + returns[0][0].initialized() ? egr::EagerUtils::autograd_meta(&grad_input) + : nullptr; + if (grad_input_autograd_meta) + grad_input_autograd_meta->SetStopGradient(false); + VLOG(3) << "Conv2dGradNodeFinal grad_input_autograd_meta: " + << grad_input_autograd_meta; + + auto& grad_filter = returns[1][0]; + egr::AutogradMeta* grad_filter_autograd_meta = + returns[1][0].initialized() ? 
egr::EagerUtils::autograd_meta(&grad_filter) + : nullptr; + if (grad_filter_autograd_meta) + grad_filter_autograd_meta->SetStopGradient(false); + VLOG(3) << "Conv2dGradNodeFinal grad_filter_autograd_meta: " + << grad_filter_autograd_meta; + + // Create Grad Node + if (trace_backward) { + paddle::platform::RecordEvent node_creation_record_event( + "conv2d_grad node_creation", + paddle::platform::TracerEventType::OperatorInner, + 1); + + // Node Construction + auto grad_node = std::shared_ptr( + new Conv2dDoubleGradNodeFinal(2, 3)); + // SetAttributes if needed + grad_node->SetAttributestrides(strides); + grad_node->SetAttributepaddings(paddings); + grad_node->SetAttributepaddding_algorithm(paddding_algorithm); + grad_node->SetAttributegroups(groups); + grad_node->SetAttributedilations(dilations); + grad_node->SetAttributedata_format(data_format); + grad_node->SetAttributeuse_addto(use_addto); + grad_node->SetAttributeworkspace_size_MB(workspace_size_MB); + grad_node->SetAttributeexhaustive_search(exhaustive_search); + // Set TensorWrappers for Forward Inputs if needed + grad_node->SetTensorWrapperinput(input); + grad_node->SetTensorWrapperfilter(filter); + grad_node->SetTensorWrappergrad_out(grad_out); + // SetGradOutMeta & SetEdges + if (grad_filter_autograd_meta) { + grad_node->SetGradOutMeta(input, 0); + } + if (grad_input_autograd_meta) { + grad_node->SetGradOutMeta(filter, 1); + grad_node->SetGradOutMeta(grad_out, 2); + } + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad + if (grad_input_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(grad_input_autograd_meta, 0); + } + if (grad_filter_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(grad_filter_autograd_meta, 1); + } + if (grad_input_autograd_meta) { + egr::EagerUtils::SetHistory(grad_input_autograd_meta, grad_node); + } + if (grad_filter_autograd_meta) { + egr::EagerUtils::SetHistory(grad_filter_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(grad_input, 0); + grad_node->SetGradInMeta(grad_filter, 1); + egr::EagerUtils::CheckAndRetainGrad(grad_input); + egr::EagerUtils::CheckAndRetainGrad(grad_filter); + // Set TensorWrappers for Forward Outputs if needed + } + + // Return + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns); + return returns; +} + +paddle::small_vector, + egr::kSlotSmallVectorSize> +Conv2dDoubleGradNodeFinal::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + // Fill Zero For GradIn Tensors + const auto& input_metas = this->InputMeta(); + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0][0], + input_metas[0][0]); + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[1][0], + input_metas[1][0]); + + // Apply Gradient Hooks + auto hooked_grads = ApplyGradientHooks(grads); + + // Collect GradIn Tensors, Attrs and Recovered TensorWrappers + auto input = egr::EagerUtils::RecoverTensorWrapper(&this->input_); + auto filter = egr::EagerUtils::RecoverTensorWrapper(&this->filter_); + auto grad_out = egr::EagerUtils::RecoverTensorWrapper(&this->grad_out_); + auto& grad_input_grad = hooked_grads[0][0]; + + paddle::optional grad_input_grad_optional; + if (grad_input_grad.initialized()) + grad_input_grad_optional = + paddle::make_optional(grad_input_grad); + + auto& grad_filter_grad = hooked_grads[1][0]; + + paddle::optional grad_filter_grad_optional; + if (grad_filter_grad.initialized()) + grad_filter_grad_optional = + paddle::make_optional(grad_filter_grad); + + auto& strides = this->strides_; 
+ auto& paddings = this->paddings_; + auto& paddding_algorithm = this->paddding_algorithm_; + auto& groups = this->groups_; + auto& dilations = this->dilations_; + auto& data_format = this->data_format_; + auto& use_addto = this->use_addto_; + auto& workspace_size_MB = this->workspace_size_MB_; + auto& exhaustive_search = this->exhaustive_search_; + // Prepare Grad function call + + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + returns(3); + for (int i = 0; i < 3; ++i) { + out_metas[i].size() == 0 ? returns[i].resize(1) + : returns[i].resize(out_metas[i].size()); + } + + auto* api_output_0 = + (out_metas[0].empty() || out_metas[0][0].IsStopGradient()) + ? nullptr + : &returns[0][0]; + auto* api_output_1 = + (out_metas[1].empty() || out_metas[1][0].IsStopGradient()) + ? nullptr + : &returns[1][0]; + auto* api_output_2 = + (out_metas[2].empty() || out_metas[2][0].IsStopGradient()) + ? nullptr + : &returns[2][0]; + // Runtime check if we need next grad + + // Inplace Check + + // Inplace Strategy + + // Call grad_api function + VLOG(3) << "Final State Running: Conv2dGradGradNodeFinal"; + + paddle::experimental::conv2d_grad_grad(input, + filter, + grad_out, + grad_input_grad_optional, + grad_filter_grad_optional, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + api_output_0, + api_output_1, + api_output_2); + // Check NaN and Inf id needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("conv2d_grad_grad", returns); + } + + // Get GradOut autograd_meta + + auto& input_grad = returns[0][0]; + egr::AutogradMeta* input_grad_autograd_meta = + returns[0][0].initialized() ? egr::EagerUtils::autograd_meta(&input_grad) + : nullptr; + if (input_grad_autograd_meta) + input_grad_autograd_meta->SetStopGradient(false); + + auto& filter_grad = returns[1][0]; + egr::AutogradMeta* filter_grad_autograd_meta = + returns[1][0].initialized() ? egr::EagerUtils::autograd_meta(&filter_grad) + : nullptr; + if (filter_grad_autograd_meta) + filter_grad_autograd_meta->SetStopGradient(false); + + auto& grad_out_grad = returns[2][0]; + egr::AutogradMeta* grad_out_grad_autograd_meta = + returns[2][0].initialized() + ? egr::EagerUtils::autograd_meta(&grad_out_grad) + : nullptr; + if (grad_out_grad_autograd_meta) + grad_out_grad_autograd_meta->SetStopGradient(false); + + // Create Grad Node + + // Return + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns); + return returns; +} diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h new file mode 100644 index 0000000000000..14fe144c0094a --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h @@ -0,0 +1,229 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/imperative/tracer.h" + +class Conv2dGradNodeFinal : public egr::GradNodeBase { + public: + Conv2dGradNodeFinal() : egr::GradNodeBase() {} + Conv2dGradNodeFinal(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + ~Conv2dGradNodeFinal() override = default; + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, // NOLINT + bool is_new_grad = false) override; // NOLINT + std::string name() override { return "Conv2dGradNodeFinal"; } + + void ClearTensorWrappers() override { + input_.clear(); + filter_.clear(); + + SetIsTensorWrappersCleared(true); + } + + std::shared_ptr Copy() const override { + auto copied_node = + std::shared_ptr(new Conv2dGradNodeFinal(*this)); + VLOG(3) << "Copy Conv2dGradNodeFinal: " << this + << " to: " << copied_node.get(); + return copied_node; + } + + // SetTensorWrapperX, SetTensorWrapperY, ... + void SetTensorWrapperinput(const paddle::experimental::Tensor& input) { + input_ = egr::TensorWrapper(input, false); + } + void SetTensorWrapperfilter(const paddle::experimental::Tensor& filter) { + filter_ = egr::TensorWrapper(filter, false); + } + + // SetAttributes + void SetAttributestrides(const std::vector& strides) { + strides_ = strides; + } + void SetAttributepaddings(const std::vector& paddings) { + paddings_ = paddings; + } + void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) { + paddding_algorithm_ = paddding_algorithm; + } + void SetAttributegroups(const int& groups) { groups_ = groups; } + void SetAttributedilations(const std::vector& dilations) { + dilations_ = dilations; + } + void SetAttributedata_format(const std::string& data_format) { + data_format_ = data_format; + } + void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; } + void SetAttributeworkspace_size_MB(const int& workspace_size_MB) { + workspace_size_MB_ = workspace_size_MB; + } + void SetAttributeexhaustive_search(const bool& exhaustive_search) { + exhaustive_search_ = exhaustive_search; + } + + private: + // TensorWrappers + egr::TensorWrapper input_; + egr::TensorWrapper filter_; + + // Attributes + std::vector strides_; + std::vector paddings_; + std::string paddding_algorithm_; + int groups_; + std::vector dilations_; + std::string data_format_; + bool use_addto_; + int workspace_size_MB_; + bool exhaustive_search_; +}; + +class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { + public: + Conv2dDoubleGradNodeFinal() : egr::GradNodeBase() {} + Conv2dDoubleGradNodeFinal(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + ~Conv2dDoubleGradNodeFinal() override = default; + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, // NOLINT + bool is_new_grad = false) override; // NOLINT + std::string name() override { return "Conv2dDoubleGradNodeFinal"; } + + void ClearTensorWrappers() override { + input_.clear(); + filter_.clear(); + grad_out_.clear(); + + SetIsTensorWrappersCleared(true); + } + + std::shared_ptr Copy() const override { + auto copied_node = std::shared_ptr( + new Conv2dDoubleGradNodeFinal(*this)); + return copied_node; + } 
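The node classes in nodes.h follow one layout: every forward input the backward pass needs is captured in an egr::TensorWrapper member, every attribute in a plain member, and codegen-style SetTensorWrapper<name> / SetAttribute<name> methods populate them when the node is built; ClearTensorWrappers releases the saved activations once they are no longer needed. A stripped-down outline of that layout, with Tensor and TensorWrapper as toy stand-ins for the real types:

#include <vector>

struct Tensor {};

struct TensorWrapper {
  Tensor held;
  TensorWrapper() = default;
  explicit TensorWrapper(const Tensor& t) : held(t) {}
  void clear() { held = Tensor{}; }
};

class ToyConv2dGradNode {
 public:
  void SetTensorWrapperinput(const Tensor& input) { input_ = TensorWrapper(input); }
  void SetAttributestrides(const std::vector<int>& strides) { strides_ = strides; }

  void ClearTensorWrappers() { input_.clear(); }  // frees the saved activation

 private:
  TensorWrapper input_;       // saved forward input, replayed into conv2d_grad
  std::vector<int> strides_;  // attribute replayed into the grad kernel
};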
+ + // SetTensorWrapperX, SetTensorWrapperY, ... + void SetTensorWrapperinput(const paddle::experimental::Tensor& input) { + input_ = egr::TensorWrapper(input, false); + } + void SetTensorWrapperfilter(const paddle::experimental::Tensor& filter) { + filter_ = egr::TensorWrapper(filter, false); + } + void SetTensorWrappergrad_out(const paddle::experimental::Tensor& grad_out) { + grad_out_ = egr::TensorWrapper(grad_out, false); + } + + // SetAttributes + void SetAttributestrides(const std::vector& strides) { + strides_ = strides; + } + void SetAttributepaddings(const std::vector& paddings) { + paddings_ = paddings; + } + void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) { + paddding_algorithm_ = paddding_algorithm; + } + void SetAttributegroups(const int& groups) { groups_ = groups; } + void SetAttributedilations(const std::vector& dilations) { + dilations_ = dilations; + } + void SetAttributedata_format(const std::string& data_format) { + data_format_ = data_format; + } + void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; } + void SetAttributeworkspace_size_MB(const int& workspace_size_MB) { + workspace_size_MB_ = workspace_size_MB; + } + void SetAttributeexhaustive_search(const bool& exhaustive_search) { + exhaustive_search_ = exhaustive_search; + } + + private: + // TensorWrappers + egr::TensorWrapper input_; + egr::TensorWrapper filter_; + egr::TensorWrapper grad_out_; + + // Attributes + std::vector strides_; + std::vector paddings_; + std::string paddding_algorithm_; + int groups_; + std::vector dilations_; + std::string data_format_; + bool use_addto_; + int workspace_size_MB_; + bool exhaustive_search_; +}; + +class AddNGradNodeFinal : public egr::GradNodeBase { + public: + AddNGradNodeFinal() : egr::GradNodeBase() {} + AddNGradNodeFinal(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + ~AddNGradNodeFinal() override = default; + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; + std::string name() override { return "AddNGradNodeFinal"; } + + void ClearTensorWrappers() override { + for (auto& tw : x_) { + tw.clear(); + } + + SetIsTensorWrappersCleared(true); + } + + std::shared_ptr Copy() const override { + auto copied_node = + std::shared_ptr(new AddNGradNodeFinal(*this)); + return copied_node; + } + + // SetTensorWrapperX, SetTensorWrapperY, ... 
+ void SetTensorWrapperx(const std::vector& x) { + for (const auto& eager_tensor : x) { + x_.emplace_back(egr::TensorWrapper(eager_tensor, true)); + } + } + + // SetAttributes + + private: + // TensorWrappers + std::vector x_; + + // Attributes +}; diff --git a/paddle/fluid/eager/api/manual/fluid_manual/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/CMakeLists.txt new file mode 100644 index 0000000000000..254f4a7246da7 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/CMakeLists.txt @@ -0,0 +1,8 @@ +add_subdirectory(forwards) +add_subdirectory(nodes) +set(fluid_manual_functions + ${fluid_manual_functions} + PARENT_SCOPE) +set(fluid_manual_nodes + ${fluid_manual_nodes} + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h new file mode 100644 index 0000000000000..91d556f9557dc --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "glog/logging.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +std::tuple +fused_gate_attention_dygraph_function( + const paddle::experimental::Tensor& Query, + const paddle::experimental::Tensor& Key, + const paddle::experimental::Tensor& QueryWeight, + const paddle::experimental::Tensor& KeyWeight, + const paddle::experimental::Tensor& ValueWeight, + const paddle::experimental::Tensor& QKVWeight, + const paddle::experimental::Tensor& NonbatchedBias, + const paddle::experimental::Tensor& SrcMask, + const paddle::experimental::Tensor& GateWeight, + const paddle::experimental::Tensor& GateBias, + const paddle::experimental::Tensor& OutLinearWeight, + const paddle::experimental::Tensor& OutLinearBias, + const paddle::framework::AttributeMap& attr_map); + +std::tuple +fused_feedforward_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Dropout1Seed, + const paddle::experimental::Tensor& Dropout2Seed, + const paddle::experimental::Tensor& Linear1Weight, + const paddle::experimental::Tensor& Linear1Bias, + const paddle::experimental::Tensor& Linear2Weight, + const paddle::experimental::Tensor& Linear2Bias, + const paddle::experimental::Tensor& Ln1Scale, + const paddle::experimental::Tensor& Ln1Bias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map); + +std::tuple +fused_attention_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& LnScale, + const paddle::experimental::Tensor& LnBias, + const paddle::experimental::Tensor& QKVW, + const paddle::experimental::Tensor& QKVBias, + const 
paddle::experimental::Tensor& CacheKV, + const paddle::experimental::Tensor& SrcMask, + const paddle::experimental::Tensor& OutLinearW, + const paddle::experimental::Tensor& OutLinearBias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt new file mode 100644 index 0000000000000..5c47b0870a203 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt @@ -0,0 +1,5 @@ +set(fluid_manual_functions + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc new file mode 100644 index 0000000000000..b058fa50acdd9 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc @@ -0,0 +1,628 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/amp_auto_cast.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" + +std::tuple +fused_attention_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& LnScale, + const paddle::experimental::Tensor& LnBias, + const paddle::experimental::Tensor& QKVW, + const paddle::experimental::Tensor& QKVBias, + const paddle::experimental::Tensor& CacheKV, + const paddle::experimental::Tensor& SrcMask, + const paddle::experimental::Tensor& OutLinearW, + const paddle::experimental::Tensor& OutLinearBias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map) { + paddle::platform::RecordEvent dygraph_entrance_record_event( + "fused_attention dygraph", + paddle::platform::TracerEventType::Operator, + 1); + VLOG(3) << "Running Eager Forward Op: fused_attention"; + // Dygraph Forward Pass + + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {{X}, {QKVW}, {OutLinearW}}; + if (LnScale.initialized()) amp_tensors_vector.push_back({LnScale}); + if (LnBias.initialized()) amp_tensors_vector.push_back({LnBias}); + if (QKVBias.initialized()) amp_tensors_vector.push_back({QKVBias}); + if (CacheKV.initialized()) amp_tensors_vector.push_back({CacheKV}); + if (SrcMask.initialized()) amp_tensors_vector.push_back({SrcMask}); + if (OutLinearBias.initialized()) + amp_tensors_vector.push_back({OutLinearBias}); + if (Ln2Scale.initialized()) amp_tensors_vector.push_back({Ln2Scale}); + if (Ln2Bias.initialized()) amp_tensors_vector.push_back({Ln2Bias}); + + auto amp_dst_dtype = + egr::GetAmpDestDtype("fused_attention", amp_tensors_vector); + + auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "fused_attention"); + auto NEW_QKVW = + egr::AmpAutoCast("QKVW", QKVW, amp_dst_dtype, "fused_attention"); + auto NEW_OutLinearW = egr::AmpAutoCast( + "OutLinearW", OutLinearW, amp_dst_dtype, "fused_attention"); + auto NEW_LnScale = + ((LnScale.initialized()) + ? egr::AmpAutoCast( + "LnScale", LnScale, amp_dst_dtype, "fused_attention") + : LnScale); + auto NEW_LnBias = + ((LnBias.initialized()) + ? egr::AmpAutoCast( + "LnBias", LnBias, amp_dst_dtype, "fused_attention") + : LnBias); + auto NEW_QKVBias = + ((QKVBias.initialized()) + ? egr::AmpAutoCast( + "QKVBias", QKVBias, amp_dst_dtype, "fused_attention") + : QKVBias); + auto NEW_CacheKV = + ((CacheKV.initialized()) + ? egr::AmpAutoCast( + "CacheKV", CacheKV, amp_dst_dtype, "fused_attention") + : CacheKV); + auto NEW_SrcMask = + ((SrcMask.initialized()) + ? egr::AmpAutoCast( + "SrcMask", SrcMask, amp_dst_dtype, "fused_attention") + : SrcMask); + auto NEW_OutLinearBias = + ((OutLinearBias.initialized()) ? egr::AmpAutoCast("OutLinearBias", + OutLinearBias, + amp_dst_dtype, + "fused_attention") + : OutLinearBias); + auto NEW_Ln2Scale = + ((Ln2Scale.initialized()) + ? egr::AmpAutoCast( + "Ln2Scale", Ln2Scale, amp_dst_dtype, "fused_attention") + : Ln2Scale); + auto NEW_Ln2Bias = + ((Ln2Bias.initialized()) + ? 
egr::AmpAutoCast( + "Ln2Bias", Ln2Bias, amp_dst_dtype, "fused_attention") + : Ln2Bias); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return fused_attention_dygraph_function(NEW_X, + NEW_LnScale, + NEW_LnBias, + NEW_QKVW, + NEW_QKVBias, + NEW_CacheKV, + NEW_SrcMask, + NEW_OutLinearW, + NEW_OutLinearBias, + NEW_Ln2Scale, + NEW_Ln2Bias, + attr_map); + } + } + + std::map>> ins = + {{"X", egr::EagerUtils::TrySyncToVars(X)}, + {"QKVW", egr::EagerUtils::TrySyncToVars(QKVW)}, + {"OutLinearW", egr::EagerUtils::TrySyncToVars(OutLinearW)}}; + if (LnScale.initialized()) + ins["LnScale"] = egr::EagerUtils::TrySyncToVars(LnScale); + if (LnBias.initialized()) + ins["LnBias"] = egr::EagerUtils::TrySyncToVars(LnBias); + if (QKVBias.initialized()) + ins["QKVBias"] = egr::EagerUtils::TrySyncToVars(QKVBias); + if (CacheKV.initialized()) + ins["CacheKV"] = egr::EagerUtils::TrySyncToVars(CacheKV); + if (SrcMask.initialized()) + ins["SrcMask"] = egr::EagerUtils::TrySyncToVars(SrcMask); + if (OutLinearBias.initialized()) + ins["OutLinearBias"] = egr::EagerUtils::TrySyncToVars(OutLinearBias); + if (Ln2Scale.initialized()) + ins["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale); + if (Ln2Bias.initialized()) + ins["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias); + + std::map>> outs = + {{"LnMean", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"LnVariance", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"LnOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKVOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKVBiasOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"TransposeOut2", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKTVOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"SoftmaxOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"AttnDropoutMaskOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"AttnDropoutOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"SrcMaskOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"FMHAOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"OutLinearOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"DropoutMaskOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln2Mean", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln2Variance", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"BiasDropoutResidualOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"CacheKVOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Y", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}}; + + // Prepare Autograd Meta + egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X); + egr::AutogradMeta* p_autograd_LnScale = + egr::EagerUtils::nullable_autograd_meta(LnScale); + egr::AutogradMeta* p_autograd_LnBias = + egr::EagerUtils::nullable_autograd_meta(LnBias); + 
egr::AutogradMeta* p_autograd_QKVW = + egr::EagerUtils::nullable_autograd_meta(QKVW); + egr::AutogradMeta* p_autograd_QKVBias = + egr::EagerUtils::nullable_autograd_meta(QKVBias); + egr::AutogradMeta* p_autograd_CacheKV = + egr::EagerUtils::nullable_autograd_meta(CacheKV); + egr::AutogradMeta* p_autograd_SrcMask = + egr::EagerUtils::nullable_autograd_meta(SrcMask); + egr::AutogradMeta* p_autograd_OutLinearW = + egr::EagerUtils::nullable_autograd_meta(OutLinearW); + egr::AutogradMeta* p_autograd_OutLinearBias = + egr::EagerUtils::nullable_autograd_meta(OutLinearBias); + egr::AutogradMeta* p_autograd_Ln2Scale = + egr::EagerUtils::nullable_autograd_meta(Ln2Scale); + egr::AutogradMeta* p_autograd_Ln2Bias = + egr::EagerUtils::nullable_autograd_meta(Ln2Bias); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + + bool require_any_grad = + egr::EagerUtils::ComputeRequireGrad(trace_backward, + p_autograd_X, + p_autograd_LnScale, + p_autograd_LnBias, + p_autograd_QKVW, + p_autograd_QKVBias, + p_autograd_CacheKV, + p_autograd_SrcMask, + p_autograd_OutLinearW, + p_autograd_OutLinearBias, + p_autograd_Ln2Scale, + p_autograd_Ln2Bias); + + paddle::framework::AttributeMap attrs = attr_map; + paddle::framework::AttributeMap default_attrs; + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_attention", + ins, + outs, + attrs, + egr::Controller::Instance().GetExpectedPlace(), + &default_attrs, + true, + {}); + + paddle::experimental::Tensor LnMean; + egr::EagerUtils::GetOutput(outs["LnMean"][0], &LnMean); + paddle::experimental::Tensor LnVariance; + egr::EagerUtils::GetOutput(outs["LnVariance"][0], &LnVariance); + paddle::experimental::Tensor LnOut; + egr::EagerUtils::GetOutput(outs["LnOut"][0], &LnOut); + paddle::experimental::Tensor QKVOut; + egr::EagerUtils::GetOutput(outs["QKVOut"][0], &QKVOut); + paddle::experimental::Tensor QKVBiasOut; + egr::EagerUtils::GetOutput(outs["QKVBiasOut"][0], &QKVBiasOut); + paddle::experimental::Tensor TransposeOut2; + egr::EagerUtils::GetOutput(outs["TransposeOut2"][0], &TransposeOut2); + paddle::experimental::Tensor QKOut; + egr::EagerUtils::GetOutput(outs["QKOut"][0], &QKOut); + paddle::experimental::Tensor QKTVOut; + egr::EagerUtils::GetOutput(outs["QKTVOut"][0], &QKTVOut); + paddle::experimental::Tensor SoftmaxOut; + egr::EagerUtils::GetOutput(outs["SoftmaxOut"][0], &SoftmaxOut); + paddle::experimental::Tensor AttnDropoutMaskOut; + egr::EagerUtils::GetOutput(outs["AttnDropoutMaskOut"][0], + &AttnDropoutMaskOut); + paddle::experimental::Tensor AttnDropoutOut; + egr::EagerUtils::GetOutput(outs["AttnDropoutOut"][0], &AttnDropoutOut); + paddle::experimental::Tensor SrcMaskOut; + egr::EagerUtils::GetOutput(outs["SrcMaskOut"][0], &SrcMaskOut); + paddle::experimental::Tensor FMHAOut; + egr::EagerUtils::GetOutput(outs["FMHAOut"][0], &FMHAOut); + paddle::experimental::Tensor OutLinearOut; + egr::EagerUtils::GetOutput(outs["OutLinearOut"][0], &OutLinearOut); + paddle::experimental::Tensor DropoutMaskOut; + egr::EagerUtils::GetOutput(outs["DropoutMaskOut"][0], &DropoutMaskOut); + paddle::experimental::Tensor Ln2Mean; + egr::EagerUtils::GetOutput(outs["Ln2Mean"][0], &Ln2Mean); + paddle::experimental::Tensor Ln2Variance; + egr::EagerUtils::GetOutput(outs["Ln2Variance"][0], &Ln2Variance); + paddle::experimental::Tensor BiasDropoutResidualOut; + egr::EagerUtils::GetOutput(outs["BiasDropoutResidualOut"][0], + &BiasDropoutResidualOut); + paddle::experimental::Tensor CacheKVOut; + egr::EagerUtils::GetOutput(outs["CacheKVOut"][0], &CacheKVOut); + 
paddle::experimental::Tensor Y; + egr::EagerUtils::GetOutput(outs["Y"][0], &Y); + + { + paddle::platform::RecordEvent node_creation_record_event( + "fused_attention node_creation", + paddle::platform::TracerEventType::Operator, + 1); + egr::AutogradMeta* p_autograd_LnMean = + egr::EagerUtils::autograd_meta(&LnMean); + egr::AutogradMeta* p_autograd_LnVariance = + egr::EagerUtils::autograd_meta(&LnVariance); + egr::AutogradMeta* p_autograd_LnOut = + egr::EagerUtils::autograd_meta(&LnOut); + egr::AutogradMeta* p_autograd_QKVOut = + egr::EagerUtils::autograd_meta(&QKVOut); + egr::AutogradMeta* p_autograd_QKVBiasOut = + egr::EagerUtils::autograd_meta(&QKVBiasOut); + egr::AutogradMeta* p_autograd_TransposeOut2 = + egr::EagerUtils::autograd_meta(&TransposeOut2); + egr::AutogradMeta* p_autograd_QKOut = + egr::EagerUtils::autograd_meta(&QKOut); + egr::AutogradMeta* p_autograd_QKTVOut = + egr::EagerUtils::autograd_meta(&QKTVOut); + egr::AutogradMeta* p_autograd_SoftmaxOut = + egr::EagerUtils::autograd_meta(&SoftmaxOut); + egr::AutogradMeta* p_autograd_AttnDropoutMaskOut = + egr::EagerUtils::autograd_meta(&AttnDropoutMaskOut); + egr::AutogradMeta* p_autograd_AttnDropoutOut = + egr::EagerUtils::autograd_meta(&AttnDropoutOut); + egr::AutogradMeta* p_autograd_SrcMaskOut = + egr::EagerUtils::autograd_meta(&SrcMaskOut); + egr::AutogradMeta* p_autograd_FMHAOut = + egr::EagerUtils::autograd_meta(&FMHAOut); + egr::AutogradMeta* p_autograd_OutLinearOut = + egr::EagerUtils::autograd_meta(&OutLinearOut); + egr::AutogradMeta* p_autograd_DropoutMaskOut = + egr::EagerUtils::autograd_meta(&DropoutMaskOut); + egr::AutogradMeta* p_autograd_Ln2Mean = + egr::EagerUtils::autograd_meta(&Ln2Mean); + egr::AutogradMeta* p_autograd_Ln2Variance = + egr::EagerUtils::autograd_meta(&Ln2Variance); + egr::AutogradMeta* p_autograd_BiasDropoutResidualOut = + egr::EagerUtils::autograd_meta(&BiasDropoutResidualOut); + egr::AutogradMeta* p_autograd_CacheKVOut = + egr::EagerUtils::autograd_meta(&CacheKVOut); + egr::AutogradMeta* p_autograd_Y = egr::EagerUtils::autograd_meta(&Y); + if (require_any_grad) { + VLOG(6) << " Construct Grad for fused_attention "; + egr::EagerUtils::PassStopGradient(false, + p_autograd_LnMean, + p_autograd_LnVariance, + p_autograd_LnOut, + p_autograd_QKVOut, + p_autograd_QKVBiasOut, + p_autograd_TransposeOut2, + p_autograd_QKOut, + p_autograd_QKTVOut, + p_autograd_SoftmaxOut, + p_autograd_AttnDropoutMaskOut, + p_autograd_AttnDropoutOut, + p_autograd_SrcMaskOut, + p_autograd_FMHAOut, + p_autograd_OutLinearOut, + p_autograd_DropoutMaskOut, + p_autograd_Ln2Mean, + p_autograd_Ln2Variance, + p_autograd_BiasDropoutResidualOut, + p_autograd_CacheKVOut, + p_autograd_Y); + // Create GradOpNode + auto grad_node = std::shared_ptr( + new fused_attentionGradNodeCompat(20, 23)); + + bool pre_layer_norm = false; + if (attrs.count("pre_layer_norm")) { + pre_layer_norm = BOOST_GET_CONST(bool, attrs.at("pre_layer_norm")); + } + + // Set Attributes + grad_node->SetAttrMap(std::move(attrs)); + grad_node->SetDefaultAttrMap(std::move(default_attrs)); + + grad_node->SetTensorWrapperX(X); + grad_node->SetTensorWrapperQKVW(QKVW); + grad_node->SetTensorWrapperOutLinearW(OutLinearW); + grad_node->SetTensorWrapperQKVOut(QKVOut); + grad_node->SetTensorWrapperTransposeOut2(TransposeOut2); + grad_node->SetTensorWrapperQKOut(QKOut); + grad_node->SetTensorWrapperQKTVOut(QKTVOut); + grad_node->SetTensorWrapperSoftmaxOut(SoftmaxOut); + grad_node->SetTensorWrapperAttnDropoutMaskOut(AttnDropoutMaskOut); + 
grad_node->SetTensorWrapperAttnDropoutOut(AttnDropoutOut); + grad_node->SetTensorWrapperFMHAOut(FMHAOut); + grad_node->SetTensorWrapperOutLinearOut(OutLinearOut); + grad_node->SetTensorWrapperDropoutMaskOut(DropoutMaskOut); + + grad_node->SetGradOutMeta(X, 0); + grad_node->SetGradOutMeta(QKVW, 3); + grad_node->SetGradOutMeta(OutLinearW, 7); + + if (QKVBias.initialized()) { + grad_node->SetTensorWrapperQKVBias(QKVBias); + grad_node->SetTensorWrapperQKVBiasOut(QKVBiasOut); + grad_node->SetGradOutMeta(QKVBias, 4); + + auto QKVBiasOut_accumulation_node = + std::make_shared(p_autograd_QKVBiasOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVBiasOut, 0); + egr::EagerUtils::SetHistory(p_autograd_QKVBiasOut, + QKVBiasOut_accumulation_node); + QKVBiasOut_accumulation_node->SetGradInMeta(QKVBiasOut, 0); + egr::EagerUtils::CheckAndRetainGrad(QKVBiasOut); + grad_node->SetGradOutMeta(QKVBiasOut, 11); + } + + if (SrcMask.initialized()) { + grad_node->SetTensorWrapperSrcMask(SrcMask); + grad_node->SetTensorWrapperSrcMaskOut(SrcMaskOut); + + auto SrcMaskOut_accumulation_node = + std::make_shared(p_autograd_SrcMaskOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_SrcMaskOut, 0); + egr::EagerUtils::SetHistory(p_autograd_SrcMaskOut, + SrcMaskOut_accumulation_node); + SrcMaskOut_accumulation_node->SetGradInMeta(SrcMaskOut, 0); + egr::EagerUtils::CheckAndRetainGrad(SrcMaskOut); + grad_node->SetGradOutMeta(SrcMaskOut, 12); + } + + if (OutLinearBias.initialized()) { + grad_node->SetTensorWrapperOutLinearBias(OutLinearBias); + grad_node->SetGradOutMeta(OutLinearBias, 8); + } + + if (pre_layer_norm) { + if (LnScale.initialized()) { + grad_node->SetTensorWrapperLnScale(LnScale); + grad_node->SetGradOutMeta(LnScale, 1); + } + if (LnBias.initialized()) { + grad_node->SetTensorWrapperLnBias(LnBias); + grad_node->SetGradOutMeta(LnBias, 2); + } + if (LnOut.initialized()) { + grad_node->SetTensorWrapperLnOut(LnOut); + + auto LnOut_accumulation_node = + std::make_shared(p_autograd_LnOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_LnOut, 0); + egr::EagerUtils::SetHistory(p_autograd_LnOut, + LnOut_accumulation_node); + LnOut_accumulation_node->SetGradInMeta(LnOut, 0); + egr::EagerUtils::CheckAndRetainGrad(LnOut); + grad_node->SetGradOutMeta(LnOut, 13); + } + if (LnMean.initialized()) { + grad_node->SetTensorWrapperLnMean(LnMean); + } + if (LnVariance.initialized()) { + grad_node->SetTensorWrapperLnVariance(LnVariance); + } + } else { + if (Ln2Scale.initialized()) { + grad_node->SetTensorWrapperLn2Scale(Ln2Scale); + grad_node->SetGradOutMeta(Ln2Scale, 9); + } + if (Ln2Bias.initialized()) { + grad_node->SetTensorWrapperLn2Bias(Ln2Bias); + grad_node->SetGradOutMeta(Ln2Bias, 10); + } + grad_node->SetTensorWrapperBiasDropoutResidualOut( + BiasDropoutResidualOut); + grad_node->SetTensorWrapperLn2Mean(Ln2Mean); + grad_node->SetTensorWrapperLn2Variance(Ln2Variance); + + auto BiasDropoutResidualOut_accumulation_node = + std::make_shared( + p_autograd_BiasDropoutResidualOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_BiasDropoutResidualOut, + 0); + egr::EagerUtils::SetHistory(p_autograd_BiasDropoutResidualOut, + BiasDropoutResidualOut_accumulation_node); + BiasDropoutResidualOut_accumulation_node->SetGradInMeta( + BiasDropoutResidualOut, 0); + egr::EagerUtils::CheckAndRetainGrad(BiasDropoutResidualOut); + grad_node->SetGradOutMeta(BiasDropoutResidualOut, 14); + } + + egr::EagerUtils::SetOutRankWithSlot(p_autograd_LnMean, 0); + grad_node->SetGradInMeta(LnMean, 0); + 
egr::EagerUtils::SetOutRankWithSlot(p_autograd_LnVariance, 1); + grad_node->SetGradInMeta(LnVariance, 1); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_AttnDropoutMaskOut, 9); + grad_node->SetGradInMeta(AttnDropoutMaskOut, 9); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_DropoutMaskOut, 14); + grad_node->SetGradInMeta(DropoutMaskOut, 14); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Mean, 15); + grad_node->SetGradInMeta(Ln2Mean, 15); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Variance, 16); + grad_node->SetGradInMeta(Ln2Variance, 16); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_CacheKVOut, 18); + egr::EagerUtils::SetHistory(p_autograd_CacheKVOut, grad_node); + grad_node->SetGradInMeta(CacheKVOut, 18); + egr::EagerUtils::CheckAndRetainGrad(CacheKVOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Y, 19); + egr::EagerUtils::SetHistory(p_autograd_Y, grad_node); + grad_node->SetGradInMeta(Y, 19); + egr::EagerUtils::CheckAndRetainGrad(Y); + + auto QKVOut_accumulation_node = + std::make_shared(p_autograd_QKVOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVOut, 0); + egr::EagerUtils::SetHistory(p_autograd_QKVOut, QKVOut_accumulation_node); + QKVOut_accumulation_node->SetGradInMeta(QKVOut, 0); + egr::EagerUtils::CheckAndRetainGrad(QKVOut); + grad_node->SetGradOutMeta(QKVOut, 15); + + auto QKTVOut_accumulation_node = + std::make_shared(p_autograd_QKTVOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKTVOut, 0); + egr::EagerUtils::SetHistory(p_autograd_QKTVOut, + QKTVOut_accumulation_node); + QKTVOut_accumulation_node->SetGradInMeta(QKTVOut, 0); + egr::EagerUtils::CheckAndRetainGrad(QKTVOut); + grad_node->SetGradOutMeta(QKTVOut, 16); + + auto TransposeOut2_accumulation_node = + std::make_shared(p_autograd_TransposeOut2); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_TransposeOut2, 0); + egr::EagerUtils::SetHistory(p_autograd_TransposeOut2, + TransposeOut2_accumulation_node); + TransposeOut2_accumulation_node->SetGradInMeta(TransposeOut2, 0); + egr::EagerUtils::CheckAndRetainGrad(TransposeOut2); + grad_node->SetGradOutMeta(TransposeOut2, 17); + + auto QKOut_accumulation_node = + std::make_shared(p_autograd_QKOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKOut, 0); + egr::EagerUtils::SetHistory(p_autograd_QKOut, QKOut_accumulation_node); + QKOut_accumulation_node->SetGradInMeta(QKOut, 0); + egr::EagerUtils::CheckAndRetainGrad(QKOut); + grad_node->SetGradOutMeta(QKOut, 18); + + auto SoftmaxOut_accumulation_node = + std::make_shared(p_autograd_SoftmaxOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_SoftmaxOut, 0); + egr::EagerUtils::SetHistory(p_autograd_SoftmaxOut, + SoftmaxOut_accumulation_node); + SoftmaxOut_accumulation_node->SetGradInMeta(SoftmaxOut, 0); + egr::EagerUtils::CheckAndRetainGrad(SoftmaxOut); + grad_node->SetGradOutMeta(SoftmaxOut, 19); + + auto AttnDropoutOut_accumulation_node = + std::make_shared( + p_autograd_AttnDropoutOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_AttnDropoutOut, 0); + egr::EagerUtils::SetHistory(p_autograd_AttnDropoutOut, + AttnDropoutOut_accumulation_node); + AttnDropoutOut_accumulation_node->SetGradInMeta(AttnDropoutOut, 0); + egr::EagerUtils::CheckAndRetainGrad(AttnDropoutOut); + grad_node->SetGradOutMeta(AttnDropoutOut, 20); + + auto FMHAOut_accumulation_node = + std::make_shared(p_autograd_FMHAOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_FMHAOut, 0); + egr::EagerUtils::SetHistory(p_autograd_FMHAOut, + FMHAOut_accumulation_node); + 
FMHAOut_accumulation_node->SetGradInMeta(FMHAOut, 0); + egr::EagerUtils::CheckAndRetainGrad(FMHAOut); + grad_node->SetGradOutMeta(FMHAOut, 21); + + auto OutLinearOut_accumulation_node = + std::make_shared(p_autograd_OutLinearOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_OutLinearOut, 0); + egr::EagerUtils::SetHistory(p_autograd_OutLinearOut, + OutLinearOut_accumulation_node); + OutLinearOut_accumulation_node->SetGradInMeta(OutLinearOut, 0); + egr::EagerUtils::CheckAndRetainGrad(OutLinearOut); + grad_node->SetGradOutMeta(OutLinearOut, 22); + } + } + + return std::make_tuple(LnMean, + LnVariance, + LnOut, + QKVOut, + QKVBiasOut, + TransposeOut2, + QKOut, + QKTVOut, + SoftmaxOut, + AttnDropoutMaskOut, + AttnDropoutOut, + SrcMaskOut, + FMHAOut, + OutLinearOut, + DropoutMaskOut, + Ln2Mean, + Ln2Variance, + BiasDropoutResidualOut, + CacheKVOut, + Y); +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc new file mode 100644 index 0000000000000..e246649314b52 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc @@ -0,0 +1,403 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
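+
+// Manually maintained eager-mode forward for fused_feedforward. The control
+// flow mirrors the fused_attention forward above: optional AMP auto-cast with
+// re-entry at O0, TraceOp dispatch of the fused kernel, then grad-node
+// construction that wraps the activations needed by fused_feedforward_grad.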
+ +#include "paddle/fluid/eager/amp_auto_cast.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" + +std::tuple +fused_feedforward_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Dropout1Seed, + const paddle::experimental::Tensor& Dropout2Seed, + const paddle::experimental::Tensor& Linear1Weight, + const paddle::experimental::Tensor& Linear1Bias, + const paddle::experimental::Tensor& Linear2Weight, + const paddle::experimental::Tensor& Linear2Bias, + const paddle::experimental::Tensor& Ln1Scale, + const paddle::experimental::Tensor& Ln1Bias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map) { + paddle::platform::RecordEvent dygraph_entrance_record_event( + "fused_feedforward dygraph", + paddle::platform::TracerEventType::Operator, + 1); + VLOG(3) << "Running Eager Forward Op: fused_feedforward"; + // Dygraph Forward Pass + + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {{X}, {Linear1Weight}, {Linear2Weight}}; + if (Dropout1Seed.initialized()) + amp_tensors_vector.push_back({Dropout1Seed}); + if (Dropout2Seed.initialized()) + amp_tensors_vector.push_back({Dropout2Seed}); + if (Linear1Bias.initialized()) amp_tensors_vector.push_back({Linear1Bias}); + if (Linear2Bias.initialized()) amp_tensors_vector.push_back({Linear2Bias}); + if (Ln1Scale.initialized()) amp_tensors_vector.push_back({Ln1Scale}); + if (Ln1Bias.initialized()) amp_tensors_vector.push_back({Ln1Bias}); + if (Ln2Scale.initialized()) amp_tensors_vector.push_back({Ln2Scale}); + if (Ln2Bias.initialized()) amp_tensors_vector.push_back({Ln2Bias}); + + auto amp_dst_dtype = + egr::GetAmpDestDtype("fused_feedforward", amp_tensors_vector); + + auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "fused_feedforward"); + auto NEW_Linear1Weight = egr::AmpAutoCast( + "Linear1Weight", Linear1Weight, amp_dst_dtype, "fused_feedforward"); + auto NEW_Linear2Weight = egr::AmpAutoCast( + "Linear2Weight", Linear2Weight, amp_dst_dtype, "fused_feedforward"); + auto NEW_Dropout1Seed = + ((Dropout1Seed.initialized()) ? egr::AmpAutoCast("Dropout1Seed", + Dropout1Seed, + amp_dst_dtype, + "fused_feedforward") + : Dropout1Seed); + auto NEW_Dropout2Seed = + ((Dropout2Seed.initialized()) ? egr::AmpAutoCast("Dropout2Seed", + Dropout2Seed, + amp_dst_dtype, + "fused_feedforward") + : Dropout2Seed); + auto NEW_Linear1Bias = + ((Linear1Bias.initialized()) ? egr::AmpAutoCast("Linear1Bias", + Linear1Bias, + amp_dst_dtype, + "fused_feedforward") + : Linear1Bias); + auto NEW_Linear2Bias = + ((Linear2Bias.initialized()) ? egr::AmpAutoCast("Linear2Bias", + Linear2Bias, + amp_dst_dtype, + "fused_feedforward") + : Linear2Bias); + auto NEW_Ln1Scale = + ((Ln1Scale.initialized()) + ? egr::AmpAutoCast( + "Ln1Scale", Ln1Scale, amp_dst_dtype, "fused_feedforward") + : Ln1Scale); + auto NEW_Ln1Bias = + ((Ln1Bias.initialized()) + ? egr::AmpAutoCast( + "Ln1Bias", Ln1Bias, amp_dst_dtype, "fused_feedforward") + : Ln1Bias); + auto NEW_Ln2Scale = + ((Ln2Scale.initialized()) + ? 
egr::AmpAutoCast( + "Ln2Scale", Ln2Scale, amp_dst_dtype, "fused_feedforward") + : Ln2Scale); + auto NEW_Ln2Bias = + ((Ln2Bias.initialized()) + ? egr::AmpAutoCast( + "Ln2Bias", Ln2Bias, amp_dst_dtype, "fused_feedforward") + : Ln2Bias); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return fused_feedforward_dygraph_function(NEW_X, + NEW_Dropout1Seed, + NEW_Dropout2Seed, + NEW_Linear1Weight, + NEW_Linear1Bias, + NEW_Linear2Weight, + NEW_Linear2Bias, + NEW_Ln1Scale, + NEW_Ln1Bias, + NEW_Ln2Scale, + NEW_Ln2Bias, + attr_map); + } + } + + std::map>> ins = + {{"X", egr::EagerUtils::TrySyncToVars(X)}, + {"Linear1Weight", egr::EagerUtils::TrySyncToVars(Linear1Weight)}, + {"Linear2Weight", egr::EagerUtils::TrySyncToVars(Linear2Weight)}}; + if (Dropout1Seed.initialized()) + ins["Dropout1Seed"] = egr::EagerUtils::TrySyncToVars(Dropout1Seed); + if (Dropout2Seed.initialized()) + ins["Dropout2Seed"] = egr::EagerUtils::TrySyncToVars(Dropout2Seed); + if (Linear1Bias.initialized()) + ins["Linear1Bias"] = egr::EagerUtils::TrySyncToVars(Linear1Bias); + if (Linear2Bias.initialized()) + ins["Linear2Bias"] = egr::EagerUtils::TrySyncToVars(Linear2Bias); + if (Ln1Scale.initialized()) + ins["Ln1Scale"] = egr::EagerUtils::TrySyncToVars(Ln1Scale); + if (Ln1Bias.initialized()) + ins["Ln1Bias"] = egr::EagerUtils::TrySyncToVars(Ln1Bias); + if (Ln2Scale.initialized()) + ins["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale); + if (Ln2Bias.initialized()) + ins["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias); + + std::map>> outs = + {{"Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Dropout1Mask", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Dropout2Mask", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln1Mean", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln1Variance", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln2Mean", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln2Variance", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Linear1Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln1Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Dropout1Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Dropout2Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}}; + + // Prepare Autograd Meta + egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X); + egr::AutogradMeta* p_autograd_Dropout1Seed = + egr::EagerUtils::nullable_autograd_meta(Dropout1Seed); + egr::AutogradMeta* p_autograd_Dropout2Seed = + egr::EagerUtils::nullable_autograd_meta(Dropout2Seed); + egr::AutogradMeta* p_autograd_Linear1Weight = + egr::EagerUtils::nullable_autograd_meta(Linear1Weight); + egr::AutogradMeta* p_autograd_Linear1Bias = + egr::EagerUtils::nullable_autograd_meta(Linear1Bias); + egr::AutogradMeta* p_autograd_Linear2Weight = + egr::EagerUtils::nullable_autograd_meta(Linear2Weight); + egr::AutogradMeta* p_autograd_Linear2Bias = + egr::EagerUtils::nullable_autograd_meta(Linear2Bias); + egr::AutogradMeta* p_autograd_Ln1Scale = + egr::EagerUtils::nullable_autograd_meta(Ln1Scale); + egr::AutogradMeta* p_autograd_Ln1Bias = + 
egr::EagerUtils::nullable_autograd_meta(Ln1Bias); + egr::AutogradMeta* p_autograd_Ln2Scale = + egr::EagerUtils::nullable_autograd_meta(Ln2Scale); + egr::AutogradMeta* p_autograd_Ln2Bias = + egr::EagerUtils::nullable_autograd_meta(Ln2Bias); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + + bool require_any_grad = + egr::EagerUtils::ComputeRequireGrad(trace_backward, + p_autograd_X, + p_autograd_Dropout1Seed, + p_autograd_Dropout2Seed, + p_autograd_Linear1Weight, + p_autograd_Linear1Bias, + p_autograd_Linear2Weight, + p_autograd_Linear2Bias, + p_autograd_Ln1Scale, + p_autograd_Ln1Bias, + p_autograd_Ln2Scale, + p_autograd_Ln2Bias); + + paddle::framework::AttributeMap attrs = attr_map; + paddle::framework::AttributeMap default_attrs; + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_feedforward", + ins, + outs, + attrs, + egr::Controller::Instance().GetExpectedPlace(), + &default_attrs, + true, + {}); + + paddle::experimental::Tensor Out; + egr::EagerUtils::GetOutput(outs["Out"][0], &Out); + paddle::experimental::Tensor Dropout1Mask; + egr::EagerUtils::GetOutput(outs["Dropout1Mask"][0], &Dropout1Mask); + paddle::experimental::Tensor Dropout2Mask; + egr::EagerUtils::GetOutput(outs["Dropout2Mask"][0], &Dropout2Mask); + paddle::experimental::Tensor Ln1Mean; + egr::EagerUtils::GetOutput(outs["Ln1Mean"][0], &Ln1Mean); + paddle::experimental::Tensor Ln1Variance; + egr::EagerUtils::GetOutput(outs["Ln1Variance"][0], &Ln1Variance); + paddle::experimental::Tensor Ln2Mean; + egr::EagerUtils::GetOutput(outs["Ln2Mean"][0], &Ln2Mean); + paddle::experimental::Tensor Ln2Variance; + egr::EagerUtils::GetOutput(outs["Ln2Variance"][0], &Ln2Variance); + paddle::experimental::Tensor Linear1Out; + egr::EagerUtils::GetOutput(outs["Linear1Out"][0], &Linear1Out); + paddle::experimental::Tensor Ln1Out; + egr::EagerUtils::GetOutput(outs["Ln1Out"][0], &Ln1Out); + paddle::experimental::Tensor Dropout1Out; + egr::EagerUtils::GetOutput(outs["Dropout1Out"][0], &Dropout1Out); + paddle::experimental::Tensor Dropout2Out; + egr::EagerUtils::GetOutput(outs["Dropout2Out"][0], &Dropout2Out); + + { + paddle::platform::RecordEvent node_creation_record_event( + "fused_feedforward node_creation", + paddle::platform::TracerEventType::Operator, + 1); + egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out); + egr::AutogradMeta* p_autograd_Dropout1Mask = + egr::EagerUtils::autograd_meta(&Dropout1Mask); + egr::AutogradMeta* p_autograd_Dropout2Mask = + egr::EagerUtils::autograd_meta(&Dropout2Mask); + egr::AutogradMeta* p_autograd_Ln1Mean = + egr::EagerUtils::autograd_meta(&Ln1Mean); + egr::AutogradMeta* p_autograd_Ln1Variance = + egr::EagerUtils::autograd_meta(&Ln1Variance); + egr::AutogradMeta* p_autograd_Ln2Mean = + egr::EagerUtils::autograd_meta(&Ln2Mean); + egr::AutogradMeta* p_autograd_Ln2Variance = + egr::EagerUtils::autograd_meta(&Ln2Variance); + egr::AutogradMeta* p_autograd_Linear1Out = + egr::EagerUtils::autograd_meta(&Linear1Out); + egr::AutogradMeta* p_autograd_Ln1Out = + egr::EagerUtils::autograd_meta(&Ln1Out); + egr::AutogradMeta* p_autograd_Dropout1Out = + egr::EagerUtils::autograd_meta(&Dropout1Out); + egr::AutogradMeta* p_autograd_Dropout2Out = + egr::EagerUtils::autograd_meta(&Dropout2Out); + if (require_any_grad) { + VLOG(6) << " Construct Grad for fused_feedforward "; + egr::EagerUtils::PassStopGradient(false, + p_autograd_Out, + p_autograd_Dropout1Mask, + p_autograd_Dropout2Mask, + p_autograd_Ln1Mean, + p_autograd_Ln1Variance, + p_autograd_Ln2Mean, + 
p_autograd_Ln2Variance, + p_autograd_Linear1Out, + p_autograd_Ln1Out, + p_autograd_Dropout1Out, + p_autograd_Dropout2Out); + // Create GradOpNode + auto grad_node = std::shared_ptr( + new fused_feedforwardGradNodeCompat(11, 11)); + + bool pre_layer_norm = false; + if (attrs.count("pre_layer_norm")) { + pre_layer_norm = BOOST_GET_CONST(bool, attrs.at("pre_layer_norm")); + } + + // Set Attributes + grad_node->SetAttrMap(std::move(attrs)); + grad_node->SetDefaultAttrMap(std::move(default_attrs)); + + grad_node->SetTensorWrapperX(X); + grad_node->SetTensorWrapperLinear1Weight(Linear1Weight); + grad_node->SetTensorWrapperLinear1Bias(Linear1Bias); + grad_node->SetTensorWrapperLinear2Weight(Linear2Weight); + grad_node->SetTensorWrapperDropout1Mask(Dropout1Mask); + grad_node->SetTensorWrapperDropout2Mask(Dropout2Mask); + grad_node->SetTensorWrapperLinear1Out(Linear1Out); + grad_node->SetTensorWrapperDropout1Out(Dropout1Out); + grad_node->SetTensorWrapperDropout2Out(Dropout2Out); + + grad_node->SetGradOutMeta(X, 0); + grad_node->SetGradOutMeta(Linear1Weight, 3); + grad_node->SetGradOutMeta(Linear1Bias, 4); + grad_node->SetGradOutMeta(Linear2Weight, 5); + + if (pre_layer_norm) { + grad_node->SetTensorWrapperLn1Scale(Ln1Scale); + grad_node->SetTensorWrapperLn1Bias(Ln1Bias); + grad_node->SetTensorWrapperLn1Out(Ln1Out); + grad_node->SetTensorWrapperLn1Mean(Ln1Mean); + grad_node->SetTensorWrapperLn1Variance(Ln1Variance); + grad_node->SetGradOutMeta(Ln1Scale, 7); + grad_node->SetGradOutMeta(Ln1Bias, 8); + } else { + grad_node->SetTensorWrapperLn2Scale(Ln2Scale); + grad_node->SetGradOutMeta(Ln2Scale, 9); + grad_node->SetTensorWrapperLn2Bias(Ln2Bias); + grad_node->SetGradOutMeta(Ln2Bias, 10); + grad_node->SetTensorWrapperLn2Mean(Ln2Mean); + grad_node->SetTensorWrapperLn2Variance(Ln2Variance); + } + + if (Linear2Bias.initialized()) { + grad_node->SetTensorWrapperLinear2Bias(Linear2Bias); + grad_node->SetGradOutMeta(Linear2Bias, 6); + } + + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 0); + egr::EagerUtils::SetHistory(p_autograd_Out, grad_node); + grad_node->SetGradInMeta(Out, 0); + egr::EagerUtils::CheckAndRetainGrad(Out); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Mask, 1); + grad_node->SetGradInMeta(Dropout1Mask, 1); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Mask, 2); + grad_node->SetGradInMeta(Dropout2Mask, 2); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Mean, 3); + grad_node->SetGradInMeta(Ln1Mean, 3); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Variance, 4); + grad_node->SetGradInMeta(Ln1Variance, 4); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Mean, 5); + grad_node->SetGradInMeta(Ln2Mean, 5); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Variance, 6); + grad_node->SetGradInMeta(Ln2Variance, 6); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Linear1Out, 7); + grad_node->SetGradInMeta(Linear1Out, 7); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Out, 8); + grad_node->SetGradInMeta(Ln1Out, 8); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Out, 9); + grad_node->SetGradInMeta(Dropout1Out, 9); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Out, 10); + grad_node->SetGradInMeta(Dropout2Out, 10); + } + } + + return std::make_tuple(Out, + Dropout1Mask, + Dropout2Mask, + Ln1Mean, + Ln1Variance, + Ln2Mean, + Ln2Variance, + Linear1Out, + Ln1Out, + Dropout1Out, + Dropout2Out); +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc 
b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc new file mode 100644 index 0000000000000..81b4db4df207e --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc @@ -0,0 +1,389 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/amp_auto_cast.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" + +std::tuple +fused_gate_attention_dygraph_function( + const paddle::experimental::Tensor& Query, + const paddle::experimental::Tensor& Key, + const paddle::experimental::Tensor& QueryWeight, + const paddle::experimental::Tensor& KeyWeight, + const paddle::experimental::Tensor& ValueWeight, + const paddle::experimental::Tensor& QKVWeight, + const paddle::experimental::Tensor& NonbatchedBias, + const paddle::experimental::Tensor& SrcMask, + const paddle::experimental::Tensor& GateWeight, + const paddle::experimental::Tensor& GateBias, + const paddle::experimental::Tensor& OutLinearWeight, + const paddle::experimental::Tensor& OutLinearBias, + const paddle::framework::AttributeMap& attr_map) { + paddle::platform::RecordEvent dygraph_entrance_record_event( + "fused_gate_attention dygraph", + paddle::platform::TracerEventType::Operator, + 1); + VLOG(3) << "Running Eager Forward Op: fused_gate_attention"; + // Dygraph Forward Pass + + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = { + {Query}, {SrcMask}, {OutLinearWeight}, {OutLinearBias}}; + if (Key.initialized()) amp_tensors_vector.push_back({Key}); + if (QueryWeight.initialized()) amp_tensors_vector.push_back({QueryWeight}); + if (KeyWeight.initialized()) amp_tensors_vector.push_back({KeyWeight}); + if (ValueWeight.initialized()) amp_tensors_vector.push_back({ValueWeight}); + if (QKVWeight.initialized()) amp_tensors_vector.push_back({QKVWeight}); + if (NonbatchedBias.initialized()) + amp_tensors_vector.push_back({NonbatchedBias}); + if (GateWeight.initialized()) amp_tensors_vector.push_back({GateWeight}); + if (GateBias.initialized()) amp_tensors_vector.push_back({GateBias}); + + auto amp_dst_dtype = + egr::GetAmpDestDtype("fused_gate_attention", amp_tensors_vector); + + auto NEW_Query = + egr::AmpAutoCast("Query", Query, amp_dst_dtype, "fused_gate_attention"); + auto NEW_SrcMask = egr::AmpAutoCast( + "SrcMask", SrcMask, amp_dst_dtype, "fused_gate_attention"); + auto NEW_OutLinearWeight = egr::AmpAutoCast("OutLinearWeight", + OutLinearWeight, + amp_dst_dtype, + "fused_gate_attention"); + auto 
NEW_OutLinearBias = egr::AmpAutoCast( + "OutLinearBias", OutLinearBias, amp_dst_dtype, "fused_gate_attention"); + auto NEW_Key = ((Key.initialized()) + ? egr::AmpAutoCast( + "Key", Key, amp_dst_dtype, "fused_gate_attention") + : Key); + auto NEW_QueryWeight = + ((QueryWeight.initialized()) ? egr::AmpAutoCast("QueryWeight", + QueryWeight, + amp_dst_dtype, + "fused_gate_attention") + : QueryWeight); + auto NEW_KeyWeight = + ((KeyWeight.initialized()) ? egr::AmpAutoCast("KeyWeight", + KeyWeight, + amp_dst_dtype, + "fused_gate_attention") + : KeyWeight); + auto NEW_ValueWeight = + ((ValueWeight.initialized()) ? egr::AmpAutoCast("ValueWeight", + ValueWeight, + amp_dst_dtype, + "fused_gate_attention") + : ValueWeight); + auto NEW_QKVWeight = + ((QKVWeight.initialized()) ? egr::AmpAutoCast("QKVWeight", + QKVWeight, + amp_dst_dtype, + "fused_gate_attention") + : QKVWeight); + auto NEW_NonbatchedBias = ((NonbatchedBias.initialized()) + ? egr::AmpAutoCast("NonbatchedBias", + NonbatchedBias, + amp_dst_dtype, + "fused_gate_attention") + : NonbatchedBias); + auto NEW_GateWeight = + ((GateWeight.initialized()) ? egr::AmpAutoCast("GateWeight", + GateWeight, + amp_dst_dtype, + "fused_gate_attention") + : GateWeight); + auto NEW_GateBias = + ((GateBias.initialized()) + ? egr::AmpAutoCast( + "GateBias", GateBias, amp_dst_dtype, "fused_gate_attention") + : GateBias); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return fused_gate_attention_dygraph_function(NEW_Query, + NEW_Key, + NEW_QueryWeight, + NEW_KeyWeight, + NEW_ValueWeight, + NEW_QKVWeight, + NEW_NonbatchedBias, + NEW_SrcMask, + NEW_GateWeight, + NEW_GateBias, + NEW_OutLinearWeight, + NEW_OutLinearBias, + attr_map); + } + } + + std::map>> ins = + {{"Query", egr::EagerUtils::TrySyncToVars(Query)}, + {"SrcMask", egr::EagerUtils::TrySyncToVars(SrcMask)}, + {"OutLinearWeight", egr::EagerUtils::TrySyncToVars(OutLinearWeight)}, + {"OutLinearBias", egr::EagerUtils::TrySyncToVars(OutLinearBias)}}; + if (Key.initialized()) ins["Key"] = egr::EagerUtils::TrySyncToVars(Key); + if (QueryWeight.initialized()) + ins["QueryWeight"] = egr::EagerUtils::TrySyncToVars(QueryWeight); + if (KeyWeight.initialized()) + ins["KeyWeight"] = egr::EagerUtils::TrySyncToVars(KeyWeight); + if (ValueWeight.initialized()) + ins["ValueWeight"] = egr::EagerUtils::TrySyncToVars(ValueWeight); + if (QKVWeight.initialized()) + ins["QKVWeight"] = egr::EagerUtils::TrySyncToVars(QKVWeight); + if (NonbatchedBias.initialized()) + ins["NonbatchedBias"] = egr::EagerUtils::TrySyncToVars(NonbatchedBias); + if (GateWeight.initialized()) + ins["GateWeight"] = egr::EagerUtils::TrySyncToVars(GateWeight); + if (GateBias.initialized()) + ins["GateBias"] = egr::EagerUtils::TrySyncToVars(GateBias); + + std::map>> outs = + {{"QueryTransposeOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"KeyTransposeOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"ValueTransposeOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKVTransposeOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"SoftmaxOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"FMHAOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"GateOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Out", + {std::make_shared( + 
egr::Controller::Instance().GenerateUniqueName())}}}; + + // Prepare Autograd Meta + egr::AutogradMeta* p_autograd_Query = + egr::EagerUtils::nullable_autograd_meta(Query); + egr::AutogradMeta* p_autograd_Key = + egr::EagerUtils::nullable_autograd_meta(Key); + egr::AutogradMeta* p_autograd_QueryWeight = + egr::EagerUtils::nullable_autograd_meta(QueryWeight); + egr::AutogradMeta* p_autograd_KeyWeight = + egr::EagerUtils::nullable_autograd_meta(KeyWeight); + egr::AutogradMeta* p_autograd_ValueWeight = + egr::EagerUtils::nullable_autograd_meta(ValueWeight); + egr::AutogradMeta* p_autograd_QKVWeight = + egr::EagerUtils::nullable_autograd_meta(QKVWeight); + egr::AutogradMeta* p_autograd_NonbatchedBias = + egr::EagerUtils::nullable_autograd_meta(NonbatchedBias); + egr::AutogradMeta* p_autograd_SrcMask = + egr::EagerUtils::nullable_autograd_meta(SrcMask); + egr::AutogradMeta* p_autograd_GateWeight = + egr::EagerUtils::nullable_autograd_meta(GateWeight); + egr::AutogradMeta* p_autograd_GateBias = + egr::EagerUtils::nullable_autograd_meta(GateBias); + egr::AutogradMeta* p_autograd_OutLinearWeight = + egr::EagerUtils::nullable_autograd_meta(OutLinearWeight); + egr::AutogradMeta* p_autograd_OutLinearBias = + egr::EagerUtils::nullable_autograd_meta(OutLinearBias); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + + bool require_any_grad = + egr::EagerUtils::ComputeRequireGrad(trace_backward, + p_autograd_Query, + p_autograd_Key, + p_autograd_QueryWeight, + p_autograd_KeyWeight, + p_autograd_ValueWeight, + p_autograd_QKVWeight, + p_autograd_NonbatchedBias, + p_autograd_SrcMask, + p_autograd_GateWeight, + p_autograd_GateBias, + p_autograd_OutLinearWeight, + p_autograd_OutLinearBias); + + paddle::framework::AttributeMap attrs = attr_map; + paddle::framework::AttributeMap default_attrs; + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_gate_attention", + ins, + outs, + attrs, + egr::Controller::Instance().GetExpectedPlace(), + &default_attrs, + true, + {}); + + paddle::experimental::Tensor QueryTransposeOut; + egr::EagerUtils::GetOutput(outs["QueryTransposeOut"][0], &QueryTransposeOut); + paddle::experimental::Tensor KeyTransposeOut; + egr::EagerUtils::GetOutput(outs["KeyTransposeOut"][0], &KeyTransposeOut); + paddle::experimental::Tensor ValueTransposeOut; + egr::EagerUtils::GetOutput(outs["ValueTransposeOut"][0], &ValueTransposeOut); + paddle::experimental::Tensor QKVTransposeOut; + egr::EagerUtils::GetOutput(outs["QKVTransposeOut"][0], &QKVTransposeOut); + paddle::experimental::Tensor SoftmaxOut; + egr::EagerUtils::GetOutput(outs["SoftmaxOut"][0], &SoftmaxOut); + paddle::experimental::Tensor FMHAOut; + egr::EagerUtils::GetOutput(outs["FMHAOut"][0], &FMHAOut); + paddle::experimental::Tensor GateOut; + egr::EagerUtils::GetOutput(outs["GateOut"][0], &GateOut); + paddle::experimental::Tensor Out; + egr::EagerUtils::GetOutput(outs["Out"][0], &Out); + + { + paddle::platform::RecordEvent node_creation_record_event( + "fused_gate_attention node_creation", + paddle::platform::TracerEventType::Operator, + 1); + egr::AutogradMeta* p_autograd_QueryTransposeOut = + egr::EagerUtils::autograd_meta(&QueryTransposeOut); + egr::AutogradMeta* p_autograd_KeyTransposeOut = + egr::EagerUtils::autograd_meta(&KeyTransposeOut); + egr::AutogradMeta* p_autograd_ValueTransposeOut = + egr::EagerUtils::autograd_meta(&ValueTransposeOut); + egr::AutogradMeta* p_autograd_QKVTransposeOut = + egr::EagerUtils::autograd_meta(&QKVTransposeOut); + egr::AutogradMeta* p_autograd_SoftmaxOut = + 
egr::EagerUtils::autograd_meta(&SoftmaxOut); + egr::AutogradMeta* p_autograd_FMHAOut = + egr::EagerUtils::autograd_meta(&FMHAOut); + egr::AutogradMeta* p_autograd_GateOut = + egr::EagerUtils::autograd_meta(&GateOut); + egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out); + if (require_any_grad) { + VLOG(6) << " Construct Grad for fused_gate_attention "; + egr::EagerUtils::PassStopGradient(false, + p_autograd_QueryTransposeOut, + p_autograd_KeyTransposeOut, + p_autograd_ValueTransposeOut, + p_autograd_QKVTransposeOut, + p_autograd_SoftmaxOut, + p_autograd_FMHAOut, + p_autograd_GateOut, + p_autograd_Out); + // Create GradOpNode + auto grad_node = std::shared_ptr( + new fused_gate_attentionGradNodeCompat(8, 12)); + + bool merge_qkv = true; + if (attrs.count("merge_qkv")) { + merge_qkv = BOOST_GET_CONST(bool, attrs.at("merge_qkv")); + } + + bool has_gating = true; + if (attrs.count("has_gating")) { + has_gating = BOOST_GET_CONST(bool, attrs.at("has_gating")); + } + + // Set Attributes + grad_node->SetAttrMap(std::move(attrs)); + grad_node->SetDefaultAttrMap(std::move(default_attrs)); + + grad_node->SetTensorWrapperFMHAOut(FMHAOut); + grad_node->SetTensorWrapperQuery(Query); + grad_node->SetTensorWrapperSoftmaxOut(SoftmaxOut); + grad_node->SetTensorWrapperOutLinearBias(OutLinearBias); + grad_node->SetTensorWrapperOutLinearWeight(OutLinearWeight); + + grad_node->SetGradOutMeta(Query, 0); + grad_node->SetGradOutMeta(OutLinearWeight, 10); + grad_node->SetGradOutMeta(OutLinearBias, 11); + + if (merge_qkv) { + grad_node->SetTensorWrapperQKVTransposeOut(QKVTransposeOut); + grad_node->SetTensorWrapperQKVWeight(QKVWeight); + grad_node->SetGradOutMeta(QKVWeight, 5); + } else { + grad_node->SetTensorWrapperKey(Key); + grad_node->SetTensorWrapperQueryWeight(QueryWeight); + grad_node->SetTensorWrapperKeyWeight(KeyWeight); + grad_node->SetTensorWrapperValueWeight(ValueWeight); + grad_node->SetTensorWrapperQueryTransposeOut(QueryTransposeOut); + grad_node->SetTensorWrapperKeyTransposeOut(KeyTransposeOut); + grad_node->SetTensorWrapperValueTransposeOut(ValueTransposeOut); + + grad_node->SetGradOutMeta(Key, 1); + grad_node->SetGradOutMeta(QueryWeight, 2); + grad_node->SetGradOutMeta(KeyWeight, 3); + grad_node->SetGradOutMeta(ValueWeight, 4); + } + + if (has_gating) { + grad_node->SetTensorWrapperGateWeight(GateWeight); + grad_node->SetGradOutMeta(GateWeight, 8); + grad_node->SetTensorWrapperGateBias(GateBias); + grad_node->SetGradOutMeta(GateBias, 9); + grad_node->SetTensorWrapperGateOut(GateOut); + } + + if (NonbatchedBias.initialized()) { + grad_node->SetTensorWrapperNonbatchedBias(NonbatchedBias); + grad_node->SetGradOutMeta(NonbatchedBias, 6); + } + + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QueryTransposeOut, 0); + grad_node->SetGradInMeta(QueryTransposeOut, 0); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_KeyTransposeOut, 1); + grad_node->SetGradInMeta(KeyTransposeOut, 1); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_ValueTransposeOut, 2); + grad_node->SetGradInMeta(ValueTransposeOut, 2); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVTransposeOut, 3); + grad_node->SetGradInMeta(QKVTransposeOut, 3); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_SoftmaxOut, 4); + grad_node->SetGradInMeta(SoftmaxOut, 4); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_FMHAOut, 5); + grad_node->SetGradInMeta(FMHAOut, 5); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_GateOut, 6); + grad_node->SetGradInMeta(GateOut, 6); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 7); + 
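+      // Unlike the intermediate outputs above, Out is linked to the grad node
+      // via SetHistory, so the backward pass for fused_gate_attention is
+      // driven from it.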
egr::EagerUtils::SetHistory(p_autograd_Out, grad_node); + grad_node->SetGradInMeta(Out, 7); + egr::EagerUtils::CheckAndRetainGrad(Out); + } + } + + return std::make_tuple(QueryTransposeOut, + KeyTransposeOut, + ValueTransposeOut, + QKVTransposeOut, + SoftmaxOut, + FMHAOut, + GateOut, + Out); +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt new file mode 100644 index 0000000000000..101ed5d589075 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt @@ -0,0 +1,5 @@ +set(fluid_manual_nodes + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc new file mode 100644 index 0000000000000..990cfb5226dbb --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc @@ -0,0 +1,366 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
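+
+// Backward node for fused_attention. operator() recovers the tensors wrapped
+// at forward time, assembles the ins/outs maps for fused_attention_grad,
+// traces that op, and scatters the resulting gradients into the 23 output
+// slots of this node.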
+ +#include "glog/logging.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +paddle::small_vector, + egr::kSlotSmallVectorSize> +fused_attentionGradNodeCompat::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + VLOG(3) << "Running Eager Backward Node: fused_attentionGradNodeCompat"; + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + outputs(23); + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads0 = fused_attentionGradNodeCompat::ApplyGradientHooks(grads); + + bool pre_layer_norm = false; + if (attr_map_.count("pre_layer_norm")) { + pre_layer_norm = BOOST_GET_CONST(bool, attr_map_.at("pre_layer_norm")); + } + + std::map>> ins0 = + {{"AttnDropoutMaskOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->AttnDropoutMaskOut_))}, + {"AttnDropoutOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->AttnDropoutOut_))}, + {"DropoutMaskOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->DropoutMaskOut_))}, + {"FMHAOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->FMHAOut_))}, + {"OutLinearOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearOut_))}, + {"OutLinearW", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearW_))}, + {"QKOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->QKOut_))}, + {"QKTVOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->QKTVOut_))}, + {"QKVOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->QKVOut_))}, + {"QKVW", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->QKVW_))}, + {"SoftmaxOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxOut_))}, + {"TransposeOut2", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->TransposeOut2_))}, + {"X", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->X_))}, + {"Y@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[19])}}; + std::map>> outs0; + + if ((!out_metas[7].empty()) && (!(out_metas[7][0].IsStopGradient()))) { + outs0.insert({"OutLinearW@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[3].empty()) && (!(out_metas[3][0].IsStopGradient()))) { + outs0.insert({"QKVW@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) { + outs0.insert({"X@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + + auto QKVOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKVOut_); + if (QKVOut.defined() && (!out_metas[15].empty()) && + (!out_metas[15][0].IsStopGradient())) + outs0["QKVOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto QKTVOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKTVOut_); + if (QKTVOut.defined() && (!out_metas[16].empty()) && + (!out_metas[16][0].IsStopGradient())) + 
outs0["QKTVOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto TransposeOut2 = + egr::EagerUtils::RecoverTensorWrapper(&this->TransposeOut2_); + if (TransposeOut2.defined() && (!out_metas[17].empty()) && + (!out_metas[17][0].IsStopGradient())) + outs0["TransposeOut2@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto QKOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKOut_); + if (QKOut.defined() && (!out_metas[18].empty()) && + (!out_metas[18][0].IsStopGradient())) + outs0["QKOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto SoftmaxOut = egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxOut_); + if (SoftmaxOut.defined() && (!out_metas[19].empty()) && + (!out_metas[19][0].IsStopGradient())) + outs0["SoftmaxOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto AttnDropoutOut = + egr::EagerUtils::RecoverTensorWrapper(&this->AttnDropoutOut_); + if (AttnDropoutOut.defined() && (!out_metas[20].empty()) && + (!out_metas[20][0].IsStopGradient())) + outs0["AttnDropoutOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto FMHAOut = egr::EagerUtils::RecoverTensorWrapper(&this->FMHAOut_); + if (FMHAOut.defined() && (!out_metas[21].empty()) && + (!out_metas[21][0].IsStopGradient())) + outs0["FMHAOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto OutLinearOut = + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearOut_); + if (OutLinearOut.defined() && (!out_metas[22].empty()) && + (!out_metas[22][0].IsStopGradient())) + outs0["OutLinearOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + + auto QKVBias = egr::EagerUtils::RecoverTensorWrapper(&this->QKVBias_); + if (QKVBias.defined()) { + ins0["QKVBias"] = egr::EagerUtils::TrySyncToVars(QKVBias); + auto QKVBiasOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKVBiasOut_); + ins0["QKVBiasOut"] = egr::EagerUtils::TrySyncToVars(QKVBiasOut); + if (QKVBias.defined() && (!out_metas[4].empty()) && + (!out_metas[4][0].IsStopGradient())) + outs0["QKVBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (QKVBiasOut.defined() && (!out_metas[11].empty()) && + (!out_metas[11][0].IsStopGradient())) + outs0["QKVBiasOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto SrcMask = egr::EagerUtils::RecoverTensorWrapper(&this->SrcMask_); + if (SrcMask.defined()) { + ins0["SrcMask"] = egr::EagerUtils::TrySyncToVars(SrcMask); + auto SrcMaskOut = egr::EagerUtils::RecoverTensorWrapper(&this->SrcMaskOut_); + ins0["SrcMaskOut"] = egr::EagerUtils::TrySyncToVars(SrcMaskOut); + if (SrcMaskOut.defined() && (!out_metas[12].empty()) && + (!out_metas[12][0].IsStopGradient())) + outs0["SrcMaskOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto OutLinearBias = + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearBias_); + if (OutLinearBias.defined()) { + ins0["OutLinearBias"] = egr::EagerUtils::TrySyncToVars(OutLinearBias); + if (OutLinearBias.defined() && (!out_metas[8].empty()) && + (!out_metas[8][0].IsStopGradient())) + outs0["OutLinearBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + if (pre_layer_norm) { + auto LnScale = egr::EagerUtils::RecoverTensorWrapper(&this->LnScale_); + if (LnScale.defined()) { + ins0["LnScale"] = 
egr::EagerUtils::TrySyncToVars(LnScale); + if (LnScale.defined() && (!out_metas[1].empty()) && + (!out_metas[1][0].IsStopGradient())) + outs0["LnScale@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto LnBias = egr::EagerUtils::RecoverTensorWrapper(&this->LnBias_); + if (LnBias.defined()) { + ins0["LnBias"] = egr::EagerUtils::TrySyncToVars(LnBias); + if (LnBias.defined() && (!out_metas[2].empty()) && + (!out_metas[2][0].IsStopGradient())) + outs0["LnBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto LnOut = egr::EagerUtils::RecoverTensorWrapper(&this->LnOut_); + if (LnOut.defined()) { + ins0["LnOut"] = egr::EagerUtils::TrySyncToVars(LnOut); + if (LnOut.defined() && (!out_metas[13].empty()) && + (!out_metas[13][0].IsStopGradient())) + outs0["LnOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto LnMean = egr::EagerUtils::RecoverTensorWrapper(&this->LnMean_); + if (LnMean.defined()) { + ins0["LnMean"] = egr::EagerUtils::TrySyncToVars(LnMean); + } + + auto LnVariance = egr::EagerUtils::RecoverTensorWrapper(&this->LnVariance_); + if (LnVariance.defined()) { + ins0["LnVariance"] = egr::EagerUtils::TrySyncToVars(LnVariance); + } + } else { + auto Ln2Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_); + if (Ln2Scale.defined()) { + ins0["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale); + if (Ln2Scale.defined() && (!out_metas[9].empty()) && + (!out_metas[9][0].IsStopGradient())) + outs0["Ln2Scale@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto Ln2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_); + if (Ln2Bias.defined()) { + ins0["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias); + if (Ln2Bias.defined() && (!out_metas[10].empty()) && + (!out_metas[10][0].IsStopGradient())) + outs0["Ln2Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + auto BiasDropoutResidualOut = + egr::EagerUtils::RecoverTensorWrapper(&this->BiasDropoutResidualOut_); + auto Ln2Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Mean_); + auto Ln2Variance = + egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Variance_); + ins0["BiasDropoutResidualOut"] = + egr::EagerUtils::TrySyncToVars(BiasDropoutResidualOut); + ins0["Ln2Mean"] = egr::EagerUtils::TrySyncToVars(Ln2Mean); + ins0["Ln2Variance"] = egr::EagerUtils::TrySyncToVars(Ln2Variance); + if (BiasDropoutResidualOut.defined() && (!out_metas[14].empty()) && + (!out_metas[14][0].IsStopGradient())) + outs0["BiasDropoutResidualOut@GRAD"] = { + std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto& attrs_map0 = this->attr_map_; + // Pass the entire attribute map to TraceOp + // The underlying kernel will pickup whatever attribute they need at runtime + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_attention_grad", + ins0, + outs0, + attrs_map0, + egr::Controller::Instance().GetExpectedPlace(), + &this->default_attr_map_, + false, + {}); + + if (outs0.find("OutLinearW@GRAD") != outs0.end()) { + outputs[7] = egr::EagerUtils::GetOutputs(outs0["OutLinearW@GRAD"]); + } + if (outs0.find("QKVW@GRAD") != outs0.end()) { + outputs[3] = egr::EagerUtils::GetOutputs(outs0["QKVW@GRAD"]); + } + if (outs0.find("X@GRAD") != outs0.end()) { + outputs[0] = egr::EagerUtils::GetOutputs(outs0["X@GRAD"]); + } + + if (outs0.find("QKVOut@GRAD") != outs0.end()) { + outputs[15] = 
egr::EagerUtils::GetOutputs(outs0["QKVOut@GRAD"]); + } + if (outs0.find("QKTVOut@GRAD") != outs0.end()) { + outputs[16] = egr::EagerUtils::GetOutputs(outs0["QKTVOut@GRAD"]); + } + if (outs0.find("TransposeOut2@GRAD") != outs0.end()) { + outputs[17] = egr::EagerUtils::GetOutputs(outs0["TransposeOut2@GRAD"]); + } + if (outs0.find("QKOut@GRAD") != outs0.end()) { + outputs[18] = egr::EagerUtils::GetOutputs(outs0["QKOut@GRAD"]); + } + if (outs0.find("SoftmaxOut@GRAD") != outs0.end()) { + outputs[19] = egr::EagerUtils::GetOutputs(outs0["SoftmaxOut@GRAD"]); + } + if (outs0.find("AttnDropoutOut@GRAD") != outs0.end()) { + outputs[20] = egr::EagerUtils::GetOutputs(outs0["AttnDropoutOut@GRAD"]); + } + if (outs0.find("FMHAOut@GRAD") != outs0.end()) { + outputs[21] = egr::EagerUtils::GetOutputs(outs0["FMHAOut@GRAD"]); + } + if (outs0.find("OutLinearOut@GRAD") != outs0.end()) { + outputs[22] = egr::EagerUtils::GetOutputs(outs0["OutLinearOut@GRAD"]); + } + + if (QKVBias.defined()) { + if (outs0.find("QKVBias@GRAD") != outs0.end()) { + outputs[4] = egr::EagerUtils::GetOutputs(outs0["QKVBias@GRAD"]); + } + if (outs0.find("QKVBiasOut@GRAD") != outs0.end()) { + outputs[11] = egr::EagerUtils::GetOutputs(outs0["QKVBiasOut@GRAD"]); + } + } + + if (SrcMask.defined()) { + if (outs0.find("SrcMaskOut@GRAD") != outs0.end()) { + outputs[12] = egr::EagerUtils::GetOutputs(outs0["SrcMaskOut@GRAD"]); + } + } + + if (OutLinearBias.defined()) { + if (outs0.find("OutLinearBias@GRAD") != outs0.end()) { + outputs[8] = egr::EagerUtils::GetOutputs(outs0["OutLinearBias@GRAD"]); + } + } + + if (pre_layer_norm) { + auto LnScale = egr::EagerUtils::RecoverTensorWrapper(&this->LnScale_); + if (LnScale.defined()) { + if (outs0.find("LnScale@GRAD") != outs0.end()) { + outputs[1] = egr::EagerUtils::GetOutputs(outs0["LnScale@GRAD"]); + } + } + + auto LnBias = egr::EagerUtils::RecoverTensorWrapper(&this->LnBias_); + if (LnBias.defined()) { + if (outs0.find("LnBias@GRAD") != outs0.end()) { + outputs[2] = egr::EagerUtils::GetOutputs(outs0["LnBias@GRAD"]); + } + } + + auto LnOut = egr::EagerUtils::RecoverTensorWrapper(&this->LnOut_); + if (LnOut.defined()) { + if (outs0.find("LnOut@GRAD") != outs0.end()) { + outputs[13] = egr::EagerUtils::GetOutputs(outs0["LnOut@GRAD"]); + } + } + } else { + auto Ln2Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_); + if (Ln2Scale.defined()) { + if (outs0.find("Ln2Scale@GRAD") != outs0.end()) { + outputs[9] = egr::EagerUtils::GetOutputs(outs0["Ln2Scale@GRAD"]); + } + } + + auto Ln2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_); + if (Ln2Bias.defined()) { + if (outs0.find("Ln2Bias@GRAD") != outs0.end()) { + outputs[10] = egr::EagerUtils::GetOutputs(outs0["Ln2Bias@GRAD"]); + } + } + if (outs0.find("BiasDropoutResidualOut@GRAD") != outs0.end()) { + outputs[14] = + egr::EagerUtils::GetOutputs(outs0["BiasDropoutResidualOut@GRAD"]); + } + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs); + return outputs; +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc new file mode 100644 index 0000000000000..5228cb3657825 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +paddle::small_vector, + egr::kSlotSmallVectorSize> +fused_feedforwardGradNodeCompat::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + VLOG(3) << "Running Eager Backward Node: fused_feedforwardGradNodeCompat"; + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + outputs(11); + + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads0 = + fused_feedforwardGradNodeCompat::ApplyGradientHooks(grads); + + bool pre_layer_norm = false; + if (attr_map_.count("pre_layer_norm")) { + pre_layer_norm = BOOST_GET_CONST(bool, attr_map_.at("pre_layer_norm")); + } + + std::map>> ins0 = + {{"Dropout1Mask", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Dropout1Mask_))}, + {"Dropout1Out", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Dropout1Out_))}, + {"Dropout2Mask", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Dropout2Mask_))}, + {"Dropout2Out", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Dropout2Out_))}, + {"Linear1Out", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Out_))}, + {"Linear1Weight", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Weight_))}, + {"Linear2Weight", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Linear2Weight_))}, + {"Out@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[0])}, + {"X", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->X_))}}; + + std::map>> outs0; + + auto Linear1Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Bias_); + if (Linear1Bias.defined()) + ins0["Linear1Bias"] = egr::EagerUtils::TrySyncToVars(Linear1Bias); + + if ((!out_metas[3].empty()) && (!(out_metas[3][0].IsStopGradient()))) { + outs0.insert({"Linear1Weight@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[5].empty()) && (!(out_metas[5][0].IsStopGradient()))) { + outs0.insert({"Linear2Weight@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) { + outs0.insert({"X@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if (Linear1Bias.defined() && (!out_metas[4].empty()) && + (!out_metas[4][0].IsStopGradient())) + outs0["Linear1Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + + if (pre_layer_norm) { + auto Ln1Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Scale_); + if (Ln1Scale.defined()) + ins0["Ln1Scale"] = 
egr::EagerUtils::TrySyncToVars(Ln1Scale); + auto Ln1Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Bias_); + if (Ln1Bias.defined()) + ins0["Ln1Bias"] = egr::EagerUtils::TrySyncToVars(Ln1Bias); + auto Ln1Out = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Out_); + if (Ln1Out.defined()) + ins0["Ln1Out"] = egr::EagerUtils::TrySyncToVars(Ln1Out); + auto Ln1Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Mean_); + if (Ln1Mean.defined()) + ins0["Ln1Mean"] = egr::EagerUtils::TrySyncToVars(Ln1Mean); + auto Ln1Variance = + egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Variance_); + if (Ln1Variance.defined()) + ins0["Ln1Variance"] = egr::EagerUtils::TrySyncToVars(Ln1Variance); + if (Ln1Scale.defined() && (!out_metas[7].empty()) && + (!out_metas[7][0].IsStopGradient())) + outs0["Ln1Scale@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (Ln1Bias.defined() && (!out_metas[8].empty()) && + (!out_metas[8][0].IsStopGradient())) + outs0["Ln1Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + + } else { + auto Ln2Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_); + if (Ln2Scale.defined()) + ins0["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale); + auto Ln2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_); + if (Ln2Bias.defined()) + ins0["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias); + auto Ln2Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Mean_); + if (Ln2Mean.defined()) + ins0["Ln2Mean"] = egr::EagerUtils::TrySyncToVars(Ln2Mean); + auto Ln2Variance = + egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Variance_); + if (Ln2Variance.defined()) + ins0["Ln2Variance"] = egr::EagerUtils::TrySyncToVars(Ln2Variance); + if (Ln2Scale.defined() && (!out_metas[9].empty()) && + (!out_metas[9][0].IsStopGradient())) + outs0["Ln2Scale@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (Ln2Bias.defined() && (!out_metas[10].empty()) && + (!out_metas[10][0].IsStopGradient())) + outs0["Ln2Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto Linear2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Linear2Bias_); + if (Linear2Bias.defined()) { + ins0["Linear2Bias"] = egr::EagerUtils::TrySyncToVars(Linear2Bias); + if ((!out_metas[6].empty()) && (!out_metas[6][0].IsStopGradient())) + outs0["Linear2Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto& attrs_map0 = this->attr_map_; + // Pass the entire attribute map to TraceOp + // The underlying kernel will pickup whatever attribute they need at runtime + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_feedforward_grad", + ins0, + outs0, + attrs_map0, + egr::Controller::Instance().GetExpectedPlace(), + &this->default_attr_map_, + false, + {}); + + if (outs0.find("Linear1Weight@GRAD") != outs0.end()) { + outputs[3] = egr::EagerUtils::GetOutputs(outs0["Linear1Weight@GRAD"]); + } + if (outs0.find("Linear2Weight@GRAD") != outs0.end()) { + outputs[5] = egr::EagerUtils::GetOutputs(outs0["Linear2Weight@GRAD"]); + } + if (outs0.find("X@GRAD") != outs0.end()) { + outputs[0] = egr::EagerUtils::GetOutputs(outs0["X@GRAD"]); + } + if (outs0.find("Linear1Bias@GRAD") != outs0.end()) { + outputs[4] = egr::EagerUtils::GetOutputs(outs0["Linear1Bias@GRAD"]); + } + + if (pre_layer_norm) { + if (outs0.find("Ln1Scale@GRAD") != outs0.end()) { + outputs[7] = egr::EagerUtils::GetOutputs(outs0["Ln1Scale@GRAD"]); + } + if 
(outs0.find("Ln1Bias@GRAD") != outs0.end()) { + outputs[8] = egr::EagerUtils::GetOutputs(outs0["Ln1Bias@GRAD"]); + } + + } else { + if (outs0.find("Ln2Bias@GRAD") != outs0.end()) { + outputs[10] = egr::EagerUtils::GetOutputs(outs0["Ln2Bias@GRAD"]); + } + if (outs0.find("Ln2Scale@GRAD") != outs0.end()) { + outputs[9] = egr::EagerUtils::GetOutputs(outs0["Ln2Scale@GRAD"]); + } + } + + if (Linear2Bias.defined()) { + if (outs0.find("Linear2Bias@GRAD") != outs0.end()) { + outputs[6] = egr::EagerUtils::GetOutputs(outs0["Linear2Bias@GRAD"]); + } + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs); + return outputs; +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc new file mode 100644 index 0000000000000..a1ccaf09de8b4 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +paddle::small_vector, + egr::kSlotSmallVectorSize> +fused_gate_attentionGradNodeCompat::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + VLOG(3) << "Running Eager Backward Node: fused_gate_attentionGradNodeCompat"; + + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + outputs(12); + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads0 = + fused_gate_attentionGradNodeCompat::ApplyGradientHooks(grads); + + bool merge_qkv = true; + if (attr_map_.count("merge_qkv")) { + merge_qkv = BOOST_GET_CONST(bool, attr_map_.at("merge_qkv")); + } + + bool has_gating = true; + if (attr_map_.count("has_gating")) { + has_gating = BOOST_GET_CONST(bool, attr_map_.at("has_gating")); + } + + std::map>> ins0 = + {{"FMHAOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->FMHAOut_))}, + {"Out@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[7])}, + {"OutLinearBias", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearBias_))}, + {"OutLinearWeight", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearWeight_))}, + {"Query", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Query_))}, + {"SoftmaxOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxOut_))}}; + std::map>> outs0; + + if ((!out_metas[11].empty()) && (!(out_metas[11][0].IsStopGradient()))) { + outs0.insert({"OutLinearBias@GRAD", + 
{std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[10].empty()) && (!(out_metas[10][0].IsStopGradient()))) { + outs0.insert({"OutLinearWeight@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) { + outs0.insert({"Query@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + + if (merge_qkv) { + auto QKVTransposeOut = + egr::EagerUtils::RecoverTensorWrapper(&this->QKVTransposeOut_); + if (QKVTransposeOut.defined()) + ins0["QKVTransposeOut"] = egr::EagerUtils::TrySyncToVars(QKVTransposeOut); + auto QKVWeight = egr::EagerUtils::RecoverTensorWrapper(&this->QKVWeight_); + if (QKVWeight.defined()) + ins0["QKVWeight"] = egr::EagerUtils::TrySyncToVars(QKVWeight); + if (QKVWeight.defined() && (!out_metas[5].empty()) && + (!out_metas[5][0].IsStopGradient())) + outs0["QKVWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } else { + auto Key = egr::EagerUtils::RecoverTensorWrapper(&this->Key_); + if (Key.defined()) ins0["Key"] = egr::EagerUtils::TrySyncToVars(Key); + auto QueryWeight = + egr::EagerUtils::RecoverTensorWrapper(&this->QueryWeight_); + if (QueryWeight.defined()) + ins0["QueryWeight"] = egr::EagerUtils::TrySyncToVars(QueryWeight); + auto KeyWeight = egr::EagerUtils::RecoverTensorWrapper(&this->KeyWeight_); + if (KeyWeight.defined()) + ins0["KeyWeight"] = egr::EagerUtils::TrySyncToVars(KeyWeight); + auto ValueWeight = + egr::EagerUtils::RecoverTensorWrapper(&this->ValueWeight_); + if (ValueWeight.defined()) + ins0["ValueWeight"] = egr::EagerUtils::TrySyncToVars(ValueWeight); + auto QueryTransposeOut = + egr::EagerUtils::RecoverTensorWrapper(&this->QueryTransposeOut_); + if (QueryTransposeOut.defined()) + ins0["QueryTransposeOut"] = + egr::EagerUtils::TrySyncToVars(QueryTransposeOut); + auto KeyTransposeOut = + egr::EagerUtils::RecoverTensorWrapper(&this->KeyTransposeOut_); + if (KeyTransposeOut.defined()) + ins0["KeyTransposeOut"] = egr::EagerUtils::TrySyncToVars(KeyTransposeOut); + auto ValueTransposeOut = + egr::EagerUtils::RecoverTensorWrapper(&this->ValueTransposeOut_); + if (ValueTransposeOut.defined()) + ins0["ValueTransposeOut"] = + egr::EagerUtils::TrySyncToVars(ValueTransposeOut); + + if (Key.defined() && (!out_metas[1].empty()) && + (!out_metas[1][0].IsStopGradient())) + outs0["Key@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (QueryWeight.defined() && (!out_metas[2].empty()) && + (!out_metas[2][0].IsStopGradient())) + outs0["QueryWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (KeyWeight.defined() && (!out_metas[3].empty()) && + (!out_metas[3][0].IsStopGradient())) + outs0["KeyWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (ValueWeight.defined() && (!out_metas[4].empty()) && + (!out_metas[4][0].IsStopGradient())) + outs0["ValueWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + if (has_gating) { + auto GateBias = egr::EagerUtils::RecoverTensorWrapper(&this->GateBias_); + if (GateBias.defined()) + ins0["GateBias"] = egr::EagerUtils::TrySyncToVars(GateBias); + auto GateWeight = egr::EagerUtils::RecoverTensorWrapper(&this->GateWeight_); + if (GateWeight.defined()) + ins0["GateWeight"] = egr::EagerUtils::TrySyncToVars(GateWeight); + auto GateOut = 
egr::EagerUtils::RecoverTensorWrapper(&this->GateOut_); + if (GateOut.defined()) + ins0["GateOut"] = egr::EagerUtils::TrySyncToVars(GateOut); + if (GateBias.defined() && (!out_metas[9].empty()) && + (!out_metas[9][0].IsStopGradient())) + outs0["GateBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (GateWeight.defined() && (!out_metas[8].empty()) && + (!out_metas[8][0].IsStopGradient())) + outs0["GateWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto NonbatchedBias = + egr::EagerUtils::RecoverTensorWrapper(&this->NonbatchedBias_); + if (NonbatchedBias.defined()) { + ins0["NonbatchedBias"] = egr::EagerUtils::TrySyncToVars(NonbatchedBias); + if ((!out_metas[6].empty()) && (!out_metas[6][0].IsStopGradient())) + outs0["NonbatchedBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto& attrs_map0 = this->attr_map_; + // Pass the entire attribute map to TraceOp + // The underlying kernel will pickup whatever attribute they need at runtime + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_gate_attention_grad", + ins0, + outs0, + attrs_map0, + egr::Controller::Instance().GetExpectedPlace(), + &this->default_attr_map_, + false, + {}); + + if (outs0.find("Query@GRAD") != outs0.end()) { + outputs[0] = egr::EagerUtils::GetOutputs(outs0["Query@GRAD"]); + } + if (outs0.find("OutLinearBias@GRAD") != outs0.end()) { + outputs[11] = egr::EagerUtils::GetOutputs(outs0["OutLinearBias@GRAD"]); + } + if (outs0.find("OutLinearWeight@GRAD") != outs0.end()) { + outputs[10] = egr::EagerUtils::GetOutputs(outs0["OutLinearWeight@GRAD"]); + } + + if (merge_qkv) { + if (outs0.find("QKVWeight@GRAD") != outs0.end()) { + outputs[5] = egr::EagerUtils::GetOutputs(outs0["QKVWeight@GRAD"]); + } + } else { + if (outs0.find("Key@GRAD") != outs0.end()) { + outputs[1] = egr::EagerUtils::GetOutputs(outs0["Key@GRAD"]); + } + if (outs0.find("QueryWeight@GRAD") != outs0.end()) { + outputs[2] = egr::EagerUtils::GetOutputs(outs0["QueryWeight@GRAD"]); + } + if (outs0.find("KeyWeight@GRAD") != outs0.end()) { + outputs[3] = egr::EagerUtils::GetOutputs(outs0["KeyWeight@GRAD"]); + } + if (outs0.find("ValueWeight@GRAD") != outs0.end()) { + outputs[4] = egr::EagerUtils::GetOutputs(outs0["ValueWeight@GRAD"]); + } + } + + if (has_gating) { + if (outs0.find("GateBias@GRAD") != outs0.end()) { + outputs[9] = egr::EagerUtils::GetOutputs(outs0["GateBias@GRAD"]); + } + if (outs0.find("GateWeight@GRAD") != outs0.end()) { + outputs[8] = egr::EagerUtils::GetOutputs(outs0["GateWeight@GRAD"]); + } + } + + if (NonbatchedBias.defined()) { + if (outs0.find("NonbatchedBias@GRAD") != outs0.end()) { + outputs[6] = egr::EagerUtils::GetOutputs(outs0["NonbatchedBias@GRAD"]); + } + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs); + return outputs; +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h new file mode 100644 index 0000000000000..571deb4e9ca74 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h @@ -0,0 +1,533 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
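// In fused_gate_attentionGradNodeCompat::operator() above, a grad output is
// allocated for a slot only when the matching forward input still needs a
// gradient: the recovered wrapper must be defined() (for optional inputs such
// as GateBias or NonbatchedBias) and the slot's OutputMeta must be non-empty
// and not stop-gradient. A condensed sketch of that guard, with SlotMeta as an
// assumed stand-in for egr::AutogradMeta (not part of this patch):

#include <cstddef>
#include <vector>

struct SlotMeta {
  bool stop_gradient = true;
};

// True when outs0 should receive a freshly generated grad VarBase for slot i.
bool NeedsGradOutput(const std::vector<std::vector<SlotMeta>>& out_metas,
                     std::size_t i,
                     bool input_defined = true) {
  return input_defined && i < out_metas.size() && !out_metas[i].empty() &&
         !out_metas[i][0].stop_gradient;
}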
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/imperative/tracer.h" + +class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { + public: + fused_gate_attentionGradNodeCompat() : egr::GradNodeBase() { + VLOG(7) << " Construct fused_gate_attentionGradNodeCompat "; + } + fused_gate_attentionGradNodeCompat(size_t bwd_in_slot_num, + size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { + VLOG(7) << " Construct fused_gate_attentionGradNodeCompat "; + } + ~fused_gate_attentionGradNodeCompat() override { + VLOG(6) << " Destruct fused_gate_attentionGradNodeCompat "; + } + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; + + void ClearTensorWrappers() override { + FMHAOut_.clear(); + GateBias_.clear(); + GateOut_.clear(); + GateWeight_.clear(); + NonbatchedBias_.clear(); + OutLinearBias_.clear(); + OutLinearWeight_.clear(); + QKVTransposeOut_.clear(); + QKVWeight_.clear(); + Query_.clear(); + SoftmaxOut_.clear(); + Key_.clear(); + QueryWeight_.clear(); + KeyWeight_.clear(); + ValueWeight_.clear(); + QueryTransposeOut_.clear(); + KeyTransposeOut_.clear(); + ValueTransposeOut_.clear(); + + SetIsTensorWrappersCleared(true); + } + std::string name() override { return "fused_gate_attentionGradNodeCompat"; } + + std::shared_ptr Copy() const override { + { + auto copied_node = std::shared_ptr( + new fused_gate_attentionGradNodeCompat(*this)); + return copied_node; + } + } + + // SetX, SetY, ... 
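  // Each setter below captures one forward-pass tensor into an
  // egr::TensorWrapper member so that operator() can rebuild the backward
  // input map. The intended round trip, sketched with one of the wrappers
  // from this class (the forward-side call is the assumed usage, not shown
  // in this patch):
  //
  //   // at node construction time, in the manually written forward:
  //   grad_node->SetTensorWrapperSoftmaxOut(SoftmaxOut);
  //
  //   // at backward time, in fused_gate_attention_node.cc above:
  //   auto SoftmaxOut =
  //       egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxOut_);
  //   ins0["SoftmaxOut"] = egr::EagerUtils::TrySyncToVars(SoftmaxOut);
  //
  // ClearTensorWrappers() then releases the captured buffers once the node
  // has been consumed.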
+ void SetTensorWrapperFMHAOut(const paddle::experimental::Tensor& FMHAOut) { + FMHAOut_ = egr::TensorWrapper(FMHAOut, false); + } + void SetTensorWrapperGateBias(const paddle::experimental::Tensor& GateBias) { + GateBias_ = egr::TensorWrapper(GateBias, false); + } + void SetTensorWrapperGateOut(const paddle::experimental::Tensor& GateOut) { + GateOut_ = egr::TensorWrapper(GateOut, false); + } + void SetTensorWrapperGateWeight( + const paddle::experimental::Tensor& GateWeight) { + GateWeight_ = egr::TensorWrapper(GateWeight, false); + } + void SetTensorWrapperNonbatchedBias( + const paddle::experimental::Tensor& NonbatchedBias) { + NonbatchedBias_ = egr::TensorWrapper(NonbatchedBias, false); + } + void SetTensorWrapperOutLinearBias( + const paddle::experimental::Tensor& OutLinearBias) { + OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false); + } + void SetTensorWrapperOutLinearWeight( + const paddle::experimental::Tensor& OutLinearWeight) { + OutLinearWeight_ = egr::TensorWrapper(OutLinearWeight, false); + } + void SetTensorWrapperQKVTransposeOut( + const paddle::experimental::Tensor& QKVTransposeOut) { + QKVTransposeOut_ = egr::TensorWrapper(QKVTransposeOut, false); + } + void SetTensorWrapperQKVWeight( + const paddle::experimental::Tensor& QKVWeight) { + QKVWeight_ = egr::TensorWrapper(QKVWeight, false); + } + void SetTensorWrapperQuery(const paddle::experimental::Tensor& Query) { + Query_ = egr::TensorWrapper(Query, false); + } + void SetTensorWrapperSoftmaxOut( + const paddle::experimental::Tensor& SoftmaxOut) { + SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false); + } + void SetTensorWrapperKey(const paddle::experimental::Tensor& Key) { + Key_ = egr::TensorWrapper(Key, false); + } + void SetTensorWrapperQueryWeight( + const paddle::experimental::Tensor& QueryWeight) { + QueryWeight_ = egr::TensorWrapper(QueryWeight, false); + } + void SetTensorWrapperKeyWeight( + const paddle::experimental::Tensor& KeyWeight) { + KeyWeight_ = egr::TensorWrapper(KeyWeight, false); + } + void SetTensorWrapperValueWeight( + const paddle::experimental::Tensor& ValueWeight) { + ValueWeight_ = egr::TensorWrapper(ValueWeight, false); + } + void SetTensorWrapperQueryTransposeOut( + const paddle::experimental::Tensor& QueryTransposeOut) { + QueryTransposeOut_ = egr::TensorWrapper(QueryTransposeOut, false); + } + void SetTensorWrapperKeyTransposeOut( + const paddle::experimental::Tensor& KeyTransposeOut) { + KeyTransposeOut_ = egr::TensorWrapper(KeyTransposeOut, false); + } + void SetTensorWrapperValueTransposeOut( + const paddle::experimental::Tensor& ValueTransposeOut) { + ValueTransposeOut_ = egr::TensorWrapper(ValueTransposeOut, false); + } + + // SetAttrMap + void SetAttrMap(paddle::framework::AttributeMap&& attr_map) { + attr_map_ = std::move(attr_map); + } + void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) { + default_attr_map_ = std::move(default_attr_map); + } + + private: + // TensorWrappers + egr::TensorWrapper FMHAOut_; + egr::TensorWrapper GateBias_; + egr::TensorWrapper GateOut_; + egr::TensorWrapper GateWeight_; + egr::TensorWrapper NonbatchedBias_; + egr::TensorWrapper OutLinearBias_; + egr::TensorWrapper OutLinearWeight_; + egr::TensorWrapper QKVTransposeOut_; + egr::TensorWrapper QKVWeight_; + egr::TensorWrapper Query_; + egr::TensorWrapper SoftmaxOut_; + + egr::TensorWrapper Key_; + egr::TensorWrapper QueryWeight_; + egr::TensorWrapper KeyWeight_; + egr::TensorWrapper ValueWeight_; + egr::TensorWrapper QueryTransposeOut_; + egr::TensorWrapper KeyTransposeOut_; 
+ egr::TensorWrapper ValueTransposeOut_; + + // Attribute Map + paddle::framework::AttributeMap attr_map_; + paddle::framework::AttributeMap default_attr_map_; +}; + +class fused_feedforwardGradNodeCompat : public egr::GradNodeBase { + public: + fused_feedforwardGradNodeCompat() : egr::GradNodeBase() { + VLOG(7) << " Construct fused_feedforwardGradNodeCompat "; + } + fused_feedforwardGradNodeCompat(size_t bwd_in_slot_num, + size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { + VLOG(7) << " Construct fused_feedforwardGradNodeCompat "; + } + ~fused_feedforwardGradNodeCompat() override { + VLOG(6) << " Destruct fused_feedforwardGradNodeCompat "; + } + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; + + void ClearTensorWrappers() override { + Dropout1Mask_.clear(); + Dropout1Out_.clear(); + Dropout2Mask_.clear(); + Dropout2Out_.clear(); + Linear1Bias_.clear(); + Linear1Out_.clear(); + Linear1Weight_.clear(); + Linear2Bias_.clear(); + Linear2Weight_.clear(); + Ln2Bias_.clear(); + Ln2Mean_.clear(); + Ln2Scale_.clear(); + Ln2Variance_.clear(); + X_.clear(); + + SetIsTensorWrappersCleared(true); + } + std::string name() override { return "fused_feedforwardGradNodeCompat"; } + + std::shared_ptr Copy() const override { + { + auto copied_node = std::shared_ptr( + new fused_feedforwardGradNodeCompat(*this)); + return copied_node; + } + } + + // SetX, SetY, ... + void SetTensorWrapperDropout1Mask( + const paddle::experimental::Tensor& Dropout1Mask) { + Dropout1Mask_ = egr::TensorWrapper(Dropout1Mask, false); + } + void SetTensorWrapperDropout1Out( + const paddle::experimental::Tensor& Dropout1Out) { + Dropout1Out_ = egr::TensorWrapper(Dropout1Out, false); + } + void SetTensorWrapperDropout2Mask( + const paddle::experimental::Tensor& Dropout2Mask) { + Dropout2Mask_ = egr::TensorWrapper(Dropout2Mask, false); + } + void SetTensorWrapperDropout2Out( + const paddle::experimental::Tensor& Dropout2Out) { + Dropout2Out_ = egr::TensorWrapper(Dropout2Out, false); + } + void SetTensorWrapperLinear1Bias( + const paddle::experimental::Tensor& Linear1Bias) { + Linear1Bias_ = egr::TensorWrapper(Linear1Bias, false); + } + void SetTensorWrapperLinear1Out( + const paddle::experimental::Tensor& Linear1Out) { + Linear1Out_ = egr::TensorWrapper(Linear1Out, false); + } + void SetTensorWrapperLinear1Weight( + const paddle::experimental::Tensor& Linear1Weight) { + Linear1Weight_ = egr::TensorWrapper(Linear1Weight, false); + } + void SetTensorWrapperLinear2Bias( + const paddle::experimental::Tensor& Linear2Bias) { + Linear2Bias_ = egr::TensorWrapper(Linear2Bias, false); + } + void SetTensorWrapperLinear2Weight( + const paddle::experimental::Tensor& Linear2Weight) { + Linear2Weight_ = egr::TensorWrapper(Linear2Weight, false); + } + void SetTensorWrapperLn2Bias(const paddle::experimental::Tensor& Ln2Bias) { + Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false); + } + void SetTensorWrapperLn2Mean(const paddle::experimental::Tensor& Ln2Mean) { + Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false); + } + void SetTensorWrapperLn2Scale(const paddle::experimental::Tensor& Ln2Scale) { + Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false); + } + void SetTensorWrapperLn2Variance( + const paddle::experimental::Tensor& Ln2Variance) { + Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false); + } + void SetTensorWrapperX(const 
paddle::experimental::Tensor& X) { + X_ = egr::TensorWrapper(X, false); + } + void SetTensorWrapperLn1Scale(const paddle::experimental::Tensor& Ln1Scale) { + Ln1Scale_ = egr::TensorWrapper(Ln1Scale, false); + } + void SetTensorWrapperLn1Bias(const paddle::experimental::Tensor& Ln1Bias) { + Ln1Bias_ = egr::TensorWrapper(Ln1Bias, false); + } + void SetTensorWrapperLn1Out(const paddle::experimental::Tensor& Ln1Out) { + Ln1Out_ = egr::TensorWrapper(Ln1Out, false); + } + void SetTensorWrapperLn1Mean(const paddle::experimental::Tensor& Ln1Mean) { + Ln1Mean_ = egr::TensorWrapper(Ln1Mean, false); + } + void SetTensorWrapperLn1Variance( + const paddle::experimental::Tensor& Ln1Variance) { + Ln1Variance_ = egr::TensorWrapper(Ln1Variance, false); + } + // SetAttrMap + void SetAttrMap(paddle::framework::AttributeMap&& attr_map) { + attr_map_ = std::move(attr_map); + } + void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) { + default_attr_map_ = std::move(default_attr_map); + } + + private: + // TensorWrappers + egr::TensorWrapper Dropout1Mask_; + egr::TensorWrapper Dropout1Out_; + egr::TensorWrapper Dropout2Mask_; + egr::TensorWrapper Dropout2Out_; + egr::TensorWrapper Linear1Bias_; + egr::TensorWrapper Linear1Out_; + egr::TensorWrapper Linear1Weight_; + egr::TensorWrapper Linear2Bias_; + egr::TensorWrapper Linear2Weight_; + egr::TensorWrapper Ln2Bias_; + egr::TensorWrapper Ln2Mean_; + egr::TensorWrapper Ln2Scale_; + egr::TensorWrapper Ln2Variance_; + egr::TensorWrapper X_; + + egr::TensorWrapper Ln1Scale_; + egr::TensorWrapper Ln1Bias_; + egr::TensorWrapper Ln1Out_; + egr::TensorWrapper Ln1Mean_; + egr::TensorWrapper Ln1Variance_; + + // Attribute Map + paddle::framework::AttributeMap attr_map_; + paddle::framework::AttributeMap default_attr_map_; +}; + +class fused_attentionGradNodeCompat : public egr::GradNodeBase { + public: + fused_attentionGradNodeCompat() : egr::GradNodeBase() { + VLOG(7) << " Construct fused_attentionGradNodeCompat "; + } + fused_attentionGradNodeCompat(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { + VLOG(7) << " Construct fused_attentionGradNodeCompat "; + } + ~fused_attentionGradNodeCompat() override { + VLOG(6) << " Destruct fused_attentionGradNodeCompat "; + } + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; + + void ClearTensorWrappers() override { + AttnDropoutMaskOut_.clear(); + AttnDropoutOut_.clear(); + BiasDropoutResidualOut_.clear(); + DropoutMaskOut_.clear(); + FMHAOut_.clear(); + Ln2Bias_.clear(); + Ln2Mean_.clear(); + Ln2Scale_.clear(); + Ln2Variance_.clear(); + OutLinearBias_.clear(); + OutLinearOut_.clear(); + OutLinearW_.clear(); + QKOut_.clear(); + QKTVOut_.clear(); + QKVBias_.clear(); + QKVBiasOut_.clear(); + QKVOut_.clear(); + QKVW_.clear(); + SoftmaxOut_.clear(); + SrcMask_.clear(); + SrcMaskOut_.clear(); + TransposeOut2_.clear(); + X_.clear(); + + SetIsTensorWrappersCleared(true); + } + std::string name() override { return "fused_attentionGradNodeCompat"; } + + std::shared_ptr Copy() const override { + { + auto copied_node = std::shared_ptr( + new fused_attentionGradNodeCompat(*this)); + return copied_node; + } + } + + // SetX, SetY, ... 
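  // The backward bodies above read their controlling flags from attr_map_
  // with a hard-coded fallback: pre_layer_norm defaults to false, while
  // merge_qkv and has_gating default to true. A condensed sketch of that
  // lookup, assuming a plain bool map in place of
  // paddle::framework::AttributeMap (illustrative only):
  //
  //   bool GetBoolAttrOr(const std::map<std::string, bool>& attrs,
  //                      const std::string& name, bool default_value) {
  //     auto it = attrs.find(name);
  //     return it == attrs.end() ? default_value : it->second;
  //   }
  //   bool pre_layer_norm = GetBoolAttrOr(attrs, "pre_layer_norm", false);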
+ void SetTensorWrapperAttnDropoutMaskOut( + const paddle::experimental::Tensor& AttnDropoutMaskOut) { + AttnDropoutMaskOut_ = egr::TensorWrapper(AttnDropoutMaskOut, false); + } + void SetTensorWrapperAttnDropoutOut( + const paddle::experimental::Tensor& AttnDropoutOut) { + AttnDropoutOut_ = egr::TensorWrapper(AttnDropoutOut, false); + } + void SetTensorWrapperBiasDropoutResidualOut( + const paddle::experimental::Tensor& BiasDropoutResidualOut) { + BiasDropoutResidualOut_ = egr::TensorWrapper(BiasDropoutResidualOut, false); + } + void SetTensorWrapperDropoutMaskOut( + const paddle::experimental::Tensor& DropoutMaskOut) { + DropoutMaskOut_ = egr::TensorWrapper(DropoutMaskOut, false); + } + void SetTensorWrapperFMHAOut(const paddle::experimental::Tensor& FMHAOut) { + FMHAOut_ = egr::TensorWrapper(FMHAOut, false); + } + void SetTensorWrapperLn2Bias(const paddle::experimental::Tensor& Ln2Bias) { + Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false); + } + void SetTensorWrapperLn2Mean(const paddle::experimental::Tensor& Ln2Mean) { + Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false); + } + void SetTensorWrapperLn2Scale(const paddle::experimental::Tensor& Ln2Scale) { + Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false); + } + void SetTensorWrapperLn2Variance( + const paddle::experimental::Tensor& Ln2Variance) { + Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false); + } + void SetTensorWrapperOutLinearBias( + const paddle::experimental::Tensor& OutLinearBias) { + OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false); + } + void SetTensorWrapperOutLinearOut( + const paddle::experimental::Tensor& OutLinearOut) { + OutLinearOut_ = egr::TensorWrapper(OutLinearOut, false); + } + void SetTensorWrapperOutLinearW( + const paddle::experimental::Tensor& OutLinearW) { + OutLinearW_ = egr::TensorWrapper(OutLinearW, false); + } + void SetTensorWrapperQKOut(const paddle::experimental::Tensor& QKOut) { + QKOut_ = egr::TensorWrapper(QKOut, false); + } + void SetTensorWrapperQKTVOut(const paddle::experimental::Tensor& QKTVOut) { + QKTVOut_ = egr::TensorWrapper(QKTVOut, false); + } + void SetTensorWrapperQKVBias(const paddle::experimental::Tensor& QKVBias) { + QKVBias_ = egr::TensorWrapper(QKVBias, false); + } + void SetTensorWrapperQKVBiasOut( + const paddle::experimental::Tensor& QKVBiasOut) { + QKVBiasOut_ = egr::TensorWrapper(QKVBiasOut, false); + } + void SetTensorWrapperQKVOut(const paddle::experimental::Tensor& QKVOut) { + QKVOut_ = egr::TensorWrapper(QKVOut, false); + } + void SetTensorWrapperQKVW(const paddle::experimental::Tensor& QKVW) { + QKVW_ = egr::TensorWrapper(QKVW, false); + } + void SetTensorWrapperSoftmaxOut( + const paddle::experimental::Tensor& SoftmaxOut) { + SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false); + } + void SetTensorWrapperSrcMask(const paddle::experimental::Tensor& SrcMask) { + SrcMask_ = egr::TensorWrapper(SrcMask, false); + } + void SetTensorWrapperSrcMaskOut( + const paddle::experimental::Tensor& SrcMaskOut) { + SrcMaskOut_ = egr::TensorWrapper(SrcMaskOut, false); + } + void SetTensorWrapperTransposeOut2( + const paddle::experimental::Tensor& TransposeOut2) { + TransposeOut2_ = egr::TensorWrapper(TransposeOut2, false); + } + void SetTensorWrapperX(const paddle::experimental::Tensor& X) { + X_ = egr::TensorWrapper(X, false); + } + void SetTensorWrapperLnScale(const paddle::experimental::Tensor& LnScale) { + LnScale_ = egr::TensorWrapper(LnScale, false); + } + void SetTensorWrapperLnBias(const paddle::experimental::Tensor& LnBias) { + LnBias_ = egr::TensorWrapper(LnBias, false); + } 
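  // The LnScale_/LnBias_ wrappers above (and LnOut_ below) back the
  // pre_layer_norm == true path: operator() recovers them inside its
  // `if (pre_layer_norm)` branch and falls back to the Ln2* wrappers in the
  // `else` branch. The forward side is expected to guard the capture the same
  // way (assumed call site, not shown in this patch):
  //
  //   if (pre_layer_norm) {
  //     grad_node->SetTensorWrapperLnScale(LnScale);
  //     grad_node->SetTensorWrapperLnBias(LnBias);
  //     grad_node->SetTensorWrapperLnOut(LnOut);
  //   } else {
  //     grad_node->SetTensorWrapperLn2Scale(Ln2Scale);
  //     grad_node->SetTensorWrapperLn2Bias(Ln2Bias);
  //   }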
+ void SetTensorWrapperLnOut(const paddle::experimental::Tensor& LnOut) { + LnOut_ = egr::TensorWrapper(LnOut, false); + } + void SetTensorWrapperLnMean(const paddle::experimental::Tensor& LnMean) { + LnMean_ = egr::TensorWrapper(LnMean, false); + } + void SetTensorWrapperLnVariance( + const paddle::experimental::Tensor& LnVariance) { + LnVariance_ = egr::TensorWrapper(LnVariance, false); + } + + // SetAttrMap + void SetAttrMap(paddle::framework::AttributeMap&& attr_map) { + attr_map_ = std::move(attr_map); + } + void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) { + default_attr_map_ = std::move(default_attr_map); + } + + private: + // TensorWrappers + egr::TensorWrapper AttnDropoutMaskOut_; + egr::TensorWrapper AttnDropoutOut_; + egr::TensorWrapper BiasDropoutResidualOut_; + egr::TensorWrapper DropoutMaskOut_; + egr::TensorWrapper FMHAOut_; + egr::TensorWrapper Ln2Bias_; + egr::TensorWrapper Ln2Mean_; + egr::TensorWrapper Ln2Scale_; + egr::TensorWrapper Ln2Variance_; + egr::TensorWrapper OutLinearBias_; + egr::TensorWrapper OutLinearOut_; + egr::TensorWrapper OutLinearW_; + egr::TensorWrapper QKOut_; + egr::TensorWrapper QKTVOut_; + egr::TensorWrapper QKVBias_; + egr::TensorWrapper QKVBiasOut_; + egr::TensorWrapper QKVOut_; + egr::TensorWrapper QKVW_; + egr::TensorWrapper SoftmaxOut_; + egr::TensorWrapper SrcMask_; + egr::TensorWrapper SrcMaskOut_; + egr::TensorWrapper TransposeOut2_; + egr::TensorWrapper X_; + + egr::TensorWrapper LnScale_; + egr::TensorWrapper LnBias_; + egr::TensorWrapper LnOut_; + egr::TensorWrapper LnMean_; + egr::TensorWrapper LnVariance_; + + // Attribute Map + paddle::framework::AttributeMap attr_map_; + paddle::framework::AttributeMap default_attr_map_; +}; diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index ecfb40e947f91..162801c716962 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -26,36 +26,14 @@ endif() message( "Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated" ) + +set(CODE_GEN_SPLIT_FILE_COUNT "8") + execute_process( COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generate_file_structures.py" - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/") - -set(tmp_dygraph_forward_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h" -) -set(tmp_dygraph_forward_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.tmp.cc" -) -set(tmp_dygraph_node_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h" -) -set(tmp_dygraph_node_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.cc" -) -set(dygraph_forward_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" -) -set(dygraph_forward_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.cc" -) -set(dygraph_node_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h" -) -set(dygraph_node_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.cc" -) + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/" "${CODE_GEN_SPLIT_FILE_COUNT}") if(WIN32) set(EAGER_CODEGEN_DEPS eager_generator) @@ 
-114,22 +92,7 @@ if(WIN32) COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} - ${dygraph_forward_h_path} - COMMENT - "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} - ${dygraph_forward_cc_path} - COMMENT - "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} - ${dygraph_node_h_path} - COMMENT - "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} - ${dygraph_node_cc_path} - COMMENT - "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" + "${CODE_GEN_SPLIT_FILE_COUNT}" DEPENDS ${EAGER_CODEGEN_DEPS} VERBATIM) else() @@ -140,22 +103,7 @@ else() "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${CMAKE_CURRENT_BINARY_DIR}/../../pybind" "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} - ${dygraph_forward_h_path} - COMMENT - "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} - ${dygraph_forward_cc_path} - COMMENT - "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} - ${dygraph_node_h_path} - COMMENT - "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} - ${dygraph_node_cc_path} - COMMENT - "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" + "${CODE_GEN_SPLIT_FILE_COUNT}" DEPENDS eager_generator VERBATIM) endif() diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 6910f9e537fc8..54b40c72d0215 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -51,7 +51,10 @@ static std::unordered_set ops_to_fill_zero_for_empty_grads = { "split", "rnn"}; /* --- Black Ops list that's NO NEED to apply code generation --- */ -static std::unordered_set black_ops_list = {"run_program"}; +static std::unordered_set black_ops_list = {"run_program", + "fused_gate_attention", + "fused_feedforward", + "fused_attention"}; static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; @@ -2972,7 +2975,10 @@ static std::string GenerateDygraphHFileIncludes() { "#include \"paddle/phi/api/all.h\"\n" "#include \"paddle/fluid/eager/utils.h\"\n" "#include \"paddle/fluid/imperative/tracer.h\"\n" - "#include \"paddle/fluid/framework/op_registry.h\"\n\n"; + "#include \"paddle/fluid/framework/op_registry.h\"\n" + "#include " + "\"paddle/fluid/eager/api/manual/fluid_manual/" + "dygraph_forward_api.h\"\n\n"; dygraph_forward_api_includes_str += "extern std::unordered_map> " @@ -3021,7 +3027,10 @@ static void GenerateNodeHFile(const std::string& node_h_path, "#pragma once\n" "#include \"paddle/fluid/eager/tensor_wrapper.h\"\n" "#include \"paddle/fluid/imperative/tracer.h\"\n" - "#include 
\"paddle/fluid/eager/grad_node_info.h\"\n\n"; + "#include \"paddle/fluid/eager/grad_node_info.h\"\n" + "#include " + "\"paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h\"\n\n"; + std::ofstream node_h_stream(node_h_path, std::ios::out); node_h_stream << node_h_include_str; node_h_stream << grad_node_str; @@ -3074,32 +3083,50 @@ static std::string ConvertCoreOpsInfosToString( return core_ops_returns_info_init_str; } -static std::string GenerateCoreOpsReturnsInfo() { +static std::string GenerateCoreOpsArgsInfo() { const char* Core_Ops_Returns_MAP_TEMPLATE = "std::unordered_map> " - "core_ops_args_info = { %s };\n" - "std::unordered_map> " - "core_ops_args_type_info = { %s };\n" - "std::unordered_map> " - "core_ops_returns_info = { %s };\n"; + "core_ops_args_info = { %s };\n"; std::string core_ops_args_info_init_str = ConvertCoreOpsInfosToString(core_ops_args_info); + + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str); + + return core_ops_info_str; +} + +static std::string GenerateCoreOpsArgsTypeInfo() { + const char* Core_Ops_Returns_MAP_TEMPLATE = + "std::unordered_map> " + "core_ops_args_type_info = { %s };\n"; + std::string core_ops_args_type_info_init_str = ConvertCoreOpsInfosToString(core_ops_args_type_info); + + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_type_info_init_str); + + return core_ops_info_str; +} + +static std::string GenerateCoreOpsReturnsInfo() { + const char* Core_Ops_Returns_MAP_TEMPLATE = + "std::unordered_map> " + "core_ops_returns_info = { %s };\n"; + std::string core_ops_returns_info_init_str = ConvertCoreOpsInfosToString(core_ops_returns_info); - std::string core_ops_info_str = - paddle::string::Sprintf(Core_Ops_Returns_MAP_TEMPLATE, - core_ops_args_info_init_str, - core_ops_args_type_info_init_str, - core_ops_returns_info_init_str); + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_returns_info_init_str); return core_ops_info_str; } -static void DygraphCodeGeneration(const std::string& output_dir) { +static void DygraphCodeGeneration(const std::string& output_dir, + int split_count) { std::string dygraph_forward_api_str = GenerateDygraphHFileIncludes(); std::string fwd_function_str = ""; std::string grad_node_h_str = ""; @@ -3107,6 +3134,8 @@ static void DygraphCodeGeneration(const std::string& output_dir) { auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); + paddle::flat_hash_map op_info_map_need_gen; + for (auto& pair : op_info_map) { const OpInfo& op_info = pair.second; proto::OpProto* op_proto = op_info.proto_; @@ -3117,6 +3146,31 @@ static void DygraphCodeGeneration(const std::string& output_dir) { continue; } + GradNodeGenerationInfo bwd_info; + + bool is_available = CollectGradInformationFromOpInfo(op_info, &bwd_info); + + if (!is_available && !bwd_info.GenerateForwardOnly()) { + VLOG(6) << "Skipped operator: " << op_type; + continue; + } + + op_info_map_need_gen.emplace(pair); + } + + int each_cc_file_api_size = op_info_map_need_gen.size() / split_count; + if (op_info_map_need_gen.size() % split_count != 0) { + each_cc_file_api_size++; + } + int api_index = 0; + int file_index = 0; + + for (auto& pair : op_info_map_need_gen) { + const OpInfo& op_info = pair.second; + proto::OpProto* op_proto = op_info.proto_; + + const std::string& op_type = op_proto->type(); + /* ----------------------------- */ /* ---- Collect Information ---- */ /* 
----------------------------- */ @@ -3128,12 +3182,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) { CollectForwardInformationFromOpInfo(op_info, &fwd_info); - bool is_available = CollectGradInformationFromOpInfo(op_info, &bwd_info); - - if (!is_available && !bwd_info.GenerateForwardOnly()) { - VLOG(6) << "Skipped operator: " << op_type; - continue; - } + CollectGradInformationFromOpInfo(op_info, &bwd_info); VLOG(6) << "-------- PurifyOpProto -------"; PurifyForwardOpProto(*op_proto, &fwd_info); @@ -3179,25 +3228,60 @@ static void DygraphCodeGeneration(const std::string& output_dir) { dygraph_forward_api_str += inplace_fwd_function_declare_str; } - if (bwd_info.GenerateForwardOnly()) continue; - - VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; - grad_node_h_str += GenerateGradNodeHeaderContents(fwd_info, bwd_info); - grad_node_h_str += "\n"; + if (!bwd_info.GenerateForwardOnly()) { + VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; + grad_node_h_str += GenerateGradNodeHeaderContents(fwd_info, bwd_info); + grad_node_h_str += "\n"; - VLOG(6) << "-------- GenerateGradNodeCCContents -------"; - grad_node_cc_str += GenerateGradNodeCCContents(fwd_info, bwd_info); - grad_node_cc_str += "\n"; + VLOG(6) << "-------- GenerateGradNodeCCContents -------"; + grad_node_cc_str += GenerateGradNodeCCContents(fwd_info, bwd_info); + grad_node_cc_str += "\n"; + } VLOG(6) << op_type << ": Finished Generating Op: " << op_type; + + api_index++; + if (api_index / each_cc_file_api_size > file_index) { + file_index++; + VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; + std::string forward_cc_path = output_dir + + "/forwards/dygraph_forward_functions" + + std::to_string(file_index) + ".tmp.cc"; + fwd_function_str += "\n"; + GenerateForwardDygraphFile(forward_cc_path, fwd_function_str); + fwd_function_str = ""; + + VLOG(6) << "-------- GenerateNodeCCFile -------"; + std::string node_cc_path = + output_dir + "/nodes/nodes" + std::to_string(file_index) + ".tmp.cc"; + GenerateNodeCCFile(node_cc_path, grad_node_cc_str); + grad_node_cc_str = ""; + } } + file_index++; VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; - std::string forward_cc_path = - output_dir + "/forwards/dygraph_forward_functions.tmp.cc"; - fwd_function_str += "\n"; - fwd_function_str += GenerateCoreOpsReturnsInfo(); + std::string forward_cc_path = output_dir + + "/forwards/dygraph_forward_functions" + + std::to_string(file_index) + ".tmp.cc"; GenerateForwardDygraphFile(forward_cc_path, fwd_function_str); + fwd_function_str = ""; + + GenerateForwardDygraphFile( + output_dir + "/forwards/dygraph_forward_functions_args_info.tmp.cc", + GenerateCoreOpsArgsInfo()); + GenerateForwardDygraphFile( + output_dir + "/forwards/dygraph_forward_functions_args_type_info.tmp.cc", + GenerateCoreOpsArgsTypeInfo()); + GenerateForwardDygraphFile( + output_dir + "/forwards/dygraph_forward_functions_returns_info.tmp.cc", + GenerateCoreOpsReturnsInfo()); + + VLOG(6) << "-------- GenerateNodeCCFile -------"; + std::string node_cc_path = + output_dir + "/nodes/nodes" + std::to_string(file_index) + ".tmp.cc"; + GenerateNodeCCFile(node_cc_path, grad_node_cc_str); + grad_node_cc_str = ""; VLOG(6) << "-------- GenerateForwardHFile -------"; std::string dygraph_forward_api_path = @@ -3207,26 +3291,23 @@ static void DygraphCodeGeneration(const std::string& output_dir) { VLOG(6) << "-------- GenerateNodeHFile -------"; std::string node_h_path = output_dir + "/nodes/nodes.tmp.h"; GenerateNodeHFile(node_h_path, 
grad_node_h_str); - - VLOG(6) << "-------- GenerateNodeCCFile -------"; - std::string node_cc_path = output_dir + "/nodes/nodes.tmp.cc"; - GenerateNodeCCFile(node_cc_path, grad_node_cc_str); } } // namespace framework } // namespace paddle int main(int argc, char* argv[]) { - if (argc != 2) { - std::cerr << "argc must be 2" << std::endl; + if (argc != 3) { + std::cerr << "argc must be 3" << std::endl; return -1; } std::string eager_root = argv[1]; + int split_count = atoi(argv[2]); paddle::framework::PrepareAttrMapForOps(); - paddle::framework::DygraphCodeGeneration(eager_root); + paddle::framework::DygraphCodeGeneration(eager_root, split_count); return 0; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 2a7e9b1cde181..ce1e81dd971ad 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,8 +1,8 @@ set(api_yaml_path - "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/legacy_api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml" + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/api.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_api.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_api.yaml" ) set(backward_yaml_path - "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/legacy_backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml" + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_bw_api.yaml" ) set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc" @@ -30,7 +30,7 @@ set(nodes_h_path ) # StringTensor only needs forward api set(fwd_api_yaml_path - "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml") + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/strings_api.yaml") message("Final State Eager CodeGen") add_custom_target( @@ -54,11 +54,10 @@ add_custom_target( VERBATIM) set(tmp_python_c_output_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function.cc.tmp" ) set(python_c_output_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h" -) + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function.cc") add_custom_target( eager_final_state_python_c_codegen diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index dee3b3d79a2e7..79f5da4bec79e 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -32,7 +32,7 @@ "square_double_grad", "celu_double_grad", "pad_double_grad", "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad", "instance_norm_double_grad", "conv3d_double_grad", - "depthwise_conv2d_grad_grad" + "depthwise_conv2d_grad_grad", "concat_double_grad" ]) # For API dispatch used at python-level @@ -45,6 +45,7 @@ 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 
'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'str' : 'std::string', \ + 'str[]' : 'std::vector', 'float[]' : 'std::vector', \ 'Place' : 'paddle::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d406f00b25039..a3beb268cfafb 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -40,6 +40,8 @@ # keeping the code compatible, here we also skip inplace check in new dygraph temporarily, # and this will be fixed in the futrue. inplace_check_blacklist = set(["assign_out_"]) +# # --- Black Ops list that's NO NEED to apply backward code generation +black_ops_list = ["conv2d", "conv2d_grad", "conv2d_grad_grad", "add_n"] ########### @@ -154,9 +156,7 @@ class {} : public egr::GradNodeBase {{ {} // Prepare Grad function call {} - // Get GradIn autograd_meta -{} - // Compute Require Grad + // Runtime check if we need next grad {} // Inplace Check {} @@ -229,6 +229,27 @@ class {} : public egr::GradNodeBase {{ }} """ +HIHGER_ORDER_DERIVATIVE_VALUE_TEMPLATE = \ +""" if(trace_backward) {{ +{} + // Node Construction +{} + // SetAttributes if needed +{} + // Set TensorWrappers for Forward Inputs if needed +{} + // SetGradOutMeta & SetEdges +{} + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad +{} +{} +{} +{} + // Set TensorWrappers for Forward Outputs if needed +{} + }} +""" + NAMESPACE_WRAPPER_TEMPLATE = \ """ namespace {} {{ @@ -252,7 +273,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/phi/api/include/sparse_api.h" - +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" DECLARE_bool(check_nan_inf); {} """ @@ -262,6 +283,7 @@ class {} : public egr::GradNodeBase {{ #pragma once #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" {} """ @@ -279,7 +301,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/fluid/eager/nan_inf_utils.h" - +#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" DECLARE_bool(check_nan_inf); {} {} @@ -294,6 +316,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/eager/to_static/run_program_op_func.h" +#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" {} {} @@ -584,7 +607,6 @@ def CollectBackwardInfo(self): self.backward_api_name = forward_api_contents['backward'] self.backward_forward_str = grad_api_contents['forward'] - backward_args_str = grad_api_contents['args'] backward_returns_str = grad_api_contents['output'] @@ -663,7 +685,7 @@ def SlotNameMatching(self): backward_output_pos ] - def GenerateNodeCreationCodes(self): + def GenerateNodeCreationCodes(self, for_backward=False): forward_api_name = self.forward_api_name forward_inputs_position_map = self.forward_inputs_position_map forward_outputs_position_map = self.forward_outputs_position_map @@ -794,13 +816,21 @@ def 
GenerateNodeCreationCodes(self): node_event_name = forward_api_name + " node_creation" node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::OperatorInner, 1);\n" + if not for_backward: + self.node_creation_str = FORWARD_BODY_TEMPLATE.format( + node_creation_event_str, pass_stop_gradient_args_str, + node_construction_str, set_attributes_str, + set_input_tensor_wrappers_str, set_grad_out_meta_str, + set_out_rank_str, set_history_str, set_grad_in_meta_str, + set_retain_grad_str, set_output_tensor_wrappers_str) + else: + self.node_creation_str = HIHGER_ORDER_DERIVATIVE_VALUE_TEMPLATE.format( + node_creation_event_str, node_construction_str, + set_attributes_str, set_input_tensor_wrappers_str, + set_grad_out_meta_str, set_out_rank_str, set_history_str, + set_grad_in_meta_str, set_retain_grad_str, + set_output_tensor_wrappers_str) - self.node_creation_str = FORWARD_BODY_TEMPLATE.format( - node_creation_event_str, pass_stop_gradient_args_str, - node_construction_str, set_attributes_str, - set_input_tensor_wrappers_str, set_grad_out_meta_str, - set_out_rank_str, set_history_str, set_grad_in_meta_str, - set_retain_grad_str, set_output_tensor_wrappers_str) self.grad_node_out_list = grad_node_out_list def run(self): @@ -1234,7 +1264,7 @@ def GenerateHigherOrderNodeCreationCode(self): next_node_generator = DygraphFunctionGeneratorBase( forward_api_contents, backward_api_contents, namespace) next_node_generator.run() - next_node_generator.GenerateNodeCreationCodes() + next_node_generator.GenerateNodeCreationCodes(True) next_grad_node_creation_str = next_node_generator.node_creation_str next_grad_node_out_list = next_node_generator.grad_node_out_list @@ -1342,6 +1372,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, inplace_grad_input_str = "" inplaced_tensor_wrapper = False inplace_check_str = "" + optional_inplace_var_name = [] # Grad Ins from TensorWrappers for name, (_, is_fwd_input, grad_api_position), in backward_forward_inputs_map.items(): @@ -1351,6 +1382,13 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, is_optional = (name in self.optional_inputs) tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});" if backward_inplace_map and name in backward_inplace_map.keys(): + if len(next_grad_node_creation_str) > 0: + if (transformed_tensor_name + in backward_forward_inputs_map_next) and ( + backward_forward_inputs_map_next[ + transformed_tensor_name][1]): + optional_inplace_var_name.append( + transformed_tensor_name) tensor_wrapper_intermidiate_tensor_str = f"(&this->{tensor_wrapper_name})->get_intermidiate_tensor()" inplace_check_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, name, @@ -1371,7 +1409,6 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, get_grad_in_args_list.append(tensor_wrapper_recover_str) - optional_inplace_check = False # Grad Ins from grads for name, (ttype, fwd_position, grad_api_position) in backward_grad_inputs_map.items(): @@ -1388,7 +1425,8 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, in backward_forward_inputs_map_next) and ( backward_forward_inputs_map_next[ transformed_tensor_name][1]): - optional_inplace_check = False + optional_inplace_var_name.append( + transformed_tensor_name) grads_tensor_str = f"grads[{fwd_position}][0]" inplace_check_str += 
CHECK_BACKWARD_INPLACE_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, name, @@ -1441,25 +1479,25 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, transformed_tensor_name = self.TransformToNextGradName(name) out_index = out_index + 1 grad_api_args.append(f"api_output_{out_index}") - if not optional_inplace_check: - optional_inplace_str = "VLOG(6) << \"No Inplace should happend for wrappered input\";" + if inplace_grad_input_str in optional_inplace_var_name: + optional_inplace_str = "VLOG(6) << \"No Inplace should happend for wrappered input: {inplace_grad_input_str}\";" else: optional_inplace_str = f"""if (api_output_{out_index} != nullptr && can_be_inplaced) {{ - egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); - }}""" + egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); + }}""" if IsPlainTensorType(ttype): if backward_inplace_map and name in backward_inplace_map.values( ): - inplace_str = f"""if (api_output_{out_index} != nullptr && can_be_inplaced) {{ - egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); - }}""" + inplace_str = f""" if (api_output_{out_index} != nullptr && can_be_inplaced) {{ + egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); + }}""" if len(next_grad_node_creation_str) > 0: inplace_for_grad_outs_str += f""" - if (!require_any_grad) {{ - {inplace_str} - }}else{{ + if (trace_backward) {{ {optional_inplace_str} + }} else {{ + {inplace_str} }}""" else: inplace_for_grad_outs_str += inplace_str @@ -1490,84 +1528,53 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, backward_api_name, "returns") # Prepare for Node Creation if Necessary - inputs_autograd_meta_str = "" outputs_autograd_meta_str = "" - compute_require_grad_str = "" + compute_require_next_grad_str = "" if len(next_grad_node_creation_str) > 0: - # 1. Get Grad Input AutoGradMeta - inputs_autograd_meta_list = [] - compute_require_grad_args_list = ["trace_backward"] - for name, (ttype, pos, - grad_api_position) in backward_grad_inputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - if transformed_tensor_name in next_grad_node_out_list: - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( - transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append( - input_autograd_meta_name) - - # 2. 
Get TensorWrapper AutoGradMeta - for name, (ttype, _, pos), in backward_forward_inputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - if transformed_tensor_name in next_grad_node_out_list: - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( - transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append( - input_autograd_meta_name) - - inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) - compute_require_grad_args_str = ",".join( - compute_require_grad_args_list) - - # 3. Get Output AutoGradMeta - outputs_autograd_meta_list = [] - num_fwd_outputs = len(backward_grad_outputs_map.keys()) - for name, (rtype, pos, - grad_api_position) in backward_grad_outputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - - output_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - output_autograd_meta_vec_name = GetAutoGradMetaVectorName( - transformed_tensor_name) - if IsPlainTensorType(rtype): - output_autograd_meta = f""" + compute_require_next_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n" + + # 3. Get Output AutoGradMeta + outputs_autograd_meta_list = [] + # TODO(jiabin): Optimize this with SetStopGradient instead of Pass Stop gradient + + num_fwd_outputs = len(backward_grad_outputs_map.keys()) + for name, (rtype, pos, + grad_api_position) in backward_grad_outputs_map.items(): + transformed_tensor_name = self.TransformToNextGradName(name) + + output_autograd_meta_name = GetAutoGradMetaName( + transformed_tensor_name) + output_autograd_meta_vec_name = GetAutoGradMetaVectorName( + transformed_tensor_name) + if IsPlainTensorType(rtype): + output_autograd_meta = f""" auto& {transformed_tensor_name} = returns[{pos}][0]; - egr::AutogradMeta* {output_autograd_meta_name} = returns[{pos}][0].initialized() ? egr::EagerUtils::autograd_meta(&{transformed_tensor_name}) : nullptr;""" + egr::AutogradMeta* {output_autograd_meta_name} = returns[{pos}][0].initialized() ? 
egr::EagerUtils::autograd_meta(&{transformed_tensor_name}) : nullptr; + if ({output_autograd_meta_name}) {output_autograd_meta_name}->SetStopGradient(false); + """ + else: + assert IsVectorTensorType(rtype) + if len(next_grad_node_creation_str) > 0: + output_autograd_meta = f""" + auto& {transformed_tensor_name} = returns[{pos}]; + std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); + std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name}; + for(auto* meta : {output_autograd_meta_vec_name}){{ + meta->SetStopGradient(false); + }} +""" else: - assert IsVectorTensorType(rtype) output_autograd_meta = f""" - auto& {transformed_tensor_name} = returns[{pos}]; - std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); - std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name}; + auto& {transformed_tensor_name} = returns[{pos}]; + std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); + for(auto* meta : {output_autograd_meta_vec_name}){{ + meta->SetStopGradient(false); + }} """ + outputs_autograd_meta_list.append(output_autograd_meta) - outputs_autograd_meta_list.append(output_autograd_meta) - outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) - - compute_require_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n" - compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});" + outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" @@ -1576,11 +1583,10 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( grad_node_name, fill_zero_str, get_grad_in_args_str, - grad_function_prepare_str, inputs_autograd_meta_str, - compute_require_grad_str, inplace_check_str, - inplace_for_grad_outs_str, grad_node_name, grad_function_call_str, - check_nan_inf_str, outputs_autograd_meta_str, - next_grad_node_creation_str, returns_str) + grad_function_prepare_str, compute_require_next_grad_str, + inplace_check_str, inplace_for_grad_outs_str, grad_node_name, + grad_function_call_str, check_nan_inf_str, + outputs_autograd_meta_str, next_grad_node_creation_str, returns_str) def run(self): super().run() @@ -1631,6 +1637,7 @@ def GetBackwardAPIContents(self, forward_api_contents): if 'backward' not in forward_api_contents.keys(): return None backward_api_name = forward_api_contents['backward'] + if backward_api_name in black_ops_list: return None assert backward_api_name in grad_api_dict.keys(), AssertMessage( backward_api_name, grad_api_dict.keys()) backward_api_contents = grad_api_dict[backward_api_name] @@ -1643,10 +1650,12 @@ def GenerateCode(self): namespace = self.namespace for forward_api_contents in forward_api_list: + if forward_api_contents['api'] in black_ops_list: continue + backward_api_contents = self.GetBackwardAPIContents( forward_api_contents) if backward_api_contents is None: continue - + if forward_api_contents['api'] in black_ops_list: continue # Generate Dygraph Forward Function function_generator = DygraphForwardFunctionGenerator( forward_api_contents, backward_api_contents, namespace) diff --git 
a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 66d8e8bfadab2..9d5706f65bdf0 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -50,6 +50,45 @@ def SkipAPIGeneration(forward_api_name): "paddle::experimental::DataType": "CastPyArg2DataType", } +# This list contains ops that do not need to generate amp logic +# All optimizer ops in this list +no_amp_list = [ + 'adam_', + 'adam', + 'adamw_', + 'adamw', + 'decayed_adagrad_', + 'decayed_adagrad', + 'dgc_momentum_', + 'dgc_momentum', + 'distributed_fused_lamb_', + 'distributed_fused_lamb', + 'dpsgd_', + 'dpsgd', + 'ftrl_', + 'ftrl', + 'lamb_', + 'lamb', + 'lars_momentum_', + 'lars_momentum', + 'merged_adam_', + 'merged_adam', + 'merged_momentum_', + 'merged_momentum', + 'momentum_', + 'momentum', + 'proximal_adagrad_', + 'proximal_adagrad', + 'proximal_gd_', + 'proximal_gd', + 'rmsprop_', + 'rmsprop', + 'sgd_', + 'sgd', + 'sparse_momentum_', + 'sparse_momentum', +] + def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): @@ -99,7 +138,7 @@ def FindParsingFunctionFromAttributeType(atype): // Set Device ID {} // Call dygraph function - decltype({}({})) out = {}({}); + {} PyEval_RestoreThread(tstate); tstate = nullptr; @@ -114,6 +153,25 @@ def FindParsingFunctionFromAttributeType(atype): }} """ +NOAMP_DYGRAPH_FUNCTION_TEMPLATE = "decltype({}({})) out = {}({});\n" + +AMP_DYGRAPH_FUNCTION_TEMPLATE = \ +""" + decltype({}({})) out; + // AMP Logic + if (egr::Controller::Instance().GetAMPLevel() != paddle::imperative::AmpLevel::O0) {{ + VLOG(5) << "Check and Prepare For AMP"; + {} + paddle::small_vector, egr::kSlotSmallVectorSize> amp_tensors_vector = {}; + {} + {} + {} + out = {}({}); + }} else {{ + out = {}({}); + }} +""" + FUNCTION_SET_DEVICE_TEMPLATE = \ """{} if (paddle::platform::is_gpu_place(place)) {{ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -122,6 +180,15 @@ def FindParsingFunctionFromAttributeType(atype): #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU if use CUDAPlace.")); +#endif + }} + if (paddle::platform::is_custom_place(place)) {{ +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + phi::DeviceManager::SetDevice(place); + VLOG(1) <<"CurrentDeviceId: " << phi::DeviceManager::GetDevice(place.GetDeviceType()) << " from " << (int)place.device; +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with CUSTOM_DEVICE if use CustomPlace.")); #endif }} """ @@ -139,22 +206,19 @@ def FindParsingFunctionFromAttributeType(atype): PYTHON_C_WRAPPER_TEMPLATE = \ """ -#pragma once - -#include "pybind11/detail/common.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/api/lib/dygraph_api.h" -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/int_array.h" -#include "paddle/phi/api/include/sparse_api.h" -#include "paddle/phi/api/include/strings_api.h" -#include "paddle/fluid/pybind/op_function_common.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include +#include +#include "paddle/fluid/platform/enforce.h" +#include 
"paddle/phi/api/include/strings_api.h" +#include "paddle/phi/backends/device_manager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/pybind/eager_final_state_custom_python_api.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/eager_amp_auto_cast.h" namespace paddle {{ namespace pybind {{ @@ -165,6 +229,16 @@ def FindParsingFunctionFromAttributeType(atype): {} }}; +void BindFinalStateEagerOpFunctions(pybind11::module *module) {{ + if (PyModule_AddFunctions(module->ptr(), EagerFinalStateMethods) < 0) {{ + PADDLE_THROW(platform::errors::Fatal ("Add functions to core.eager.ops failed!")); + }} + + if (PyModule_AddFunctions(module->ptr(), CustomEagerFinalStateMethods) < 0) {{ + PADDLE_THROW(platform::errors::Fatal ("Add functions to core.eager.ops failed!")); + }} +}} + }} // namespace pybind }} // namespace paddle """ @@ -331,11 +405,15 @@ def GeneratePythonCFunction(self): num_args = len( forward_inputs_position_map.keys()) + len(orig_forward_attrs_list) dygraph_function_call_list = ["" for i in range(num_args)] + amp_dygraph_function_call_list = ["" for i in range(num_args)] for name, (_, pos) in forward_inputs_position_map.items(): dygraph_function_call_list[pos] = f"{name}" + amp_dygraph_function_call_list[pos] = f"NEW_{name}" for name, _, _, pos in orig_forward_attrs_list: dygraph_function_call_list[pos] = f"{name}" + amp_dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) + amp_dygraph_function_call_str = ",".join(amp_dygraph_function_call_list) # Generate Python-C Function Definitions if is_forward_only: @@ -351,12 +429,82 @@ def GeneratePythonCFunction(self): pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( "pythonc_record_event", forward_api_name, "pybind_imperative_func") - # Generate Python-C Function Definetion - self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - forward_api_name, pythonc_record_event_str, forward_api_name, - get_eager_tensor_str, parse_attributes_str, set_device_str, + # Forward amp logic + amp_tensors_vector_list = [] + amp_tensors_vector_optional_list = [] + amp_autocast_list = [] + amp_autocast_optional_list = [] + + for name, (ttype, pos) in forward_inputs_position_map.items(): + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + if is_optional: + amp_tensors_vector_optional_list.append( + f"if ({name}.is_initialized()) amp_tensors_vector.push_back({name}.get());\n" + ) + amp_autocast_optional_list.append( + f"auto NEW_{name} = {name}.is_initialized() ? egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name, false) : {name};\n" + ) + else: + amp_tensors_vector_list.append(f"{name}") + amp_autocast_list.append( + f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name, false);\n" + ) + else: + if is_optional: + amp_tensors_vector_optional_list.append( + f"if ({name}.is_initialized()) amp_tensors_vector.push_back({{{name}.get()}});\n" + ) + amp_autocast_optional_list.append( + f"auto NEW_{name} = {name}.is_initialized() ? 
egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name, false) : {name};\n" + ) + else: + if forward_inplace_map and name in forward_inplace_map.keys( + ): + amp_tensors_vector_list.append(f"{{{name}}}") + amp_autocast_list.append( + f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name, false);\n" + ) + else: + amp_tensors_vector_list.append(f"{{{name}}}") + amp_autocast_list.append( + f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name, false);\n" + ) + amp_tensors_vector_list_str = "{ " + ",".join( + amp_tensors_vector_list) + " }" + amp_tensors_vector_optional_list_str = "".join( + amp_tensors_vector_optional_list) + amp_autocast_list_str = " ".join( + amp_autocast_list) + " " + " ".join( + amp_autocast_optional_list) + + kernel_trans2_op_name_str = f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");" + amp_get_dst_dtype_str = f"auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector);\n" + + noamp_dygraph_function_str = NOAMP_DYGRAPH_FUNCTION_TEMPLATE.format( fwd_function_name, dygraph_function_call_str, fwd_function_name, - dygraph_function_call_str, return_str) + dygraph_function_call_str) + + amp_dygraph_function_str = AMP_DYGRAPH_FUNCTION_TEMPLATE.format( + fwd_function_name, dygraph_function_call_str, + kernel_trans2_op_name_str, amp_tensors_vector_list_str, + amp_tensors_vector_optional_list_str, amp_get_dst_dtype_str, + amp_autocast_list_str, fwd_function_name, + amp_dygraph_function_call_str, fwd_function_name, + dygraph_function_call_str) + + # Generate Python-C Function Definetion + if (is_forward_only) and (len(amp_tensors_vector_list) > + 0) and (forward_api_name not in no_amp_list): + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, set_device_str, + amp_dygraph_function_str, return_str) + else: + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, set_device_str, + noamp_dygraph_function_str, return_str) # Set prefix of forward_api_name to avoid conflicts prefix = self.namespace.strip("::") @@ -379,6 +527,18 @@ def GeneratePythonCFunction(self): "::", namespace, GetForwardFunctionName(inplaced_forward_api_name)) + inplace_noamp_dygraph_function_str = NOAMP_DYGRAPH_FUNCTION_TEMPLATE.format( + inplaced_fwd_function_name, dygraph_function_call_str, + inplaced_fwd_function_name, dygraph_function_call_str) + + inplace_amp_dygraph_function_str = AMP_DYGRAPH_FUNCTION_TEMPLATE.format( + inplaced_fwd_function_name, dygraph_function_call_str, + kernel_trans2_op_name_str, amp_tensors_vector_list_str, + amp_tensors_vector_optional_list_str, amp_get_dst_dtype_str, + amp_autocast_list_str, inplaced_fwd_function_name, + amp_dygraph_function_call_str, inplaced_fwd_function_name, + dygraph_function_call_str) + return_str = " std::map inplace_var_idx_map;" for inplace_input, inplace_output in forward_inplace_map.items(): return_str += RETURN_INPLACE_PYOBJECT_TEMPLATE.format( @@ -387,13 +547,19 @@ def GeneratePythonCFunction(self): return_str += " return ToPyObject(out, args, inplace_var_idx_map);" # Generate Python-C Function Definetion - python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( - inplaced_forward_api_name, pythonc_record_event_str, - inplaced_forward_api_name, get_eager_tensor_str, - parse_attributes_str, set_device_str, - 
inplaced_fwd_function_name, dygraph_function_call_str, - inplaced_fwd_function_name, dygraph_function_call_str, - return_str) + if (is_forward_only) and (len(amp_tensors_vector_list) > 0) and ( + inplaced_forward_api_name not in no_amp_list): + python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( + inplaced_forward_api_name, pythonc_record_event_str, + inplaced_forward_api_name, get_eager_tensor_str, + parse_attributes_str, set_device_str, + inplace_amp_dygraph_function_str, return_str) + else: + python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( + inplaced_forward_api_name, pythonc_record_event_str, + inplaced_forward_api_name, get_eager_tensor_str, + parse_attributes_str, set_device_str, + inplace_noamp_dygraph_function_str, return_str) python_c_inplace_func_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( forward_api_name_prefix, inplaced_forward_api_name, namespace, @@ -449,8 +615,8 @@ def __init__(self, path): def GeneratePythonCFunctions(self): namespace = self.namespace - forward_api_list = self.forward_api_list + forward_api_list = self.forward_api_list for forward_api_content in forward_api_list: f_generator = PythonCSingleFunctionGenerator( forward_api_content, namespace) diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index fdb8529515d30..9fbf1ed6cd4a1 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
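The AMP_DYGRAPH_FUNCTION_TEMPLATE above bakes a runtime branch into each generated Python-C wrapper; ops that are not forward-only, have no tensor inputs, or appear in no_amp_list (the optimizer ops) keep the plain NOAMP_DYGRAPH_FUNCTION_TEMPLATE body instead. The following is a rough, self-contained C++ sketch of that control flow only; all types, helpers, and the "matmul" op name are toy stand-ins, not Paddle's real API or the generator's exact output.

// Toy illustration of the generated AMP branch; nothing below is Paddle code.
#include <iostream>
#include <string>
#include <vector>

enum class AmpLevel { O0, O1 };
struct Tensor {
  std::string name;
  std::string dtype;
};

// Stand-in for egr::Controller::Instance().GetAMPLevel().
AmpLevel g_amp_level = AmpLevel::O1;

// Stand-in for egr::GetAmpDestDtype(): pick a destination dtype from all inputs.
std::string GetAmpDestDtype(const std::vector<Tensor>& inputs) {
  for (const auto& t : inputs) {
    if (t.dtype != "float32" && t.dtype != "float16") return "float32";
  }
  return "float16";
}

// Stand-in for egr::EagerAmpAutoCast(): cast one input to the destination dtype.
Tensor AmpAutoCast(const Tensor& t, const std::string& dst_dtype) {
  return {t.name, dst_dtype};
}

// Stand-in for a generated forward function (hypothetical op "matmul").
Tensor matmul(const Tensor& x, const Tensor& y) { return {"out", x.dtype}; }

// Shape of the wrapper body emitted by AMP_DYGRAPH_FUNCTION_TEMPLATE.
Tensor matmul_wrapper(const Tensor& x, const Tensor& y) {
  if (g_amp_level != AmpLevel::O0) {
    auto amp_dst_dtype = GetAmpDestDtype({x, y});
    auto NEW_x = AmpAutoCast(x, amp_dst_dtype);
    auto NEW_y = AmpAutoCast(y, amp_dst_dtype);
    return matmul(NEW_x, NEW_y);  // AMP branch: call with the casted inputs
  }
  return matmul(x, y);  // non-AMP branch: call with the original arguments
}

int main() {
  Tensor x{"x", "float32"}, y{"y", "float32"};
  std::cout << matmul_wrapper(x, y).dtype << std::endl;  // prints "float16"
}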
@@ -53,7 +53,7 @@ def GenerateFileStructureForFinalDygraph(eager_dir): open(path, 'a').close() -def GenerateFileStructureForIntermediateDygraph(eager_dir): +def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): """ paddle/fluid/eager |- generated @@ -86,11 +86,21 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir): dygraph_forward_api_h_path = os.path.join(generated_dir, "dygraph_forward_api.h") empty_files = [dygraph_forward_api_h_path] - empty_files.append( - os.path.join(forwards_dir, "dygraph_forward_functions.cc")) - empty_files.append(os.path.join(nodes_dir, "nodes.cc")) empty_files.append(os.path.join(nodes_dir, "nodes.h")) + for i in range(split_count): + empty_files.append( + os.path.join(forwards_dir, + "dygraph_forward_functions" + str(i + 1) + ".cc")) + empty_files.append(os.path.join(nodes_dir, + "nodes" + str(i + 1) + ".cc")) + empty_files.append( + os.path.join(forwards_dir, "dygraph_forward_functions_args_info.cc")) + empty_files.append( + os.path.join(forwards_dir, + "dygraph_forward_functions_args_type_info.cc")) + empty_files.append( + os.path.join(forwards_dir, "dygraph_forward_functions_returns_info.cc")) for path in empty_files: if not os.path.exists(path): open(path, 'a').close() @@ -102,23 +112,70 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir): forwards_level_cmakelist_path = os.path.join(forwards_dir, "CMakeLists.txt") with open(nodes_level_cmakelist_path, "w") as f: + f.write("add_custom_target(\n") + f.write(" copy_dygraph_node\n") f.write( - "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})\n" + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n" ) - f.write("add_dependencies(dygraph_node eager_codegen)") + for i in range(split_count): + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes" + + str(i + 1) + + ".tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes" + + str(i + 1) + ".cc\"\n") + + f.write(" DEPENDS eager_codegen\n") + f.write(" VERBATIM)\n") + + f.write("cc_library(dygraph_node SRCS ") + for i in range(split_count): + f.write("nodes" + str(i + 1) + ".cc ") + f.write("${fluid_manual_nodes} DEPS ${eager_deps} ${fluid_deps})\n") + f.write("add_dependencies(dygraph_node copy_dygraph_node)") with open(forwards_level_cmakelist_path, "w") as f: + f.write("add_custom_target(\n") + f.write(" copy_dygraph_forward_functions\n") + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h\"\n" + ) + for i in range(split_count): + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions" + + str(i + 1) + + ".tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions" + + str(i + 1) + ".cc\"\n") f.write( - "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n" + " COMMAND ${CMAKE_COMMAND} -E copy_if_different 
\"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.cc\"\n" ) - f.write("add_dependencies(dygraph_function eager_codegen)") + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.cc\"\n" + ) + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.cc\"\n" + ) + f.write(" DEPENDS eager_codegen\n") + f.write(" VERBATIM)\n") + + f.write("cc_library(dygraph_function SRCS ") + for i in range(split_count): + f.write("dygraph_forward_functions" + str(i + 1) + ".cc ") + f.write("dygraph_forward_functions_args_info.cc ") + f.write("dygraph_forward_functions_args_type_info.cc ") + f.write("dygraph_forward_functions_returns_info.cc ") + f.write( + "${fluid_manual_functions} DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n" + ) + f.write( + "add_dependencies(dygraph_function copy_dygraph_forward_functions)") with open(generated_level_cmakelist_path, "w") as f: f.write("add_subdirectory(forwards)\nadd_subdirectory(nodes)") if __name__ == "__main__": - assert len(sys.argv) == 2 + assert len(sys.argv) == 3 eager_dir = sys.argv[1] - GenerateFileStructureForIntermediateDygraph(eager_dir) + split_count = int(sys.argv[2]) + GenerateFileStructureForIntermediateDygraph(eager_dir, split_count) GenerateFileStructureForFinalDygraph(eager_dir) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 26165c59e0153..c4797029abf3c 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -52,7 +52,14 @@ class GeneralGrad { AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(inputs[i]); auto* target_node = auto_grad_meta->GetMutableGradNode().get(); - + VLOG(8) << "Get no grad vars' grad_node: " << target_node->name() + << ", " << target_node << " with output rank info: " + << auto_grad_meta->OutRankInfo().first << ", " + << auto_grad_meta->OutRankInfo().second; + if (is_no_grad_vars) { + (no_grad_var_nodes_inputmeta_map_)[target_node] = auto_grad_meta; + continue; + } if (orig_to_copied_node_mapping_.count(target_node)) { target_node = orig_to_copied_node_mapping_[target_node].get(); } else { @@ -67,11 +74,8 @@ class GeneralGrad { "stop_gradient=True.", msg, i)); - if (is_no_grad_vars) { - (no_grad_var_nodes_inputmeta_map_)[target_node] = auto_grad_meta; - } else { // normal input - (input_target_nodes_inputmeta_map_)[target_node] = auto_grad_meta; - } + // normal input + (input_target_nodes_inputmeta_map_)[target_node] = auto_grad_meta; } } } @@ -305,8 +309,6 @@ class GeneralGrad { const std::unordered_map>& node_input_buffers_dict) { - // Get no_grad_vars's GradNodes and InputMeta Info - GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */); // Get inputs's GradNodes and InputMeta Info GetTargetNodesInfo(inputs, false /* is_no_grad_vars */); // Purify potentialstartup_ops, remove those nodes that are the same as @@ -402,6 +404,21 @@ 
class GeneralGrad { std::shared_ptr orig_next_node = orig_edge.GetMutableGradNode(); + + if (no_grad_var_nodes_inputmeta_map_.count(orig_next_node.get()) && + (no_grad_var_nodes_inputmeta_map_[orig_next_node.get()] + ->OutRankInfo() == orig_edge.GetEdgeRankInfo())) { + VLOG(3) << "Get no grad edge from grad_node: " << orig_node->name() + << " : " << orig_node << " to:" << orig_next_node->name() + << ", " << orig_next_node.get() + << " with output rank info: " + << orig_edge.GetEdgeRankInfo().first << ", " + << orig_edge.GetEdgeRankInfo().second; + // Stop no grad var's preceding node + copied_node->MutableOutputMeta()[i][j].SetStopGradient(true); + copied_edge.Clear(); + continue; + } if (!orig_next_node) continue; // Copy Next Node @@ -638,6 +655,9 @@ std::vector RunBackward( } if (is_general_grad) { + // Get no_grad_vars's GradNodes and InputMeta Info + GeneralGrad::Instance().GetTargetNodesInfo(no_grad_vars, + true /* is_no_grad_vars */); // Copy Backward Graph GeneralGrad::Instance().ReconstructBackwardGraph(orig_queue); } @@ -696,19 +716,6 @@ std::vector RunBackward( node); } - // no_grad_vars - if (!no_grad_vars.empty() && is_general_grad) { - auto iter = - GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->find(node); - if (iter != - GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->end()) { - VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; - auto rank_info = (iter->second)->OutRankInfo(); - node_input_buffer->SetBufferSlotRankZeros(rank_info.first, - rank_info.second); - } - } - // Check input EnforceGradNodeHasInput(node); @@ -750,7 +757,8 @@ std::vector RunBackward( // Since we make edge has as same rank as bwd outputs, we indexing them // with the same rank(i, j) auto next_node_shared = edge.GetMutableGradNode(); - VLOG(3) << "Found pending node: " << next_node_shared->name(); + VLOG(3) << "Found pending node: " << next_node_shared->name() << ": " + << next_node_shared.get(); // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -800,6 +808,8 @@ std::vector RunBackward( // Update queue node_in_degree_map[next_node]--; + VLOG(6) << next_node->name() + << " ref_cnt is: " << node_in_degree_map[next_node]; PADDLE_ENFORCE( node_in_degree_map[next_node] >= 0, diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 438ccbaca8a5e..f98f25635f703 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -39,19 +39,40 @@ static inline bool NeedCast(const paddle::experimental::Tensor& tensor, return false; } +inline paddle::experimental::Tensor Cast( + const paddle::experimental::Tensor& input, + const paddle::experimental::DataType& dst_dtype, + const bool trace_backward = true) { + if (input.is_sparse_coo_tensor() || input.is_sparse_csr_tensor()) { + if (trace_backward) { + return sparse::cast_final_state_dygraph_function( + input, paddle::experimental::DataType::UNDEFINED, dst_dtype); + } else { + return paddle::experimental::sparse::cast( + input, paddle::experimental::DataType::UNDEFINED, dst_dtype); + } + } else { + if (trace_backward) { + return cast_final_state_dygraph_function(input, dst_dtype); + } else { + return paddle::experimental::cast(input, dst_dtype); + } + } +} + inline std::vector EagerAmpAutoCasts( const std::string& inputs_name, const std::vector& inputs, const paddle::experimental::DataType& dst_dtype, - std::string op_name) { + std::string op_name, + bool 
trace_backward = true) { VLOG(6) << "AMP AmpAutoCasts:" << " inputs(" << inputs_name << ") dst_dtype(" << paddle::framework::DataType2String(dst_dtype) << ")."; std::vector inputs_casted; for (auto& input : inputs) { if (NeedCast(input, dst_dtype)) { - inputs_casted.emplace_back( - std::move(cast_final_state_dygraph_function(input, dst_dtype))); + inputs_casted.emplace_back(std::move(Cast(input, dst_dtype))); } else { inputs_casted.emplace_back(input); } @@ -63,7 +84,8 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( const std::string& input_name, const paddle::experimental::Tensor& input, const paddle::experimental::DataType& dst_dtype, - const std::string& op_name) { + const std::string& op_name, + bool trace_backward = true) { VLOG(6) << "AMP AmpAutoCasts:" << " input(" << input_name << ") dst_dtype(" << paddle::framework::DataType2String(dst_dtype) << ")."; @@ -85,7 +107,7 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( } } if (NeedCast(input, dst_dtype)) { - return cast_final_state_dygraph_function(input, dst_dtype); + return Cast(input, dst_dtype, trace_backward); } return input; } @@ -94,9 +116,11 @@ inline paddle::optional EagerAmpAutoCast( const std::string& input_name, const paddle::optional& input, const paddle::experimental::DataType& dst_dtype, - const std::string& op_name) { + const std::string& op_name, + bool trace_backward = true) { if (input) { - return EagerAmpAutoCast(input_name, *input, dst_dtype, op_name); + return EagerAmpAutoCast( + input_name, *input, dst_dtype, op_name, trace_backward); } return paddle::none; } diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index d61a55b6dea88..8026b8e368478 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -209,6 +209,7 @@ class EagerVariable final { if (tensor.defined()) { if (tensor.is_dense_tensor()) { ConstructVariableFromTensor(tensor); + src_tensor_ = tensor.impl(); } else if (tensor.is_selected_rows()) { ConstructVariableFromTensor(tensor); } else if (IsVariableCompatTensor(tensor) && @@ -229,6 +230,19 @@ class EagerVariable final { } } + ~EagerVariable() { + if (src_tensor_) { + auto* framework_tensor = var_.GetMutable(); + auto tensor_dense = static_cast(src_tensor_.get()); + if (framework_tensor->memory_size() > 0 && + (!paddle::platform::is_same_place(framework_tensor->place(), + tensor_dense->place()) || + framework_tensor->dtype() != tensor_dense->dtype())) { + tensor_dense->ShareBufferWith(*framework_tensor); + } + } + } + /** Part 11: Construct paddle::framework::Variable with phi::Tensor **/ std::shared_ptr GetTensorBase() { // Construct allocation only once. 
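The new Cast() helper added to eager_amp_auto_cast.h above folds two decisions into one place: sparse vs. dense storage selects the API family, and the new trace_backward flag selects either the dygraph-traced variant (which records a backward node) or the plain functional variant. Below is a minimal, self-contained C++ sketch of that dispatch pattern, using toy stand-ins rather than Paddle's real types and functions.

// Toy stand-ins; the four cast_* functions model the traced and untraced,
// dense and sparse cast paths that the real Cast() helper chooses between.
#include <iostream>
#include <string>

struct Tensor {
  bool sparse;
  std::string dtype;
};

Tensor cast_dense_traced(const Tensor& t, const std::string& dst)  { return {false, dst}; }
Tensor cast_dense_plain(const Tensor& t, const std::string& dst)   { return {false, dst}; }
Tensor cast_sparse_traced(const Tensor& t, const std::string& dst) { return {true, dst}; }
Tensor cast_sparse_plain(const Tensor& t, const std::string& dst)  { return {true, dst}; }

// Mirrors the structure of the Cast() helper: storage kind picks the API family,
// trace_backward picks whether the autograd-recording variant is used.
Tensor Cast(const Tensor& input, const std::string& dst_dtype, bool trace_backward = true) {
  if (input.sparse) {
    return trace_backward ? cast_sparse_traced(input, dst_dtype)
                          : cast_sparse_plain(input, dst_dtype);
  }
  return trace_backward ? cast_dense_traced(input, dst_dtype)
                        : cast_dense_plain(input, dst_dtype);
}

int main() {
  Tensor x{false, "float32"};  // dense float32 input
  std::cout << Cast(x, "float16", /*trace_backward=*/false).dtype << std::endl;  // "float16"
}

This is also why EagerAmpAutoCast and EagerAmpAutoCasts gain a trailing trace_backward parameter defaulting to true: existing call sites keep the traced behaviour, while the generated forward-only AMP wrappers pass false.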
@@ -304,5 +318,6 @@ class EagerVariable final { private: std::string name_{""}; paddle::framework::Variable var_; + std::shared_ptr src_tensor_; }; } // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 269753f3c04f9..2f8ca2bb42095 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -106,6 +106,12 @@ class Edge { } } + void Clear() { + grad_node_.reset(); + in_slot_id_ = 0; + in_rank_ = 0; + } + private: size_t in_slot_id_; size_t in_rank_; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index c8d8b9ab548c0..231d81b5e73a6 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -24,6 +24,7 @@ namespace egr { void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + // Set not grad var to zero and set stop gradient as default value: true buffer_[slot_id][rank] = paddle::experimental::zeros_like(buffer_[slot_id][rank]); } @@ -59,8 +60,15 @@ void GradTensorHolder::CopyValueFromTensor( if ((!buffer_tensor.defined() || !buffer_tensor.initialized())) { // Perform deep copy here buffer_tensor.copy_(t, t.place(), false); - buffer_tensor.set_autograd_meta(t.mutable_autograd_meta()); - + auto* meta = egr::EagerUtils::autograd_meta(&buffer_tensor); + auto* origin_meta = egr::EagerUtils::nullable_autograd_meta(t); + if (origin_meta) { + auto grad_node = origin_meta->GetMutableGradNode(); + if (grad_node && grad_node.get()) { + meta->SetGradNode(origin_meta->GetMutableGradNode()); + } + meta->WeakGrad() = origin_meta->WeakGrad(); + } } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Cannot copy grad_tensors' value to grad tensor holders," @@ -81,10 +89,10 @@ void GradTensorHolder::CopyValueFromTensor( "Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR " "now.")); } - egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank])) - ->SetStopGradient(false); } } + egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank])) + ->SetStopGradient(false); } void GradTensorHolder::add(size_t slot_id, diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 1d45ef696b880..6b2b9c9f34a6d 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -48,8 +48,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { #endif return; } - paddle::framework::details::tensor_check< - paddle::platform::CPUDeviceContext>( + paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); } } diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 66c13c66de9fc..a6fd57ac6a4bc 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -28,6 +28,7 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/phi/api/lib/utils/allocator.h" namespace egr { class TensorWrapper { @@ -57,9 +58,12 @@ class TensorWrapper { // Only Copy Meta phi::DenseTensor* dense_tensor = static_cast(tensor.impl().get()); - auto tw_dense_tensor = std::make_shared(); - tw_dense_tensor->set_meta(dense_tensor->meta()); - intermidiate_tensor_.set_impl(tw_dense_tensor); + // TODO(jiabin): It's not a good idea to set memory size to zero, find + // another way and change this. 
+ intermidiate_tensor_.set_impl( + std::move(std::make_shared( + std::make_shared(nullptr, 0, tensor.place()), + std::move(dense_tensor->meta())))); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unrecognized tensor type for no_need_buffer feature")); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 4d08146f7aafe..2af2bd369b42b 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -58,13 +58,14 @@ static void CheckInputVarStatus(const Tensor &tensor) { "wrong type. Expect type is DenseTensor.", tensor.name())); - PADDLE_ENFORCE_EQ(tensor.initialized(), - true, - paddle::platform::errors::InvalidArgument( - "The tensor in input tensor %s of " - "RunProgram(Grad)Op " - "is not initialized.", - tensor.name())); + PADDLE_ENFORCE_EQ( + static_cast(tensor.impl().get())->IsInitialized(), + true, + paddle::platform::errors::InvalidArgument( + "The tensor in input tensor %s of " + "RunProgram(Grad)Op " + "is not initialized.", + tensor.name())); } static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, @@ -84,7 +85,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, "RunProgram(Grad)Op's internal scope holds " "wrong type. Expect type is DenseTensor", name)); - PADDLE_ENFORCE_EQ(src_tensor.initialized(), + PADDLE_ENFORCE_EQ(src_tensor.IsInitialized(), true, paddle::platform::errors::InvalidArgument( "The tensor in output tensor %s get from " @@ -120,7 +121,7 @@ static void ShareTensorsIntoScope(const std::vector &tensors, paddle::framework::Scope *scope) { for (size_t i = 0; i < tensors.size(); ++i) { auto name = tensors[i].name(); - if (name == "Fake_var" || !tensors[i].initialized()) { + if (name == "Fake_var") { continue; } auto *var = scope->Var(name); diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 2aaa0c96e0a33..bd70e55ac45c4 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -45,7 +45,7 @@ proto_library(op_def_proto SRCS op_def.proto DEPS framework_proto) cc_library( op_def_api SRCS op_def_api.cc - DEPS op_def_proto boost) + DEPS op_def_proto) file(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt) @@ -341,7 +341,7 @@ cc_library( cc_library( attribute SRCS attribute.cc - DEPS framework_proto boost enforce) + DEPS framework_proto enforce) cc_test( attribute_test SRCS attribute_test.cc @@ -354,12 +354,12 @@ cc_test( cc_library( op_version_proto SRCS op_version_proto.cc - DEPS framework_proto boost) + DEPS framework_proto) cc_library( op_version_registry SRCS op_version_registry.cc - DEPS op_version_proto framework_proto boost) + DEPS op_version_proto framework_proto) cc_test( op_version_registry_test SRCS op_version_registry_test.cc @@ -519,7 +519,7 @@ cc_test( cc_library( program_processing SRCS program_processing.cc - DEPS boost proto_desc) + DEPS proto_desc) cc_test( program_processing_test SRCS program_processing_test.cc @@ -1025,7 +1025,7 @@ endif() cc_library( prune SRCS prune.cc - DEPS framework_proto boost) + DEPS framework_proto) cc_test( prune_test SRCS prune_test.cc diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc index ed50d5f6bfc4f..a2d0f2db2829d 100644 --- a/paddle/fluid/framework/attribute.cc +++ b/paddle/fluid/framework/attribute.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and 
limitations under the License. */ #include "paddle/fluid/framework/attribute.h" -#include "boost/blank.hpp" +#include "paddle/utils/blank.h" namespace paddle { namespace framework { @@ -118,7 +118,7 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { PADDLE_THROW(platform::errors::Unavailable("Unsupport attribute type %d.", attr_desc.type())); } - return boost::blank(); + return paddle::blank(); } } // namespace framework diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index a3a5a25eec842..94e7918e800ef 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -90,9 +90,8 @@ REGISTER_OP_WITHOUT_GRADIENT( test_op, paddle::framework::TestOpWithKernel, paddle::framework::OpKernelTestProtoAndCheckerMaker); -REGISTER_OP_CPU_KERNEL( - test_op, - paddle::framework::TestKernel); +REGISTER_OP_CPU_KERNEL(test_op, + paddle::framework::TestKernel); REGISTER_OP_CUDA_KERNEL( test_op, paddle::framework::TestKernel); diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 90639255c3aab..4bf81b46b3456 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -44,8 +44,8 @@ void CastDataLayout::apply() { auto place = ctx_->GetPlace(); if (platform::is_cpu_place(place)) { - phi::funcs::Transpose trans4; - auto* context = static_cast(ctx_); + phi::funcs::Transpose trans4; + auto* context = static_cast(ctx_); trans4(*context, in_, out_, axis_); } else { PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 2576df3483412..9333e246c68bc 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -94,8 +94,8 @@ struct CastDataType { auto* out_begin = out_->mutable_data(in_.place()); if (platform::is_cpu_place(in_.place())) { - platform::Transform trans; - auto* context = static_cast(ctx_); + platform::Transform trans; + auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index f22e62fa0aa5b..0ae69695549e5 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -174,7 +174,7 @@ FetchResultType AsyncSSAGraphExecutor::Run( HandleException(); FetchList ret; - auto &val = boost::get(fetch_data); + auto &val = BOOST_GET(FetchList, fetch_data); for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { if (data_is_lod_tensor(val.at(fetch_idx))) { std::vector lodtensor_ptrs; diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 9c666d00ab9d1..26ad71bafe6ff 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -117,7 +117,7 @@ struct TestBroadcastOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CPUPlace(); place_list_.push_back(p); - ctxs_.emplace_back(new p::CPUDeviceContext(p)); + ctxs_.emplace_back(new phi::CPUContext(p)); } #if defined(PADDLE_WITH_XPU_BKCL) bkcl_ctxs_.reset(nullptr); diff --git a/paddle/fluid/framework/details/build_strategy.h 
b/paddle/fluid/framework/details/build_strategy.h index baae0922ccd5d..1e27e381500aa 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -21,7 +21,6 @@ #include #include -#include "boost/optional.hpp" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 8d8bb96f5c8ed..a9e4bf826bc4b 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -228,7 +228,7 @@ void FetchAsyncOpHandle::RunImpl() { } if (return_merged_) { - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchList, *data_); if (src_vars[0]->IsType()) { // to lodtensor type std::vector src_lodtensors; @@ -263,7 +263,7 @@ void FetchAsyncOpHandle::RunImpl() { val.at(offset_) = std::move(dst_lodtensor_array); } } else { - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchUnmergedList, *data_); auto &dst_tensors = val.at(offset_); dst_tensors.reserve(src_vars.size()); diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index f160650f0b9f4..a9f7de8ee312f 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -84,7 +84,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const { for (auto &t : tensors_) { tensors_ptr.emplace_back(&BOOST_GET_CONST(LoDTensor, t)); } - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchList, *data_); LoDTensor var; MergeLoDTensor(&var, tensors_ptr, platform::CPUPlace()); val.at(offset_) = std::move(var); @@ -106,11 +106,11 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const { tmp_array.emplace_back(); MergeLoDTensor(&(tmp_array.back()), tensors_ptr, platform::CPUPlace()); } - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchList, *data_); val.at(offset_) = std::move(tmp_array); } } else { - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchUnmergedList, *data_); val.at(offset_) = std::move(tensors_); } } diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index ea63595cb2cfc..9cc1929e19ae8 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -69,7 +69,7 @@ struct TestGatherOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CPUPlace(); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CPUDeviceContext(p)); + ctxs_.emplace_back(new phi::CPUContext(p)); } } } diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index cce26f1e0dca1..767f7b1e48b43 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -316,7 +316,7 @@ template <> template <> template -void TensorCheckerVisitor::apply( +void TensorCheckerVisitor::apply( typename std::enable_if< std::is_floating_point::value || std::is_same>::value || @@ -329,11 +329,11 @@ void TensorCheckerVisitor::apply( } template <> -void tensor_check(const std::string& op_type, - const std::string& var_name, - const framework::Tensor& tensor, - const platform::Place& place) { - TensorCheckerVisitor vistor( +void 
tensor_check(const std::string& op_type, + const std::string& var_name, + const framework::Tensor& tensor, + const platform::Place& place) { + TensorCheckerVisitor vistor( op_type, var_name, tensor, place); VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor); } @@ -439,7 +439,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, #endif return; } - tensor_check(op_type, var_name, *tensor, place); + tensor_check(op_type, var_name, *tensor, place); } void CheckVarHasNanOrInf(const std::string& op_type, diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 86536b74a3d7c..bc870c0eaa18d 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -278,7 +278,8 @@ FetchResultType ParallelSSAGraphExecutor::Run( if (!is_valid[scope_idx]) { continue; } - const auto &fetch_list = boost::get(fetch_data[scope_idx]); + const auto &fetch_list = + BOOST_GET_CONST(FetchList, fetch_data[scope_idx]); if (data_is_lod_tensor(fetch_list[fetch_idx])) { lodtensor_ptrs.push_back( &(BOOST_GET_CONST(LoDTensor, fetch_list[fetch_idx]))); @@ -317,7 +318,7 @@ FetchResultType ParallelSSAGraphExecutor::Run( continue; } const auto &fetch_list = - boost::get(fetch_data[scope_idx]); + BOOST_GET_CONST(FetchUnmergedList, fetch_data[scope_idx]); PADDLE_ENFORCE_EQ( fetch_list[fetch_idx].size(), 1, diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 2a7ac790e8049..0d957bf81306f 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -81,7 +81,7 @@ struct TestReduceOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CPUPlace(); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CPUDeviceContext(p)); + ctxs_.emplace_back(new phi::CPUContext(p)); } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) nccl_ctxs_.reset(nullptr); diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 057a19f31759b..b7bca733b8f9e 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -69,7 +69,8 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { #undef REG_DL_DATA_TYPE } -struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { +struct DLDeviceVisitor + : public std::unary_function { inline ::DLDevice operator()(const platform::CPUPlace &place) const { ::DLDevice device; device.device_type = kDLCPU; diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index bac3ce5c4f88e..2df86e86a75e0 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -46,6 +46,10 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { execution_strategy.num_threads_ = 1; break; } + case platform::DeviceType::IPU: { + execution_strategy.num_threads_ = 1; + break; + } default: PADDLE_THROW(platform::errors::Unavailable("Unsupported Device type %d.", device_type)); diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 36ab906181be5..47bb60810eb48 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/feed_fetch_method.h" -#include #include #include "glog/logging.h" diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index c86cdc998133b..3fe545ec9c569 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/string_array.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { @@ -30,7 +29,7 @@ using FetchType = paddle::variant; using FetchList = std::vector; using FetchUnmergedList = std::vector>; -using FetchResultType = boost::variant; +using FetchResultType = paddle::variant; inline bool data_is_lod_tensor(const FetchType &data) { if (data.type() == typeid(LoDTensor)) { diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 982c1b85a5b03..c9c03fb66f8fa 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -38,6 +38,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index a7e00bb083f40..bb9998249048e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -89,42 +89,30 @@ __global__ void dy_mf_search_kernel(Table* table, char* vals, size_t len, size_t pull_feature_value_size) { - const size_t i = blockIdx.x * blockDim.y + threadIdx.y; - const size_t k = threadIdx.x; + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); + if (it != table->end()) { uint64_t offset = i * pull_feature_value_size; FeatureValue* cur = (FeatureValue*)(vals + offset); FeatureValue& input = *(FeatureValue*)(it->second); - char* cur_p = (char*)cur; - char* input_p = (char*)(&input); - int len = 9 + input.mf_dim + 1; - if (k == 3 || k == 6 || k == 7) - *(int*)(cur_p + k * 4) = *(int*)(input_p + k * 4); - else if (k < 8) - *(float*)(cur_p + k * 4) = *(float*)(input_p + k * 4); - else if (k == 8) { - *(uint64_t*)(cur_p + k * 4) = *(uint64_t*)(input_p + k * 4); - } else { - int len_per_thread = (len - 9) / (blockDim.y - 9); - int remain = (len - 9) % (blockDim.y - 9); - int real_len = len_per_thread; - if ((k - 9) < remain) real_len++; - int left = -1, right = -1; - if ((k - 9) < remain) { - left = 9 + (k - 9) * (len_per_thread + 1); - right = left + real_len; - } else { - left = 9 + remain * (len_per_thread + 1) + - (k - 9 - remain) * len_per_thread; - right = left + real_len; - } - for (int j = left; j < right; j++) - *(float*)(cur_p + (j + 1) * 4) = *(float*)(input_p + (j + 1) * 4); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; } } else { - if (keys[i] != 0) printf("pull miss key: %llu", keys[i]); + if (keys[i] != 0) { + 
printf("warning::pull miss key: %llu", keys[i]); + } } } } @@ -181,6 +169,7 @@ HashTable::HashTable(size_t capacity) { template HashTable::~HashTable() { delete container_; + cudaFree(device_optimizer_config_); } template @@ -231,10 +220,8 @@ void HashTable::get(const KeyType* d_keys, if (len == 0) { return; } - dim3 block_dims(32, 32); - const int grid_size = (len - 1) / 32 + 1; - dim3 grid_dims(grid_size); - dy_mf_search_kernel<<>>( + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + dy_mf_search_kernel<<>>( container_, d_keys, d_vals, len, pull_feature_value_size_); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 8952039299d06..a7333cd01c6ec 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -426,16 +426,26 @@ int HeterComm::get_index_by_devid(int devid) { template void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { - for (auto& table : tables_) { - table->set_sparse_sgd(optimizer_config); + for (int i = 0; i < resource_->total_device(); ++i) { + AnyDeviceGuard guard(resource_->dev_id(i)); + if (!multi_mf_dim_) { + tables_[i]->set_sparse_sgd(optimizer_config); + } else { + ptr_tables_[i]->set_sparse_sgd(optimizer_config); + } } } template void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { - for (auto& table : tables_) { - table->set_embedx_sgd(optimizer_config); + for (int i = 0; i < resource_->total_device(); ++i) { + AnyDeviceGuard guard(resource_->dev_id(i)); + if (!multi_mf_dim_) { + tables_[i]->set_embedx_sgd(optimizer_config); + } else { + ptr_tables_[i]->set_embedx_sgd(optimizer_config); + } } } @@ -760,7 +770,6 @@ void HeterComm::dynamic_merge_grad( (char*)d_grads, (char*)d_merge_grads_ptr, uniq_len, - max_mf_dim_, grad_value_size, merger_, stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 8a13d9abe635d..fd0dd1a72cca1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -144,106 +144,28 @@ __global__ void dy_mf_fill_shard_grads_kernel(KeyType* d_shard_keys, } } -// optimized version -template <> -__global__ void -dy_mf_fill_shard_grads_kernel( - FeatureKey* d_shard_keys, - FeatureKey* d_keys, - FeaturePushValue* d_shard_grads, - FeaturePushValue* d_grads, - int* idx, - size_t len, - size_t grad_value_size) { - const size_t i = blockIdx.x * blockDim.y + threadIdx.y; - const size_t k = threadIdx.x; - if (i < len) { - if (k == 0) { - d_shard_keys[i] = d_keys[idx[i]]; - } - FeaturePushValue* cur = - (FeaturePushValue*)((char*)d_shard_grads + i * grad_value_size); - FeaturePushValue& input = *( - FeaturePushValue*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); - char* cur_p = (char*)cur; - char* input_p = (char*)(&input); - int len = 5 + input.mf_dim; - if (k == 2 || k == 4) - *(int*)(cur_p + k * 4) = *(int*)(input_p + k * 4); - else if (k < 5) - *(float*)(cur_p + k * 4) = *(float*)(input_p + k * 4); - else { - int len_per_thread = (len - 5) / (blockDim.y - 5); - int remain = (len - 5) % (blockDim.y - 5); - int real_len = len_per_thread; - if ((k - 5) < remain) real_len++; - int left = -1, right = -1; - if ((k - 5) < remain) { - left = 5 + (k - 5) * (len_per_thread + 1); - right = left + real_len; - } else { - left = 5 + remain * (len_per_thread + 1) + - (k - 5 - remain) * 
len_per_thread; - right = left + real_len; - } - for (int j = left; j < right; j++) - *(float*)(cur_p + j * 4) = *(float*)(input_p + j * 4); - } - } -} - -__global__ void merge_gradients_basic_kernel(const uint32_t* offset, - const uint32_t* fea_num, - const uint32_t* index, - const char* input, - char* output, - int n, - size_t grad_value_size, - DynamicGradMerger& merger) { +__global__ void merge_gradients_kernel(const uint32_t* offset, + const uint32_t* fea_num, + const uint32_t* index, + const char* input, + char* output, + int n, + size_t grad_value_size, + DynamicGradMerger& merger_) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; int ori_index = index[start]; - FeaturePushValue& lhs = *(FeaturePushValue*)(output + i * grad_value_size); + FeaturePushValue& out = *(FeaturePushValue*)(output + i * grad_value_size); FeaturePushValue& in = *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger.update_basic(lhs, in); + merger_.update_one(out, in); for (int j = 1; j < num; ++j) { ori_index = index[start + j]; FeaturePushValue& rhs = *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger.merge_basic(lhs, rhs); - } - } -} - -__global__ void merge_gradients_embedx_kernel(const uint32_t* offset, - const uint32_t* fea_num, - const uint32_t* index, - const char* input, - char* output, - int n, - size_t grad_dim, - size_t grad_value_size, - DynamicGradMerger& merger) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { - size_t value_idx = i / grad_dim; - size_t field_idx = i % grad_dim; - uint32_t start = offset[value_idx]; - uint32_t num = fea_num[value_idx]; - int ori_index = index[start]; - FeaturePushValue& in = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - FeaturePushValue& lhs = - *(FeaturePushValue*)(output + value_idx * grad_value_size); - merger.update_embedx(lhs, in, field_idx); - for (int j = 1; j < num; ++j) { - int ori_index = index[start + j]; - FeaturePushValue& rhs = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger.merge_embedx(lhs, rhs, field_idx); + merger_.merge_one(out, rhs); } } } @@ -262,49 +184,6 @@ __global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, } } -// optimized version -template <> -__global__ void dy_mf_fill_dvals_kernel( - FeatureValue* d_shard_vals, - FeatureValue* d_vals, - int* idx, - size_t len, - size_t val_size) { - const size_t i = blockIdx.x * blockDim.y + threadIdx.y; - const size_t k = threadIdx.x; - if (i < len) { - uint64_t new_offset = uint64_t(idx[i]) * val_size; - FeatureValue* cur = (FeatureValue*)((char*)d_vals + new_offset); - FeatureValue& input = *(FeatureValue*)((char*)d_shard_vals + i * val_size); - char* cur_p = (char*)cur; - char* input_p = (char*)(&input); - int len = 9 + input.mf_dim + 1; - if (k == 3 || k == 6 || k == 7) - *(int*)(cur_p + k * 4) = *(int*)(input_p + k * 4); - else if (k < 8) - *(float*)(cur_p + k * 4) = *(float*)(input_p + k * 4); - else if (k == 8) { - *(uint64_t*)(cur_p + k * 4) = *(uint64_t*)(input_p + k * 4); - } else { - int len_per_thread = (len - 9) / (blockDim.x - 9); - int remain = (len - 9) % (blockDim.y - 9); - int real_len = len_per_thread; - if ((k - 9) < remain) real_len++; - int left = -1, right = -1; - if ((k - 9) < remain) { - left = 9 + (k - 9) * (len_per_thread + 1); - right = left + real_len; - } else { - left = 9 + remain * (len_per_thread + 1) + - (k - 9 - remain) * len_per_thread; - right = left 
+ real_len; - } - for (int j = left; j < right; j++) - *(float*)(cur_p + (j + 1) * 4) = *(float*)(input_p + (j + 1) * 4); - } - } -} - // cuda implemention of heter_comm_kernel.h template void HeterCommKernel::fill_idx(T* idx, @@ -442,12 +321,9 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, long long len, size_t grad_value_size, const StreamType& stream) { - // int grid_size = (len - 1) / block_size_ + 1; + int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; - dim3 block_dims(32, 32); - const size_t grid_size = (len - 1) / 32 + 1; - dim3 grid_dims(grid_size); - dy_mf_fill_shard_grads_kernel<<>>( + dy_mf_fill_shard_grads_kernel<<>>( d_shard_keys, d_keys, d_shard_grads, @@ -464,26 +340,12 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset, const char* input, char* output, int n, - size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, const StreamType& stream) { int grid_size = (n - 1) / block_size_ + 1; - merge_gradients_basic_kernel<<>>( + merge_gradients_kernel<<>>( offset, fea_num, index, input, output, n, grad_value_size, merger_); - if (grad_dim > 0) { - int grid_size2 = (n * grad_dim - 1) / block_size_ + 1; - merge_gradients_embedx_kernel<<>>( - offset, - fea_num, - index, - input, - output, - n * grad_dim, - grad_dim, - grad_value_size, - merger_); - } } template @@ -493,12 +355,9 @@ void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, long long len, size_t val_size, const StreamType& stream) { - // int grid_size = (len - 1) / block_size_ + 1; + int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; - dim3 block_dims(32, 32); - const size_t grid_size_ = (len - 1) / 32 + 1; - dim3 grid_dims(grid_size_); - dy_mf_fill_dvals_kernel<<>>( + dy_mf_fill_dvals_kernel<<>>( d_shard_vals, d_vals, idx, c_len, val_size); } @@ -628,7 +487,6 @@ template void HeterCommKernel::merge_gradient( const char* input, char* output, int n, - size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, const cudaStream_t& stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 6859161a5fe48..d1555dc2e0919 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -42,41 +42,23 @@ struct DynamicGradMerger { } template - __device__ __forceinline__ void update_basic(T& output, const T& input) { + __device__ __forceinline__ void update_one(T& output, const T& input) { output.slot = input.slot; output.show = input.show; output.clk = input.clk; output.mf_dim = input.mf_dim; output.lr_g = input.lr_g; - // for (int i = 0; i < output.mf_dim; ++i) { - // output.mf_g[i] = input.mf_g[i]; - //} + for (int i = 0; i < output.mf_dim; ++i) { + output.mf_g[i] = input.mf_g[i]; + } } template - __device__ __forceinline__ void merge_basic(T& output, const T& input) { + __device__ __forceinline__ void merge_one(T& output, const T& input) { output.show += input.show; output.clk += input.clk; output.lr_g += input.lr_g; - // for (int i = 0; i < input.mf_dim; ++i) { - // output.mf_g[i] += input.mf_g[i]; - //} - } - - template - __device__ __forceinline__ void update_embedx(T& output, - const T& input, - size_t embedx_id) { - if (embedx_id < output.mf_dim) { - output.mf_g[embedx_id] = input.mf_g[embedx_id]; - } - } - - template - __device__ __forceinline__ void merge_embedx(T& output, - const T& input, - size_t embedx_id) { - if (embedx_id < output.mf_dim) { - 
output.mf_g[embedx_id] += input.mf_g[embedx_id]; + for (int i = 0; i < input.mf_dim; ++i) { + output.mf_g[i] += input.mf_g[i]; } } }; @@ -183,7 +165,6 @@ class HeterCommKernel { const char* input, char* output, int n, - size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, const StreamType& stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/log_patch.h b/paddle/fluid/framework/fleet/heter_ps/log_patch.h new file mode 100644 index 0000000000000..84c83a56f3061 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/log_patch.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#undef VLOG_IS_ON +#define VLOG_IS_ON(verboselevel) (FLAGS_v >= (verboselevel)) + +#undef COMPACT_GOOGLE_LOG_INFO +#define COMPACT_GOOGLE_LOG_INFO google::LogMessage(__FILE__, __LINE__) + +#undef LOG +#define LOG(severity) COMPACT_GOOGLE_LOG_##severity.stream() + +#undef LOG_IF +#define LOG_IF(severity, condition) \ + static_cast(0), \ + !(condition) ? (void)0 : google::LogMessageVoidify() & LOG(severity) + +#undef VLOG +#define VLOG(verboselevel) LOG_IF(INFO, VLOG_IS_ON(verboselevel)) diff --git a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu index 03ef905b9ab48..5fc0625992c79 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu @@ -25,11 +25,11 @@ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h" diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 0e816beef0d33..65f86acce9151 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -59,6 +59,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_PSLIB #include "downpour_accessor.h" // NOLINT #endif +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" namespace paddle { namespace framework { @@ -322,26 +323,19 @@ class PSGPUWrapper { float mf_max_bound = (config.find("mf_max_bound") == config.end()) ? 
1.0 : config["mf_max_bound"]; - for (size_t i = 0; i < heter_devices_.size(); i++) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(heter_devices_[i])); -#elif defined(PADDLE_WITH_XPU_KP) - PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(heter_devices_[i])); -#endif - this->SetSparseSGD(nonclk_coeff, - clk_coeff, - min_bound, - max_bound, - learning_rate, - initial_g2sum, - initial_range); - this->SetEmbedxSGD(mf_create_thresholds, - mf_learning_rate, - mf_initial_g2sum, - mf_initial_range, - mf_min_bound, - mf_max_bound); - } + this->SetSparseSGD(nonclk_coeff, + clk_coeff, + min_bound, + max_bound, + learning_rate, + initial_g2sum, + initial_range); + this->SetEmbedxSGD(mf_create_thresholds, + mf_learning_rate, + mf_initial_g2sum, + mf_initial_range, + mf_min_bound, + mf_max_bound); } void SetDate(int year, int month, int day) { diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt index 4d21c6a892349..28644d530854f 100644 --- a/paddle/fluid/framework/io/CMakeLists.txt +++ b/paddle/fluid/framework/io/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library( cc_library( fs SRCS fs.cc - DEPS string_helper glog boost enforce shell) + DEPS string_helper glog enforce shell) cc_test( test_fs diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d19b163817e41..d31555bf7247c 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -212,6 +212,7 @@ if(WITH_MKLDNN) pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) @@ -240,6 +241,7 @@ if(WITH_IPU) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) pass_library(avg_shard_pass base DIR ipu) + pass_library(inference_dtype_transfer_pass base DIR ipu) endif() cc_library( @@ -472,6 +474,13 @@ if(WITH_MKLDNN) test_compute_propagate_scales_mkldnn_pass SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc DEPS compute_propagate_scales_mkldnn_pass naive_executor) + + if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. 
+ copy_onnx(test_compute_propagate_scales_mkldnn_pass) + endif() + cc_test( test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index ff86bdb8fa86f..6d9611ebd1393 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -105,6 +105,22 @@ ConvElementwiseAdd2ActFusePass::ConvElementwiseAdd2ActFusePass() { .AddOutput("Out") .IsTensor() .End(); + + AddOpCompat(OpCompat("sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("tanh")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); } void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { @@ -188,4 +204,6 @@ REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass) .LE("conv2d", 1) .LE("elementwise_add", 1) .EQ("relu", 0) + .EQ("sigmoid", 0) + .EQ("tanh", 0) .EQ("identity", 0)); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index f67e83bc10171..47e2c5e380bcb 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -102,6 +102,22 @@ ConvElementwiseAddActFusePass::ConvElementwiseAddActFusePass() { .AddOutput("Out") .IsTensor() .End(); + + AddOpCompat(OpCompat("sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("tanh")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); } void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { @@ -170,4 +186,6 @@ REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass) .LE("conv2d", 1) .LE("elementwise_add", 1) .EQ("relu", 0) + .EQ("sigmoid", 0) + .EQ("tanh", 0) .EQ("identity", 0)); diff --git a/paddle/fluid/framework/ir/cost_model.h b/paddle/fluid/framework/ir/cost_model.h index 27f059a81eb1f..558f158e84e72 100644 --- a/paddle/fluid/framework/ir/cost_model.h +++ b/paddle/fluid/framework/ir/cost_model.h @@ -27,7 +27,6 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc index 40861638a2ab2..e0d490ce83680 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -89,6 +89,7 @@ void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const { std::string quantized_op_type = op_desc->Type(); op_desc->SetAttr("Input_scale", input_scale); op_desc->SetAttr("bit_length", bit_length); + op_desc->SetAttr("enable_int8", true); op_desc->RenameInput(quant_dequant_output_name, input_name); op_desc->Flush(); IR_NODE_LINK_TO(input, quantized_node); diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 83c3ab9933d61..455af83427819 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -14,14 +14,14 @@ #include 
"paddle/fluid/framework/ir/generate_pass.h" -#include "boost/blank.hpp" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/utils/blank.h" namespace paddle { namespace framework { namespace ir { -class element_visitor : public boost::static_visitor { +class element_visitor { public: explicit element_visitor(int index) : index_(index) {} @@ -40,14 +40,14 @@ class element_visitor : public boost::static_visitor { if (index >= 0 && static_cast(index) < attr.size()) { return static_cast(attr[index]); } - return boost::blank(); + return paddle::blank(); } private: int index_; }; -class operation_visitor : public boost::static_visitor { +class operation_visitor { public: explicit operation_visitor(const proto::PassDesc::OperationType& type) : type_(type) {} @@ -99,7 +99,7 @@ Attribute GetVarAttrValue(const VarDesc* desc, return shape; } } - return boost::blank(); + return paddle::blank(); } Attribute GetOpAttrValue(const OpDesc* desc, diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index ea5c46e3040bd..5a954110775d6 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/utils/any.h" DECLARE_bool(convert_all_blocks); diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 315fe3b1e7eee..b0a2b6754cb2a 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -452,6 +452,7 @@ static OpDesc *ReplaceScaleLossGradOp(const Node &node, OpDesc *desc) { OpProtoAndCheckerMaker::OpRoleAttrName(), (static_cast(OpRole::kBackward) | static_cast(OpRole::kLoss))); desc->SetAttr("value", 1.0f); + desc->SetAttr("shape", std::vector({1})); std::vector output_names; for (auto out : node.outputs) { output_names.emplace_back(out->Name()); @@ -578,6 +579,12 @@ void GraphToProgram(const Graph &graph, VLOG(3) << "Graph to program need convert " << graph.SubGraphsSize() << " sub graph"; + + std::unordered_set vars_in_root_block; + for (const proto::VarDesc &var : block->vars()) { + vars_in_root_block.insert(var.name()); + } + for (size_t idx = 0; idx < graph.SubGraphsSize(); ++idx) { // avoid kRootBlockIndex not 0 if (idx == kRootBlockIndex) continue; @@ -585,7 +592,14 @@ void GraphToProgram(const Graph &graph, block = program_pb.add_blocks(); block->set_idx(idx); block->set_parent_idx(kRootBlockIndex); - GraphToBlock(*graph.GetSubGraph(idx), block, sort_kind); + + Graph *subgraph = graph.GetSubGraph(idx); + subgraph->SetNotOwned>( + kGraphToProgramVarsToRemove, &vars_in_root_block); + + GraphToBlock(*subgraph, block, sort_kind); + + subgraph->Erase(kGraphToProgramVarsToRemove); } } else { GraphToBlock(graph, block, sort_kind); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d5b6122886850..b0792ee0812c9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -29,6 +29,12 @@ using string::Style; size_t PDPattern::id_ = 0UL; +#ifdef PADDLE_WITH_TENSORRT +namespace patterns { +thread_local std::unordered_map KeyCounter::dic_; +} +#endif + PDNode *PDPattern::NewNode(const std::string &name) { if (!name.empty()) { PADDLE_ENFORCE_EQ( @@ -925,65 
+931,22 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input, return bn_out_var; } -PDNode *patterns::ConvActivation::operator()( - paddle::framework::ir::PDNode *conv_input, - std::string conv_type, - std::string activation_type) { - // Create Operators - conv_input->assert_is_op_input(conv_type, "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(conv_type); - auto *activation_op = - pattern->NewNode(activation_repr())->assert_is_op(activation_type); - // Create variables - // Filter - auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input(conv_type, "Filter"); - // intermediate variable, will be removed in the IR after fuse. - auto *conv_out_var = pattern->NewNode(conv_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op(conv_type) - ->assert_is_op_input(activation_type); - // output - auto *activation_out_var = pattern->NewNode(activation_out_repr()) - ->AsOutput() - ->assert_is_op_output(activation_type); - - conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); - activation_op->LinksFrom({conv_out_var}).LinksTo({activation_out_var}); - return activation_out_var; -} - -PDNode *patterns::ElementwiseActivation::operator()( - paddle::framework::ir::PDNode *elementwise_a, - const std::string &elementwise_type, - const std::string &activation_type) { - // Create Operators - elementwise_a->assert_is_op_input(elementwise_type, "X"); - auto *elementwise_op = - pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type); +PDNode *patterns::OperatorActivation::operator()( + const std::string &operator_type, const std::string &activation_type) { + auto *preceding_op = + pattern->NewNode(preceding_op_repr())->assert_is_op(operator_type); + auto *preceding_op_out = pattern->NewNode(preceding_op_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(operator_type) + ->assert_is_op_input(activation_type); auto *activation_op = pattern->NewNode(activation_repr())->assert_is_op(activation_type); - // Create variables - auto *elementwise_b = pattern->NewNode(elementwise_b_repr()) - ->AsInput() - ->assert_is_op_input(elementwise_type, "Y"); - // intermediate variable, will be removed in the IR after fuse. 
- auto *elementwise_out_var = - pattern->NewNode(elementwise_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op(elementwise_type) - ->assert_is_op_input(activation_type); - // output - auto *activation_out_var = pattern->NewNode(activation_out_repr()) - ->AsOutput() - ->assert_is_op_output(activation_type); - - elementwise_op->LinksFrom({elementwise_a, elementwise_b}) - .LinksTo({elementwise_out_var}); - activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var}); - return activation_out_var; + auto *activation_out = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); + preceding_op->LinksTo({preceding_op_out}); + activation_op->LinksFrom({preceding_op_out}).LinksTo({activation_out}); + return activation_out; } PDNode *patterns::SeqConvEltAddRelu::operator()( @@ -1115,44 +1078,6 @@ PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x, return fc_out_var; } -PDNode *patterns::FCActOneDNN::operator()(const std::string &act_type) { - auto *fc = pattern->NewNode(fc_repr())->assert_is_op("fc"); - auto *fc_out = pattern->NewNode(fc_out_repr()) - ->assert_is_op_output("fc", "Out") - ->assert_is_op_input(act_type); - auto *act = - pattern->NewNode(act_repr())->assert_is_op(act_type)->AsIntermediate(); - auto *act_out = pattern->NewNode(act_out_repr()) - ->assert_is_op_output(act_type, "Out") - ->AsOutput(); - - fc->LinksTo({fc_out}); - act->LinksFrom({fc_out}).LinksTo({act_out}); - - return act_out; -} - -PDNode *patterns::SoftplusActivation::operator()(std::string activation_type) { - // Create Operators - auto *softplus_op = - pattern->NewNode(softplus_repr())->assert_is_op("softplus"); - auto *activation_op = - pattern->NewNode(activation_repr())->assert_is_op(activation_type); - // intermediate variable, will be removed in the IR after fuse. 
- auto *softplus_out = pattern->NewNode(softplus_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op("softplus") - ->assert_is_op_input(activation_type); - // output - auto *activation_out = pattern->NewNode(activation_out_repr()) - ->AsOutput() - ->assert_is_op_output(activation_type); - - softplus_op->LinksTo({softplus_out}); - activation_op->LinksFrom({softplus_out}).LinksTo({activation_out}); - return activation_out; -} - PDNode *patterns::Embedding::operator()(PDNode *x) { x->assert_is_op_input("lookup_table", "Ids"); auto *lookup_table_op = @@ -1796,80 +1721,23 @@ PDNode *patterns::Conv::operator()() { return output_var; } -PDNode *patterns::Transpose::operator()() { +PDNode *patterns::Immutable::operator()(const std::string &immutable_type, + const std::string &input_name) { auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); - auto transpose_op = - pattern->NewNode(transpose_op_repr())->assert_is_op("transpose2"); + auto immutable_op = + pattern->NewNode(immutable_op_repr())->assert_is_op(immutable_type); - auto transpose_in = pattern->NewNode(transpose_in_repr()) + auto immutable_in = pattern->NewNode(immutable_in_repr()) ->AsInput() - ->assert_is_op_input("transpose2"); - auto transpose_out = pattern->NewNode(transpose_out_repr()) + ->assert_is_op_input(immutable_type, input_name); + auto immutable_out = pattern->NewNode(immutable_out_repr()) ->AsOutput() - ->assert_is_op_output("transpose2", "Out"); + ->assert_is_op_output(immutable_type, "Out"); - prev_op->LinksTo({transpose_in}); - transpose_op->LinksFrom({transpose_in}).LinksTo({transpose_out}); - return transpose_out; -} - -PDNode *patterns::Reshape::operator()() { - auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); - - auto reshape_op = - pattern->NewNode(reshape_op_repr())->assert_is_op("reshape2"); - - auto reshape_in = pattern->NewNode(reshape_in_repr()) - ->AsInput() - ->assert_is_op_input("reshape2", "X"); - auto reshape_out = pattern->NewNode(reshape_out_repr()) - ->AsOutput() - ->assert_is_op_output("reshape2", "Out"); - - prev_op->LinksTo({reshape_in}); - reshape_op->LinksFrom({reshape_in}).LinksTo({reshape_out}); - return reshape_out; -} - -PDNode *patterns::Slice::operator()() { - auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); - - auto slice_op = pattern->NewNode(slice_op_repr())->assert_is_op("slice"); - - auto slice_in = pattern->NewNode(slice_in_repr()) - ->AsInput() - ->assert_is_op_input("slice", "Input"); - auto slice_out = pattern->NewNode(slice_out_repr()) - ->AsOutput() - ->assert_is_op_output("slice", "Out"); - - prev_op->LinksTo({slice_in}); - slice_op->LinksFrom({slice_in}).LinksTo({slice_out}); - return slice_out; -} - -PDNode *patterns::NearestInterp::operator()() { - auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); - - auto nearest_interp_op = - pattern->NewNode(nearest_interp_op_repr()) - ->assert_is_ops({"nearest_interp", "nearest_interp_v2"}); - - auto nearest_interp_in = - pattern->NewNode(nearest_interp_in_repr()) - ->AsInput() - ->assert_is_ops_input({"nearest_interp", "nearest_interp_v2"}, "X"); - auto nearest_interp_out = - pattern->NewNode(nearest_interp_out_repr()) - ->AsOutput() - ->assert_is_ops_output({"nearest_interp", "nearest_interp_v2"}, - "Out"); - - prev_op->LinksTo({nearest_interp_in}); - nearest_interp_op->LinksFrom({nearest_interp_in}) - .LinksTo({nearest_interp_out}); - return nearest_interp_out; + prev_op->LinksTo({immutable_in}); + immutable_op->LinksFrom({immutable_in}).LinksTo({immutable_out}); + 
return immutable_out; } PDNode *patterns::Matmul::operator()() { @@ -2112,7 +1980,7 @@ PDNode *patterns::Pool::operator()() { PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, - const std::string elementwise_type) { + const std::string &elementwise_type) { auto elementwise_op = pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); @@ -2129,7 +1997,7 @@ PDNode *patterns::Elementwise::operator()(PDNode *x_var, } PDNode *patterns::ElementwiseOp::operator()( - const std::string elementwise_type) { + const std::string &elementwise_type) { auto elementwise_op = pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); @@ -2145,7 +2013,7 @@ PDNode *patterns::ElementwiseOp::operator()( PDNode *patterns::ResidualElementwise::operator()( PDNode *op_var, PDNode *residual_var, - const std::string elementwise_type, + const std::string &elementwise_type, bool as_x) { auto elementwise_op = pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); @@ -2375,7 +2243,12 @@ PDNode *patterns::PriorBox::operator()() { return boxes_var; } +#if CUDNN_VERSION >= 8000 +std::unordered_set conv_act_set( + {"identity", "relu", "sigmoid", "tanh"}); +#else std::unordered_set conv_act_set({"identity", "relu"}); +#endif PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0cc216b6e0de2..09dd426be2daf 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -406,10 +406,18 @@ struct KeyCounter { return x; } +#ifdef PADDLE_WITH_TENSORRT + static int IncCounter(const std::string& key) { return dic_[key]++; } + static void CleanCounter() { dic_.clear(); } + + private: + static thread_local std::unordered_map dic_; +#else int IncCounter(const std::string& key) { return dic_[key]++; } private: std::unordered_map dic_; +#endif }; // Generate a unique PDNode's name with name_scope and id. 
@@ -516,49 +524,16 @@ struct ConvBN : public PatternBase { PATTERN_DECL_NODE(bn_saved_variance); }; -// Conv with Activation -// op: conv + activation -// named nodes: -// conv_input, conv_weight, -// conv_out, conv, -// activation_out, activation -struct ConvActivation : public PatternBase { - ConvActivation(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "conv_activation") {} - - PDNode* operator()(PDNode* conv_input, - std::string conv_type = "conv2d", - std::string activation_type = "relu"); +struct OperatorActivation : public PatternBase { + OperatorActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "operator_activation") {} - // declare operator node's name - PATTERN_DECL_NODE(conv); - PATTERN_DECL_NODE(activation); - // declare variable node's name - PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_out); - PATTERN_DECL_NODE(activation_out); -}; - -// Elementwise with Activation -// op: elementwise + activation -// named nodes: -// elementwise_a, elementwise_b, -// elementwise_out, elementwise, -// activation_out, activation -struct ElementwiseActivation : public PatternBase { - ElementwiseActivation(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add_activation") {} - - PDNode* operator()(PDNode* elementwise_a, - const std::string& elementwise_type, + PDNode* operator()(const std::string& operator_type, const std::string& activation_type); - // declare operator node's name - PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(preceding_op); + PATTERN_DECL_NODE(preceding_op_out); PATTERN_DECL_NODE(activation); - // declare variable node's name - PATTERN_DECL_NODE(elementwise_b); - PATTERN_DECL_NODE(elementwise_out); PATTERN_DECL_NODE(activation_out); }; @@ -631,45 +606,6 @@ struct FCMKLDNN : public PatternBase { PATTERN_DECL_NODE(output); }; -// -// \brief Pattern looking for fc and a directly following activation -// operator. -// -// \note Currently only gelu and tanh are supported as an activation -// function. 
-// Formula: act(fc(x)) -// Op: fc + act -struct FCActOneDNN : public PatternBase { - FCActOneDNN(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "fc_act_onednn") {} - - PDNode* operator()(const std::string& act_type); - - // declare operator node's name - PATTERN_DECL_NODE(fc); - PATTERN_DECL_NODE(act); - PATTERN_DECL_NODE(fc_out); - PATTERN_DECL_NODE(act_out); -}; - -// Fuse softplus with activation -// ops: softplus + activation -// nodes: -// softplus, softplus_out, -// activation, activation_out -struct SoftplusActivation : public PatternBase { - SoftplusActivation(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "softplus_activation") {} - - PDNode* operator()(std::string activation_type); - - // declare operator node's name - PATTERN_DECL_NODE(softplus); - PATTERN_DECL_NODE(activation); - PATTERN_DECL_NODE(softplus_out); - PATTERN_DECL_NODE(activation_out); -}; - // Embedding struct Embedding : public PatternBase { Embedding(PDPattern* pattern, const std::string& name_scope) @@ -1079,7 +1015,7 @@ struct Elementwise : public PatternBase { PDNode* operator()(PDNode* x_var, PDNode* y_var, - const std::string elementwise_type); + const std::string& elementwise_type); PATTERN_DECL_NODE(elementwise_op); PATTERN_DECL_NODE(elementwise_x); @@ -1094,7 +1030,7 @@ struct ElementwiseOp : public PatternBase { ElementwiseOp(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "elementwise") {} - PDNode* operator()(const std::string elementwise_type); + PDNode* operator()(const std::string& elementwise_type); PATTERN_DECL_NODE(elementwise_op); PATTERN_DECL_NODE(elementwise_out); @@ -1110,7 +1046,7 @@ struct ResidualElementwise : public PatternBase { : PatternBase(pattern, name_scope, "residual_elementwise") {} PDNode* operator()(PDNode* op_var, PDNode* residual_var, - const std::string elementwise_type, + const std::string& elementwise_type, bool as_x); PATTERN_DECL_NODE(operator_output); @@ -1119,59 +1055,20 @@ struct ResidualElementwise : public PatternBase { PATTERN_DECL_NODE(elementwise_out); }; -// Transpose op -// Forward pass for transpose. -// transpose_out is a result of the operator. -struct Transpose : public PatternBase { - Transpose(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "transpose2") {} +// General struct for immutable ops: +// reshape, transpose, slice, shape, nearest-interp +// Forward pass for no weights-op. +// immutable_out is a result of the operator. +struct Immutable : public PatternBase { + Immutable(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "immutable") {} - PDNode* operator()(); - PATTERN_DECL_NODE(prev_op); - PATTERN_DECL_NODE(transpose_in); - PATTERN_DECL_NODE(transpose_op); - PATTERN_DECL_NODE(transpose_out); -}; - -// Reshape op -// Forward pass for reshape. -// reshape_out is a result of the operator. -struct Reshape : public PatternBase { - Reshape(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "reshape2") {} - - PDNode* operator()(); - PATTERN_DECL_NODE(prev_op); - PATTERN_DECL_NODE(reshape_in); - PATTERN_DECL_NODE(reshape_op); - PATTERN_DECL_NODE(reshape_out); -}; -// Slice op -// Forward pass for slice. -// slice_out is a result of the operator. 
-struct Slice : public PatternBase { - Slice(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "slice") {} - - PDNode* operator()(); - PATTERN_DECL_NODE(prev_op); - PATTERN_DECL_NODE(slice_in); - PATTERN_DECL_NODE(slice_op); - PATTERN_DECL_NODE(slice_out); -}; - -// Nearest Interp op -// Forward pass for nearest_interp. -// nearest_interp_out is a result of the operator. -struct NearestInterp : public PatternBase { - NearestInterp(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "nearest_interp") {} - - PDNode* operator()(); + PDNode* operator()(const std::string& immutable_type, + const std::string& input_name); PATTERN_DECL_NODE(prev_op); - PATTERN_DECL_NODE(nearest_interp_in); - PATTERN_DECL_NODE(nearest_interp_op); - PATTERN_DECL_NODE(nearest_interp_out); + PATTERN_DECL_NODE(immutable_in); + PATTERN_DECL_NODE(immutable_op); + PATTERN_DECL_NODE(immutable_out); }; // Matmul op diff --git a/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc new file mode 100644 index 0000000000000..f06f05e9f0242 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/phi/common/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferenceDtypeTransferPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferenceDtypeTransferPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto* ipu_backend = platform::ipu::IpuBackend::GetInstance(); + auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; + + if (enable_fp16) { + VLOG(10) << "Transfer var to fp16..."; + auto* scope = ipu_backend->GetScope(); + + std::unordered_set used_var_names; + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + auto var_desc = node->Var(); + if (var_desc->GetDataType() == proto::VarType::FP32) { + // Transfer the dtypes of var_desc + var_desc->SetDataType(proto::VarType::FP16); + VLOG(10) << "Transfer the VarDesc of " << var_desc->Name() << " to " + << var_desc->GetDataType(); + + if (node->inputs.empty() && node->Var()->Persistable() && + scope->FindVar(var_desc->Name()) && + used_var_names.find(var_desc->Name()) == used_var_names.end()) { + // Transfer the dtypes of weight tensors + std::vector fp16_data; + auto* tensor = scope->FindVar(var_desc->Name()) + ->GetMutable(); + auto* data_ptr = tensor->data(); + auto num_elem = tensor->numel(); + + std::transform(data_ptr, + data_ptr + num_elem, + std::back_inserter(fp16_data), + [&](float elem) { return float16(elem); }); + memcpy(reinterpret_cast(data_ptr), + fp16_data.data(), + num_elem * sizeof(float16)); + tensor->set_type( + framework::TransToPhiDataType(proto::VarType::FP16)); + } + } + used_var_names.insert(var_desc->Name()); + } + if (node->IsOp()) { + auto* op_desc = node->Op(); + if (op_desc->Type() == "popart_cast") { + // Transfer the target dtype of cast Op + if (BOOST_GET_CONST(std::string, op_desc->GetAttr("to")) == "FLOAT") { + op_desc->SetAttr("to", std::string("FLOAT16")); + op_desc->Flush(); + } + } + if (op_desc->Type() == "popart_constant") { + // Transfer the dtype of fill_constant Op + if (op_desc->GetAttrIfExists("dtype") == 1) { + op_desc->SetAttr("dtype", 10); + op_desc->Flush(); + } + } + } + } + VLOG(10) << "Transfer var to fp16...Done"; + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave InferenceDtypeTransferPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inference_dtype_transfer_pass, + paddle::framework::ir::InferenceDtypeTransferPass); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cu b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h similarity index 55% rename from paddle/fluid/operators/optimizers/merged_momentum_op.cu rename to paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h index 7e4bbd9807938..3111968ea2bba 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cu +++ b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,13 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#pragma once -namespace ops = paddle::operators; -namespace plat = paddle::platform; +#include "paddle/fluid/framework/ir/pass.h" -REGISTER_OP_CUDA_KERNEL( - merged_momentum, - ops::MergedMomentumOpKernel, - ops::MergedMomentumOpKernel, - ops::MergedMomentumOpKernel); +namespace paddle { +namespace framework { +namespace ir { + +class InferenceDtypeTransferPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index 8357ec05c24f6..1ef03b1bd9cfb 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -90,6 +90,9 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { ipu_strategy_instance_->available_memory_proportion = graph->Get("available_memory_proportion"); + // Set tiles_per_ipu for IPUMODEL + ipu_strategy_instance_->tiles_per_ipu = 128; + ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); // Get feed_list and fetch list @@ -124,7 +127,8 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { std::vector graph_pass = {"forward_graph_extract_pass", "infer_shape_pass", "avg_shard_pass", - "popart_canonicalization_pass"}; + "popart_canonicalization_pass", + "inference_dtype_transfer_pass"}; std::vector compile_pass = {"ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc index 65ebd3ec8080d..3dc9f3d10d920 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc @@ -37,6 +37,9 @@ void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const { ipu_rt_op_desc.SetInput("FeedList", feed_list); ipu_rt_op_desc.SetOutput("FetchList", fetch_list); ipu_rt_op_desc.Flush(); + // set op_role to avoid program.clone failure + ipu_rt_op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + {static_cast(framework::OpRole::kForward)}); // Create a new node for the ipu_runtime_op. 
auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc); diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index 6806e44f09505..222ca619c223f 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -30,8 +30,13 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { auto custom_ops = Get>("custom_ops"); std::vector missing_ops; auto sorted_ops = TopologySortOperations(*graph); + std::unordered_set delete_nodes; for (auto* node : sorted_ops) { auto* op = node->Op(); + if (platform::ipu::IsMarkedForDeletion(node)) { + delete_nodes.insert(node); + continue; + } auto op_type = op->Type(); ir::Node* new_node = nullptr; @@ -67,6 +72,12 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { "Found unimplemented op_handler(s) for IPU")); } + for (auto* node : delete_nodes) { + // TODO(czr): possible remove + platform::ipu::ClearNode(node); + graph->RemoveNode(node); + } + // post popart_canonicalization VLOG(10) << "Post Graph: "; diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index ed4d586b8bb5c..99eaab49b7926 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -372,7 +372,7 @@ std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( const auto op_name = op_node->Name(); if (scale_immutable_ops.count(op_name)) { std::string input_name; - if (op_name == "slice") { + if (op_name == "slice" || op_name == "shape") { input_name = op_node->Op()->Input("Input")[0]; } else { input_name = op_node->Op()->Input("X")[0]; @@ -390,6 +390,13 @@ std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( } else if (out_iter != var_quant_scales->end()) { (*var_quant_scales)[input_name] = out_iter->second; } + } else if (op_name == "concat") { + auto out_iter = var_quant_scales->find(op_node->Op()->Output("Out")[0]); + if (out_iter != var_quant_scales->end()) { + std::vector input_names = op_node->Op()->Input("X"); + for (auto input_name : input_names) + (*var_quant_scales)[input_name] = out_iter->second; + } } else if (op_name == "scale") { const std::string output_name = op_node->Op()->Output("Out")[0]; auto out_iter = var_quant_scales->find(output_name); @@ -445,6 +452,7 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { "reshape2", "pool2d", "slice", + "shape", "nearest_interp", "nearest_interp_v2"}; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 4eefc2987bcb4..bd07967757b8a 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -24,61 +25,26 @@ namespace ir { using string::PrettyLogDetail; void ConvActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { - std::vector act_types = {"relu", - "mish", - "swish", - "sqrt", - "hard_swish", - "sigmoid", - "abs", - 
"gelu", - "relu6", - "clip", - "tanh", - "hard_sigmoid", - "leaky_relu"}; - + auto act_types = paddle::platform::GetSupportedActivations(); std::vector conv_types = {"conv2d"}; for (const auto& conv_type : conv_types) for (auto& act_type : act_types) { - std::unordered_map attrs_map; - - if (act_type == "swish") - attrs_map.emplace("beta", "fuse_alpha"); - else if (act_type == "relu6") - attrs_map.emplace("threshold", "fuse_alpha"); - else if (act_type == "hard_sigmoid") { - attrs_map.emplace("slope", "fuse_alpha"); - attrs_map.emplace("offset", "fuse_beta"); - } else if (act_type == "clip") { - attrs_map.emplace("min", "fuse_alpha"); - attrs_map.emplace("max", "fuse_beta"); - } else { - attrs_map.emplace("alpha", "fuse_alpha"); - attrs_map.emplace("beta", "fuse_beta"); - } - FuseConvAct(graph, conv_type, act_type, attrs_map); + FuseConvAct(graph, conv_type, act_type); } } -void ConvActivationMkldnnFusePass::FuseConvAct( - Graph* graph, - const std::string& conv_type, - std::string& act_type, - const std::unordered_map& attrs_map) const { +void ConvActivationMkldnnFusePass::FuseConvAct(Graph* graph, + const std::string& conv_type, + std::string& act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(conv_type + "_" + act_type + "_mkldnn_fuse_pass", graph); GraphPatternDetector gpd; - auto* conv_input = gpd.mutable_pattern() - ->NewNode("conv_activation_mkldnn_fuse/conv_input") - ->AsInput() - ->assert_is_op_input(conv_type, "Input"); - patterns::ConvActivation conv_act_pattern(gpd.mutable_pattern(), - "conv_activation_mkldnn_fuse"); - conv_act_pattern(conv_input, conv_type, act_type); + patterns::OperatorActivation conv_act_pattern(gpd.mutable_pattern(), + "conv_activation_mkldnn_fuse"); + conv_act_pattern(conv_type, act_type); int found_conv_activation_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -90,16 +56,16 @@ void ConvActivationMkldnnFusePass::FuseConvAct( return; } - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, conv_act_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_act_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_act_pattern); - GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, conv_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv, preceding_op, conv_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_out, preceding_op_out, conv_act_pattern); GET_IR_NODE_FROM_SUBGRAPH(activation, activation, conv_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, conv_act_pattern); OpDesc* conv_op = conv->Op(); OpDesc* act_op = activation->Op(); - for (const auto& attrs : attrs_map) { + auto attr_map = paddle::platform::GetAttributeMap(act_type); + for (const auto& attrs : attr_map) { if (act_op->HasAttr(attrs.first)) { conv_op->SetAttr(attrs.second, act_op->GetAttr(attrs.first)); } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h index e1e2898384609..11925e1992df4 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h @@ -31,11 +31,9 @@ class ConvActivationMkldnnFusePass : public FusePassBase { protected: void ApplyImpl(Graph *graph) const override; - void FuseConvAct( - Graph *graph, - const std::string &conv_type, - std::string &act_type, - const std::unordered_map &attrs_map) const; + void FuseConvAct(Graph *graph, + const 
std::string &conv_type, + std::string &act_type) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index b0d41c16f5e98..7cfc3f3336d5f 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -669,165 +669,68 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const { LogQuantizedOpsCounter("prior_box", quantize_prior_box_count); } -void CPUQuantizePass::QuantizeTranspose(Graph* graph) const { +void CPUQuantizePass::QuantizeImmutable(Graph* graph, + const std::string& immutable_type, + const std::string& input_name) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::Transpose transpose_pattern{pattern, name_scope_}; - transpose_pattern(); + patterns::Immutable immutable_pattern{pattern, name_scope_}; + immutable_pattern(immutable_type, input_name); - int quantize_transpose_count = 0; + int quantize_immutable_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize transpose op"; - GET_IR_NODE_FROM_SUBGRAPH(transpose_op, transpose_op, transpose_pattern); + VLOG(4) << "Quantize " + immutable_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(immutable_op, immutable_op, immutable_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(transpose_op->Op())) { - LogQuantizationDisabled(transpose_op); + if (!platform::HasOpINT8DataType(immutable_op->Op())) { + LogQuantizationDisabled(immutable_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern); - GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern); - GET_IR_NODE_FROM_SUBGRAPH(transpose_out, transpose_out, transpose_pattern); + GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, immutable_pattern); + GET_IR_NODE_FROM_SUBGRAPH(immutable_in, immutable_in, immutable_pattern); + GET_IR_NODE_FROM_SUBGRAPH(immutable_out, immutable_out, immutable_pattern); // skip if prev op and next op is not quantized - if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(transpose_out))) { - MarkAndLogCannotQuantizeOp(transpose_op, + if (!IsOpDequantized(prev_op) && !IsOpQuantized(immutable_out)) { + MarkAndLogCannotQuantizeOp(immutable_op, "No other quantizable operators nearby"); return; } - if (!AreScalesPresentForNodes({transpose_in, transpose_out})) { - MarkAndLogCannotQuantizeOp(transpose_op, + if (!AreScalesPresentForNodes({immutable_out})) { + MarkAndLogCannotQuantizeOp(immutable_op, "No scale available for the operator"); return; } bool is_input_unsigned{false}; - auto input_scale = GetScaleValueForNode(transpose_in, &is_input_unsigned); - QuantizeInput( - g, transpose_op, transpose_in, "X", input_scale, is_input_unsigned); + auto input_scale = GetScaleValueForNode(immutable_out, &is_input_unsigned); + + QuantizeInput(g, + immutable_op, + immutable_in, + input_name, + input_scale, + is_input_unsigned); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(transpose_out, &is_output_unsigned); + GetScaleValueForNode(immutable_out, &is_output_unsigned); DequantizeOutput(g, - transpose_op, - transpose_out, + immutable_op, + immutable_out, "Out", output_scale, is_output_unsigned); - ++quantize_transpose_count; + ++quantize_immutable_count; }; gpd(graph, handler); - AddStatis(quantize_transpose_count); - LogQuantizedOpsCounter("transpose2", quantize_transpose_count); -} - -void CPUQuantizePass::QuantizeReshape(Graph* graph) 
const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::Reshape reshape_pattern{pattern, name_scope_}; - reshape_pattern(); - - int quantize_reshape_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Quantize reshape op"; - GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, reshape_pattern); - - // skip if should not be quantized - if (!platform::HasOpINT8DataType(reshape_op->Op())) { - LogQuantizationDisabled(reshape_op); - return; - } - GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, reshape_pattern); - GET_IR_NODE_FROM_SUBGRAPH(reshape_in, reshape_in, reshape_pattern); - GET_IR_NODE_FROM_SUBGRAPH(reshape_out, reshape_out, reshape_pattern); - - // skip if prev op is not quantized - if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(reshape_out))) { - MarkAndLogCannotQuantizeOp(reshape_op, - "No other quantizable operators nearby"); - return; - } - - if (!AreScalesPresentForNodes({reshape_in, reshape_out})) { - MarkAndLogCannotQuantizeOp(reshape_op, - "No scale available for the operator"); - return; - } - - bool is_input_unsigned{false}; - auto input_scale = GetScaleValueForNode(reshape_in, &is_input_unsigned); - QuantizeInput( - g, reshape_op, reshape_in, "X", input_scale, is_input_unsigned); - - bool is_output_unsigned{false}; - auto output_scale = GetScaleValueForNode(reshape_out, &is_output_unsigned); - DequantizeOutput( - g, reshape_op, reshape_out, "Out", output_scale, is_output_unsigned); - - ++quantize_reshape_count; - }; - - gpd(graph, handler); - AddStatis(quantize_reshape_count); - LogQuantizedOpsCounter("reshape2", quantize_reshape_count); -} - -void CPUQuantizePass::QuantizeSlice(Graph* graph) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::Slice slice_pattern{pattern, name_scope_}; - slice_pattern(); - - int quantize_slice_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Quantize slice op"; - GET_IR_NODE_FROM_SUBGRAPH(slice_op, slice_op, slice_pattern); - - // skip if should not be quantized - if (!platform::HasOpINT8DataType(slice_op->Op())) { - LogQuantizationDisabled(slice_op); - return; - } - GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, slice_pattern); - GET_IR_NODE_FROM_SUBGRAPH(slice_in, slice_in, slice_pattern); - GET_IR_NODE_FROM_SUBGRAPH(slice_out, slice_out, slice_pattern); - - // skip if prev op and next op is not quantized - if (!IsOpDequantized(prev_op) && !IsOpQuantized(slice_out)) { - MarkAndLogCannotQuantizeOp(slice_op, - "No other quantizable operators nearby"); - return; - } - - if (!AreScalesPresentForNodes({slice_out})) { - MarkAndLogCannotQuantizeOp(slice_op, - "No scale available for the operator"); - return; - } - - bool is_input_unsigned{false}; - auto input_scale = GetScaleValueForNode(slice_out, &is_input_unsigned); - QuantizeInput( - g, slice_op, slice_in, "Input", input_scale, is_input_unsigned); - - bool is_output_unsigned{false}; - auto output_scale = GetScaleValueForNode(slice_out, &is_output_unsigned); - DequantizeOutput( - g, slice_op, slice_out, "Out", output_scale, is_output_unsigned); - - ++quantize_slice_count; - }; - - gpd(graph, handler); - AddStatis(quantize_slice_count); - LogQuantizedOpsCounter("slice", quantize_slice_count); + AddStatis(quantize_immutable_count); + LogQuantizedOpsCounter(immutable_type, quantize_immutable_count); } void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { @@ -915,7 +818,7 @@ void 
CPUQuantizePass::QuantizeMatmul(Graph* graph) const { } void CPUQuantizePass::QuantizeElementwise( - Graph* graph, const std::string elementwise_type) const { + Graph* graph, const std::string& elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::ElementwiseOp elementwise_pattern{pattern, name_scope_}; @@ -1212,71 +1115,6 @@ void CPUQuantizePass::QuantizeFusionLSTM(Graph* graph) const { LogQuantizedOpsCounter("fusion_lstm", quantize_count); } -void CPUQuantizePass::QuantizeNearestInterp(Graph* graph) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::NearestInterp nearest_interp_pattern{pattern, name_scope_}; - nearest_interp_pattern(); - - int quantize_nearest_interp_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Quantize nearest_interp op"; - GET_IR_NODE_FROM_SUBGRAPH( - nearest_interp_op, nearest_interp_op, nearest_interp_pattern); - - // skip if should not be quantized - if (!platform::HasOpINT8DataType(nearest_interp_op->Op())) { - LogQuantizationDisabled(nearest_interp_op); - return; - } - GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, nearest_interp_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - nearest_interp_in, nearest_interp_in, nearest_interp_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - nearest_interp_out, nearest_interp_out, nearest_interp_pattern); - - // skip if prev op and next op is not quantized - if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(nearest_interp_out))) { - MarkAndLogCannotQuantizeOp(nearest_interp_op, - "No other quantizable operators nearby"); - return; - } - - if (!AreScalesPresentForNodes({nearest_interp_in, nearest_interp_out})) { - MarkAndLogCannotQuantizeOp(nearest_interp_op, - "No scale available for the operator"); - return; - } - - bool is_input_unsigned{false}; - auto input_scale = - GetScaleValueForNode(nearest_interp_in, &is_input_unsigned); - QuantizeInput(g, - nearest_interp_op, - nearest_interp_in, - "X", - input_scale, - is_input_unsigned); - - bool is_output_unsigned{false}; - auto output_scale = - GetScaleValueForNode(nearest_interp_out, &is_output_unsigned); - DequantizeOutput(g, - nearest_interp_op, - nearest_interp_out, - "Out", - output_scale, - is_output_unsigned); - - ++quantize_nearest_interp_count; - }; - - gpd(graph, handler); - AddStatis(quantize_nearest_interp_count); - LogQuantizedOpsCounter("nearest_interp", quantize_nearest_interp_count); -} - void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; PADDLE_ENFORCE_NOT_NULL( @@ -1293,18 +1131,20 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizePool(graph); QuantizeConcat(graph); QuantizePriorBox(graph); - QuantizeTranspose(graph); QuantizeFc(graph); - QuantizeReshape(graph); QuantizeMatmul(graph); + QuantizeImmutable(graph, "reshape2", "X"); + QuantizeImmutable(graph, "transpose2", "X"); + QuantizeImmutable(graph, "slice", "Input"); + QuantizeImmutable(graph, "shape", "Input"); + QuantizeImmutable(graph, "nearest_interp", "X"); + QuantizeImmutable(graph, "nearest_interp_v2", "X"); QuantizeElementwise(graph, "elementwise_add"); QuantizeElementwise(graph, "elementwise_mul"); QuantizeElementwise(graph, "elementwise_sub"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); - QuantizeSlice(graph); - QuantizeNearestInterp(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h 
b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index a880907402b3c..56909b7fe7fb5 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -54,16 +54,15 @@ class CPUQuantizePass : public FusePassBase { void QuantizePool(Graph* graph) const; void QuantizeConcat(Graph* graph) const; void QuantizePriorBox(Graph* graph) const; - void QuantizeTranspose(Graph* graph) const; - void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; void QuantizeElementwise(Graph* graph, - const std::string elementwise_type) const; + const std::string& elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; - void QuantizeSlice(Graph* graph) const; - void QuantizeNearestInterp(Graph* graph) const; + void QuantizeImmutable(Graph* graph, + const std::string& immutable_type, + const std::string& input_name) const; void QuantizeInput(Graph* g, Node* op, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 4fa79f6a87ca8..ec7432e83f874 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -66,7 +66,7 @@ void SetOp(ProgramDesc* prog, type == "nearest_interp" || type == "nearest_interp_v2") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); - } else if (type == "slice") { + } else if (type == "slice" || type == "shape") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); } else if (type == "dropout") { @@ -550,55 +550,33 @@ void TestImmutableOpWithManyOutputs(const std::string tested_op) { SCALE * S8_MAX); } -TEST(CpuQuantizePass, reshape2) { TestImmutableOp("reshape2"); } +const std::vector immutables = {"reshape2", + "transpose2", + "slice", + "shape", + "nearest_interp", + "nearest_interp_v2"}; -TEST(CpuQuantizePass, reshape2BetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("reshape2"); -} - -TEST(CpuQuantizePass, reshape2WithManyOutputs) { - TestImmutableOpWithManyOutputs("reshape2"); -} - -TEST(CpuQuantizePass, transpose2) { TestImmutableOp("transpose2"); } - -TEST(CpuQuantizePass, transpose2BetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("transpose2"); -} - -TEST(CpuQuantizePass, transpose2WithManyOutputs) { - TestImmutableOpWithManyOutputs("transpose2"); -} - -TEST(CpuQuantizePass, slice) { TestImmutableOp("slice"); } - -TEST(CpuQuantizePass, sliceBetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("slice"); -} - -TEST(CpuQuantizePass, sliceWithManyOutputs) { - TestImmutableOpWithManyOutputs("slice"); -} +class TestImmutables : public testing::TestWithParam {}; -TEST(CpuQuantizePass, nearestInterp) { TestImmutableOp("nearest_interp"); } - -TEST(CpuQuantizePass, nearestInterpBetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("nearest_interp"); -} +TEST_P(TestImmutables, immutable_basic) { TestImmutableOp(GetParam()); } -TEST(CpuQuantizePass, nearestInterpWithManyOutputs) { - TestImmutableOpWithManyOutputs("nearest_interp"); +TEST_P(TestImmutables, immutable_between_non_quantized) { + TestImmutableOpBetweenNonQuantizedOp(GetParam()); } -TEST(CpuQuantizePass, nearestInterpV2) { TestImmutableOp("nearest_interp_v2"); } - -TEST(CpuQuantizePass, nearestInterpV2BetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("nearest_interp_v2"); 
+TEST_P(TestImmutables, immutable_many_outputs) { + TestImmutableOpWithManyOutputs(GetParam()); } -TEST(CpuQuantizePass, nearestInterpV2WithManyOutputs) { - TestImmutableOpWithManyOutputs("nearest_interp_v2"); -} +INSTANTIATE_TEST_CASE_P( + CpuQuantizePass, + TestImmutables, + testing::ValuesIn(immutables), + [](const ::testing::TestParamInfo& info) { + std::string name = info.param; + return name; + }); static const std::initializer_list variable_names_matmul = { "a", "b", "c", "d", "e", "f"}; @@ -735,7 +713,7 @@ TEST_P(TestElementwises, elementwise_unsigned_and_signed_input) { } INSTANTIATE_TEST_CASE_P( - Elementwises, + CpuQuantizePass, TestElementwises, testing::ValuesIn(elementwises), [](const ::testing::TestParamInfo& info) { diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc index a96ce5e297a87..b28b07924d888 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -26,71 +27,40 @@ namespace ir { using string::PrettyLogDetail; void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { - std::vector act_types = {"relu", - "tanh", - "leaky_relu", - "swish", - "hard_swish", - "sqrt", - "abs", - "clip", - "gelu", - "relu6", - "sigmoid"}; + auto act_types = paddle::platform::GetSupportedActivations(); std::vector elt_types = { "elementwise_add", "elementwise_sub", "elementwise_mul"}; for (const auto &elt_type : elt_types) for (const auto &act_type : act_types) { - std::unordered_map attr_map; - - if (act_type == "swish") - attr_map.emplace("beta", "activation_alpha"); - else if (act_type == "relu6") - attr_map.emplace("threshold", "activation_alpha"); - else if (act_type == "clip") { - attr_map.emplace("min", "activation_alpha"); - attr_map.emplace("max", "activation_beta"); - } else { - attr_map.emplace("alpha", "activation_alpha"); - attr_map.emplace("beta", "activation_beta"); - } - FuseElementwiseAct(graph, elt_type, act_type, attr_map); + FuseElementwiseAct(graph, elt_type, act_type); } } void ElementwiseActivationOneDNNPass::FuseElementwiseAct( Graph *graph, const std::string &elt_type, - const std::string &act_type, - const std::unordered_map &attr_map) const { + const std::string &act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(elt_type + "_" + act_type + "_mkldnn_fuse_pass", graph); GraphPatternDetector gpd; - auto *elementwise_input = gpd.mutable_pattern() - ->NewNode(elt_type + "_act/elementwise_input") - ->AsInput() - ->assert_is_op_input(elt_type, "X"); - patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(), - elt_type + "_act"); - elementwise_act_pattern(elementwise_input, elt_type, act_type); + patterns::OperatorActivation elementwise_act_pattern(gpd.mutable_pattern(), + elt_type + "_act"); + elementwise_act_pattern(elt_type, act_type); int found_elementwise_activation_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { VLOG(4) << "Fuse " << elt_type << " with activation op."; - // Elementwise output - GET_IR_NODE_FROM_SUBGRAPH( - elementwise_out, 
elementwise_out, elementwise_act_pattern); - // ACT output GET_IR_NODE_FROM_SUBGRAPH( - activation_out, activation_out, elementwise_act_pattern); - // ops + elementwise, preceding_op, elementwise_act_pattern); GET_IR_NODE_FROM_SUBGRAPH( - elementwise, elementwise, elementwise_act_pattern); + elementwise_out, preceding_op_out, elementwise_act_pattern); GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + activation_out, activation_out, elementwise_act_pattern); auto *elementwise_op = elementwise->Op(); @@ -106,6 +76,7 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( } auto *activation_op = activation->Op(); + auto attr_map = paddle::platform::GetAttributeMap(act_type); for (const auto &attr : attr_map) { if (activation_op->HasAttr(attr.first)) { elementwise_op->SetAttr(attr.second, @@ -115,9 +86,9 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( if (act_type == "gelu" && activation_op->HasAttr("approximate") && BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) - elementwise_op->SetAttr("activation_type", std::string("gelu_tanh")); + elementwise_op->SetAttr("fuse_activation", std::string("gelu_tanh")); else - elementwise_op->SetAttr("activation_type", act_type); + elementwise_op->SetAttr("fuse_activation", act_type); elementwise_op->SetOutput("Out", {activation_out->Name()}); @@ -128,10 +99,11 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( gpd(graph, handler); AddStatis(found_elementwise_activation_count); - PrettyLogDetail("--- fused %d %s with %s activation", - found_elementwise_activation_count, - elt_type, - act_type); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d %s with %s activation", + found_elementwise_activation_count, + elt_type, + act_type); } } // namespace ir @@ -146,14 +118,16 @@ REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) .LE("elementwise_add", 1) .LE("elementwise_sub", 1) .LE("elementwise_mul", 1) - .LE("relu", 0) - .LE("tanh", 0) - .LE("leaky_relu", 1) - .LE("swish", 0) - .LE("hard_swish", 0) - .LE("sqrt", 0) - .LE("abs", 0) + .EQ("abs", 0) .LE("clip", 1) - .LE("gelu", 0) - .LE("relu6", 0) - .LE("sigmoid", 0)); + .EQ("gelu", 0) + .EQ("hard_sigmoid", 0) + .LE("hard_swish", 0) + .LE("leaky_relu", 1) + .LE("mish", 1) + .EQ("relu", 0) + .EQ("relu6", 0) + .EQ("sigmoid", 0) + .EQ("sqrt", 0) + .EQ("swish", 0) + .EQ("tanh", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h index 8df479e3ddf06..37bd5345ec78f 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h @@ -34,11 +34,9 @@ class ElementwiseActivationOneDNNPass : public FusePassBase { protected: void ApplyImpl(Graph *graph) const override; - void FuseElementwiseAct( - Graph *graph, - const std::string &elt_types, - const std::string &act_types, - const std::unordered_map &attr_map) const; + void FuseElementwiseAct(Graph *graph, + const std::string &elt_types, + const std::string &act_types) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index 99243ec7d7047..e5031c83aac16 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -39,20 +39,17 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, 
FusePassBase::Init("fc_act", graph); GraphPatternDetector gpd; - patterns::FCActOneDNN fc_act_pattern(gpd.mutable_pattern(), "fc_act"); - fc_act_pattern(act_type); + patterns::OperatorActivation fc_act_pattern(gpd.mutable_pattern(), "fc_act"); + fc_act_pattern("fc", act_type); int found_fc_act_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { VLOG(4) << "Fuse fc with activation op."; - // FC output - GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fc_act_pattern); - // ACT output - GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, fc_act_pattern); - // ops - GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_act_pattern); - GET_IR_NODE_FROM_SUBGRAPH(act, act, fc_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc, preceding_op, fc_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, preceding_op_out, fc_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, activation, fc_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, activation_out, fc_act_pattern); auto *fc_op = fc->Op(); auto *act_op = act->Op(); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..80f49c97e8465 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc @@ -0,0 +1,281 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h" + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void MatmulActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { + auto act_types = paddle::platform::GetSupportedActivations(); + std::vector matmul_types = {"matmul"}; + + for (const auto& matmul_type : matmul_types) + for (auto& act_type : act_types) { + FuseMatmulAct(graph, matmul_type, act_type); + } +} + +void MatmulActivationMkldnnFusePass::FuseMatmulAct( + Graph* graph, const std::string& matmul_type, std::string& act_type) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(matmul_type + "_" + act_type + "_mkldnn_fuse_pass", graph); + + GraphPatternDetector gpd; + patterns::OperatorActivation matmul_act_pattern( + gpd.mutable_pattern(), "matmul_activation_mkldnn_fuse"); + matmul_act_pattern(matmul_type, act_type); + + int found_matmul_activation_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle " + matmul_type + "+" + act_type + " fuse"; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "matmul_activation_mkldnn_fuse_pass op compat failed."; + return; + } + + GET_IR_NODE_FROM_SUBGRAPH(matmul, preceding_op, matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, preceding_op_out, matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + activation_out, activation_out, matmul_act_pattern); + + OpDesc* matmul_op = matmul->Op(); + OpDesc* act_op = activation->Op(); + + auto attr_map = paddle::platform::GetAttributeMap(act_type); + for (const auto& attrs : attr_map) { + if (act_op->HasAttr(attrs.first)) { + matmul_op->SetAttr(attrs.second, act_op->GetAttr(attrs.first)); + } + } + + if (act_type == "gelu" && activation->Op()->HasAttr("approximate")) { + act_type = BOOST_GET_CONST(bool, activation->Op()->GetAttr("approximate")) + ? 
"gelu_tanh" + : "gelu_erf"; + } + matmul_op->SetAttr("fuse_activation", act_type); + matmul_op->SetOutput("Out", {activation_out->Name()}); + + IR_NODE_LINK_TO(matmul, activation_out); + GraphSafeRemoveNodes(graph, {activation, matmul_out}); + found_matmul_activation_count++; + }; + + gpd(graph, handler); + AddStatis(found_matmul_activation_count); + if (!Has("disable_logs") || !Get("disable_logs")) { + PrettyLogDetail("--- fused %d matmul with %s activation", + found_matmul_activation_count, + act_type); + } +} + +MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("abs")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("clip")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("min") + .End() + .AddAttr("max") + .End(); + + AddOpCompat(OpCompat("gelu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("approximate") + .IsType() + .End(); + + AddOpCompat(OpCompat("hard_sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("slope") + .IsOptional() + .IsType() + .End() + .AddAttr("offset") + .IsOptional() + .IsType() + .End(); + + AddOpCompat(OpCompat("hard_swish")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("threshold") + .IsOptional() + .IsType() + .End() + .AddAttr("scale") + .IsOptional() + .IsType() + .End() + .AddAttr("offset") + .IsOptional() + .IsType() + .End(); + + AddOpCompat(OpCompat("leaky_relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() + .End(); + + AddOpCompat(OpCompat("mish")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("relu6")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("threshold") + .IsType() + .End(); + + AddOpCompat(OpCompat("sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("sqrt")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("swish")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("beta") + .IsType() + .End(); + + AddOpCompat(OpCompat("tanh")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(matmul_activation_mkldnn_fuse_pass, + paddle::framework::ir::MatmulActivationMkldnnFusePass); + +REGISTER_PASS_CAPABILITY(matmul_activation_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("matmul", 1) + .EQ("abs", 0) + .LE("clip", 1) + .EQ("gelu", 0) + .EQ("hard_sigmoid", 0) + .LE("hard_swish", 0) + .LE("leaky_relu", 1) + .LE("mish", 1) + .EQ("relu", 0) + .EQ("relu6", 0) + .EQ("sigmoid", 0) + .EQ("sqrt", 0) + .EQ("swish", 0) + .EQ("tanh", 
0)); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000..ebef63e292438 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +class MatmulActivationMkldnnFusePass : public FusePassBase { + public: + MatmulActivationMkldnnFusePass(); + virtual ~MatmulActivationMkldnnFusePass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseMatmulAct(Graph *graph, + const std::string &matmul_type, + std::string &act_type) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index a2b66263aa792..6f7bb614cc79f 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include #include #include +#include "paddle/utils/tribool.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -52,12 +52,12 @@ class MKLDNNConvBatchNormPassTest { const std::string& name, const std::vector& inputs, const std::vector& outputs, - boost::tribool use_mkldnn) { + paddle::tribool use_mkldnn) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); - if (!boost::indeterminate(use_mkldnn)) + if (!paddle::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn); if (type == "conv2d_transpose") { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index f6d318f74fe3e..a3b1f730dfc24 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -14,8 +14,8 @@ #include -#include #include +#include "paddle/utils/tribool.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" @@ -44,12 +44,12 @@ class MKLDNNInplacePassTest { const std::string& name, const std::vector& inputs, const std::vector& outputs, - boost::tribool use_mkldnn) { + paddle::tribool use_mkldnn) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); - if (!boost::indeterminate(use_mkldnn)) + if (!paddle::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn); if (type == "conv2d") { @@ -102,7 +102,7 @@ class MKLDNNInplacePassTest { "conv1", std::vector({"a", "weights", "bias"}), std::vector({"f"}), - boost::indeterminate); + paddle::indeterminate); SetOp(&prog, "relu", "relu1", diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc index bb38e6e9091dd..b9c1954dc74e0 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include +#include "paddle/utils/tribool.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" @@ -29,12 +29,12 @@ class PlacementPassTest { const std::string& name, const std::vector& inputs, const std::vector& outputs, - boost::tribool use_mkldnn) { + paddle::tribool use_mkldnn) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); - if (!boost::indeterminate(use_mkldnn)) + if (!paddle::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn); if (type == "conv2d") { @@ -90,13 +90,13 @@ class PlacementPassTest { "concat1", std::vector({"a", "b"}), std::vector({"c"}), - boost::indeterminate); + paddle::indeterminate); SetOp(&prog, "conv2d", "conv1", std::vector({"c", "weights", "bias"}), std::vector({"f"}), - boost::indeterminate); + paddle::indeterminate); SetOp(&prog, "relu", "relu1", diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 40c6050a3c3f1..42c54fcb36242 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -55,23 +55,6 @@ void QuantDequantMkldnnPass::MarkSkipQuantizedOps( } } -void QuantDequantMkldnnPass::MarkSkipQuantizedPool2d(ir::Graph* graph) const { - VLOG(3) << "mark avg pool2d as skip quantized op"; - for (auto* op_node : - ir::TopologyVarientSort(*graph, static_cast(0))) { - if (!op_node->IsOp()) continue; - - if (op_node->Name() == 
"pool2d") { - auto* op_desc = op_node->Op(); - auto pool_type = - BOOST_GET_CONST(std::string, op_desc->GetAttr("pooling_type")); - if (pool_type == "avg") { - op_node->Op()->SetAttr("skip_quant", 1); - } - } - } -} - void QuantDequantMkldnnPass::CollectInfoFromFake( ir::Graph* graph, Scope* scope, @@ -548,7 +531,6 @@ void QuantDequantMkldnnPass::ApplyImpl(ir::Graph* graph) const { auto* scope = param_scope(); MarkSkipQuantizedOps(graph, skip_ops); - MarkSkipQuantizedPool2d(graph); CollectInfoFromFake(graph, scope, fake_dequantize_types, &weight_thresholds); CollectInputScalesFromFake( graph, scope, fake_quantize_types, &var_quant_scales); diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc index 3dd850d886c8e..41e70e529bf73 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -26,59 +27,34 @@ namespace ir { using string::PrettyLogDetail; void SoftplusActivationOneDNNPass::ApplyImpl(Graph *graph) const { - std::vector act_types = {"relu", - "tanh", - "leaky_relu", - "swish", - "hardswish", - "sqrt", - "abs", - "clip", - "gelu", - "relu6", - "sigmoid"}; + auto act_types = paddle::platform::GetSupportedActivations(); for (const auto &act_type : act_types) { - std::unordered_map attr_map; - - if (act_type == "swish") - attr_map.emplace("beta", "fuse_activation_alpha"); - else if (act_type == "relu6") - attr_map.emplace("threshold", "fuse_activation_alpha"); - else if (act_type == "clip") { - attr_map.emplace("min", "fuse_activation_alpha"); - attr_map.emplace("max", "fuse_activation_beta"); - } else { - attr_map.emplace("alpha", "fuse_activation_alpha"); - attr_map.emplace("beta", "fuse_activation_beta"); - } - FuseSoftplusActivation(graph, act_type, attr_map); + FuseSoftplusActivation(graph, act_type); } } void SoftplusActivationOneDNNPass::FuseSoftplusActivation( - Graph *graph, - const std::string &fuse_activation_type, - const std::unordered_map &attr_map) const { + Graph *graph, const std::string &act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init("softplus_activation", graph); GraphPatternDetector gpd; - patterns::SoftplusActivation softplus_activation_pattern( + patterns::OperatorActivation softplus_activation_pattern( gpd.mutable_pattern(), "softplus_activation"); - softplus_activation_pattern(fuse_activation_type); + softplus_activation_pattern("softplus", act_type); int found_softplus_activation_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { VLOG(4) << "Fuse softplus with activation op."; GET_IR_NODE_FROM_SUBGRAPH( - softplus_out, softplus_out, softplus_activation_pattern); + softplus_out, preceding_op_out, softplus_activation_pattern); GET_IR_NODE_FROM_SUBGRAPH( activation_out, activation_out, softplus_activation_pattern); - - GET_IR_NODE_FROM_SUBGRAPH(softplus, softplus, softplus_activation_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + softplus, preceding_op, softplus_activation_pattern); GET_IR_NODE_FROM_SUBGRAPH( activation, activation, 
softplus_activation_pattern); @@ -94,18 +70,18 @@ void SoftplusActivationOneDNNPass::FuseSoftplusActivation( } auto *activation_op = activation->Op(); + auto attr_map = paddle::platform::GetAttributeMap(act_type); for (const auto &attr : attr_map) { if (activation_op->HasAttr(attr.first)) { softplus_op->SetAttr(attr.second, activation_op->GetAttr(attr.first)); } } - if (fuse_activation_type == "gelu" && - activation_op->HasAttr("approximate") && + if (act_type == "gelu" && activation_op->HasAttr("approximate") && BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) - softplus_op->SetAttr("fuse_activation_type", std::string("gelu_tanh")); + softplus_op->SetAttr("fuse_activation", std::string("gelu_tanh")); else - softplus_op->SetAttr("fuse_activation_type", fuse_activation_type); + softplus_op->SetAttr("fuse_activation", act_type); softplus_op->SetAttr("use_mkldnn", true); @@ -121,7 +97,7 @@ void SoftplusActivationOneDNNPass::FuseSoftplusActivation( if (!Has("disable_logs") || !Get("disable_logs")) PrettyLogDetail("--- fused %d softplus with %s activation", found_softplus_activation_count, - fuse_activation_type); + act_type); } } // namespace ir } // namespace framework @@ -133,13 +109,16 @@ REGISTER_PASS_CAPABILITY(softplus_activation_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("softplus", 1) - .EQ("relu", 0) - .EQ("tanh", 0) - .LE("leaky_relu", 1) - .EQ("swish", 0) - .EQ("hard_swish", 0) - .EQ("sqrt", 0) .EQ("abs", 0) - .LE("relu6", 1) .LE("clip", 1) - .EQ("gelu", 0)); + .EQ("gelu", 0) + .EQ("hard_sigmoid", 0) + .LE("hard_swish", 0) + .LE("leaky_relu", 1) + .LE("mish", 1) + .EQ("relu", 0) + .EQ("relu6", 0) + .EQ("sigmoid", 0) + .EQ("sqrt", 0) + .EQ("swish", 0) + .EQ("tanh", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h index c49502c674355..6368a102b0e85 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h @@ -34,10 +34,8 @@ class SoftplusActivationOneDNNPass : public FusePassBase { protected: void ApplyImpl(ir::Graph *graph) const override; - void FuseSoftplusActivation( - ir::Graph *graph, - const std::string &fuse_activation_type, - const std::unordered_map &attr_map) const; + void FuseSoftplusActivation(ir::Graph *graph, + const std::string &act_type) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc index 2fbb46e32d1e9..afe3d75fd2126 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc @@ -50,9 +50,9 @@ void MainTest(const std::string& activation_type) { const auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(BOOST_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("fuse_activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto activation_type = - BOOST_GET_CONST(std::string, op->GetAttr("fuse_activation_type")); + BOOST_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(activation_type.compare(activation_type), 0); } } diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 967482d2419e9..37a28bec16da2 100644 --- 
a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/variant.h" +#include "paddle/phi/core/macros.h" #include "paddle/utils/any.h" namespace paddle { diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index c0b4ac864cabc..33d293faad129 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -144,7 +144,7 @@ LoDTensor LodExpand(const LoDTensor& source, auto slice = tensor.Slice(elem, elem + 1); TensorCopy(source.Slice(ins, ins + 1), platform::CPUPlace(), - platform::CPUDeviceContext(), + phi::CPUContext(), &slice); } } diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index cf10734d1deeb..006e98f175423 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -1,4 +1,16 @@ -set(INTERPRETERCORE_DEPS +add_subdirectory(workqueue) +add_subdirectory(garbage_collector) + +set(STANDALONE_EXECUTOR_SRCS + data_transfer.cc + new_executor_defs.cc + interpretercore_util.cc + event_manager.cc + stream_analyzer.cc + interpretercore.cc + standalone_executor.cc) + +set(STANDALONE_EXECUTOR_DEPS op_registry device_context scope @@ -20,62 +32,33 @@ set(INTERPRETERCORE_DEPS variable_helper timer monitor - nan_inf_utils) - -add_subdirectory(workqueue) -add_subdirectory(garbage_collector) - -cc_library( - data_transfer - SRCS data_transfer.cc - DEPS enforce scope glog) -cc_library( - new_executor_defs - SRCS new_executor_defs.cc - DEPS enforce glog scope) -cc_library( - interpretercore_util - SRCS interpretercore_util.cc - DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer) -cc_library( - event_manager - SRCS event_manager.cc - DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs) -cc_library( - stream_analyzer - SRCS stream_analyzer.cc - DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs) + nan_inf_utils + enforce + scope + glog + enforce + glog + scope + workqueue + interpretercore_event_garbage_collector + ${DEVICE_EVENT_LIBS} + glog) if(WITH_GPU OR WITH_ROCM) - cc_library( - interpretercore - SRCS interpretercore.cc - DEPS workqueue - ${DEVICE_EVENT_LIBS} - interpretercore_util - interpretercore_event_garbage_collector - interpretercore_fast_garbage_collector - stream_analyzer - event_manager) -else() - cc_library( - interpretercore - SRCS interpretercore.cc - DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util - interpretercore_event_garbage_collector stream_analyzer event_manager) + set(STANDALONE_EXECUTOR_DEPS ${STANDALONE_EXECUTOR_DEPS} + interpretercore_fast_garbage_collector) endif() cc_library( standalone_executor - SRCS standalone_executor.cc - DEPS interpretercore) + SRCS ${STANDALONE_EXECUTOR_SRCS} + DEPS ${STANDALONE_EXECUTOR_DEPS}) cc_library( staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enforce glog os_info) -# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) # skip win32 since wget is not installed by default on windows machine. 
if(WITH_GPU AND WITH_TESTING @@ -120,13 +103,7 @@ if(WITH_GPU cc_test( standalone_executor_test SRCS standalone_executor_test.cc - DEPS interpretercore - standalone_executor - operator - op_registry - executor - ${OPS} - ${OP_DEPS}) + DEPS standalone_executor operator op_registry executor ${OPS} ${OP_DEPS}) set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100) add_dependencies(standalone_executor_test download_program) diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 701f0a430aa5c..3cf16266baf08 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -137,6 +137,13 @@ void DataTranferHelper::RunAndConstructOpFuncNode( new_op_func_node.output_index["Out"] = {var_scope_->VarId(new_var_name)}; new_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); new_op_func_node.kernel_func_(exec_ctx); + // NOTE(winter-wang): in npu device, D2H kernel is asynchronous. need to + // explicit synchronization. +#ifdef PADDLE_WITH_ASCEND_CL + if (op_type == kMemcpyD2H) { + dev_ctx->Wait(); + } +#endif // NOTE(Aurelius84): data_transform_op is expensive operation, so we tag them // as kQueueSync and execute them in thread pool. new_op_func_node.type_ = OpFuncType::kQueueSync; @@ -308,6 +315,7 @@ std::shared_ptr TransferDevice(const std::string& var_name, op_type = kMemcpyH2D; int dst_place_type = platform::is_gpu_place(dst_place) ? 0 : platform::is_npu_place(dst_place) ? 1 + : platform::is_ipu_place(dst_place) ? 3 : platform::is_xpu_place(dst_place) ? 2 : -1; attr_map = {{"dst_place_type", dst_place_type}}; diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index c9bb7d4555fc0..a381943587d03 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -583,7 +583,8 @@ int StatisticsEngine::StatNormalizationTime( if (total - normalization_sum != 0) { LOG(WARNING) << "total: " << total << "is greater than normalization_sum:" << normalization_sum; - return -1; + // TODO(dev): figure out why total != normalization_sum and fix it + // return -1; } return 0; } diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 3c66eb0c4613c..3680f0aa900c6 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_context.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -90,6 +91,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place, auto local_scope = &var_scope_.GetMutableScope()->NewScope(); local_scope_ = local_scope; } + var_scope_.SetLocalScope(local_scope_); // prune @@ -115,7 +117,6 @@ InterpreterCore::~InterpreterCore() { interpreter::CostInfo InterpreterCore::DryRun( const std::vector& feed_names, const std::vector& feed_tensors) { - var_scope_.SetLocalScope(local_scope_); Prepare(feed_names, feed_tensors, true); interpreter::CostInfo cost_info; { @@ -144,7 +145,6 @@ paddle::framework::FetchList InterpreterCore::Run( 
platform::AttachPointerHashToMKLDNNKey(this, place_); #endif bool is_build = is_build_; - var_scope_.SetLocalScope(local_scope_); Prepare(feed_names, feed_tensors, is_build); if (is_build) { @@ -153,8 +153,10 @@ paddle::framework::FetchList InterpreterCore::Run( // until the second step run. async_work_queue_ = GetWorkQueue(); ExecuteInstructionList(vec_instruction_); +#ifdef PADDLE_WITH_ASCEND_CL + platform::DeviceContextPool::Instance().Get(place_)->Wait(); +#endif } - if (create_local_scope_) { ClearLoDTensorArrayInLocalScope(); } @@ -174,7 +176,6 @@ paddle::framework::FetchList InterpreterCore::Run( platform::AttachPointerHashToMKLDNNKey(this, place_); #endif if (!is_build_) { - var_scope_.SetLocalScope(local_scope_); paddle::framework::interpreter::build_variable_scope(block_, &var_scope_); std::vector op_func_nodes; @@ -196,12 +197,14 @@ paddle::framework::FetchList InterpreterCore::Run( async_work_queue_ = GetWorkQueue(); ExecuteInstructionList(vec_instruction_); +#ifdef PADDLE_WITH_ASCEND_CL + platform::DeviceContextPool::Instance().Get(place_)->Wait(); +#endif } if (create_local_scope_) { ClearLoDTensorArrayInLocalScope(); } - // return Fetch Tensors auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName); if (fetch_var) { @@ -473,8 +476,13 @@ void InterpreterCore::Convert( BuildSkipShareLoDInfo(); for (size_t i = 0; i < vec_instruction_.size(); ++i) { +#ifdef PADDLE_WITH_IPU + gc_event_.emplace_back(phi::CPUPlace(), 0); +#else gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(), platform::GenerateDeviceEventFlag()); + +#endif } bool inplaced = false; for (auto inst : vec_instruction_) { @@ -528,6 +536,17 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_); Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope() : var_scope_.GetMutableScope(); + +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable + // values, but only through special `float_status` to checks whether + // the operation is overflow. More about `float_status`, see: + // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue + if (FLAGS_check_nan_inf) { + framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place); + } +#endif + auto op_with_kernel = dynamic_cast(op); { // If it is OperatorBase, InferShape do nothing. 
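
The interpreter changes above route nan/inf debugging through FLAGS_check_nan_inf: on Ascend NPU the float_status buffer is allocated and cleared before each op runs (overflow is only visible through that register), and the outputs are checked after the op completes. A minimal sketch of that pattern, assuming the helper signatures used in this diff (see also the interpretercore_util.cc hunk below); the wrapper function itself is hypothetical and not part of the patch:

// Sketch only: bracketing an op run with the nan/inf hooks used in this diff.
// The wrapper function is hypothetical; the helper calls mirror the ones added
// to interpretercore.cc and interpretercore_util.cc.
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/operator.h"

DECLARE_bool(check_nan_inf);

namespace paddle {
namespace framework {

static void RunOpWithNanInfCheck(const OperatorBase& op,
                                 const Scope& scope,
                                 const platform::Place& place) {
#ifdef PADDLE_WITH_ASCEND_CL
  // On NPU, overflow cannot be detected by inspecting variable values; the
  // float_status register must be allocated and cleared before the kernel runs.
  if (FLAGS_check_nan_inf) {
    details::NPUAllocAndClearFloatStatus(op, scope, place);
  }
#endif
  op.Run(scope, place);
  if (FLAGS_check_nan_inf) {
    // Inspect the op's outputs (or float_status on NPU) after the kernel runs.
    details::CheckOpHasNanOrInf(op, scope, place);
  }
}

}  // namespace framework
}  // namespace paddle
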
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 1a539c1ce1cea..acbcf1da4c5e3 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -15,6 +15,7 @@ #include +#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/new_executor/data_transfer.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" @@ -43,6 +44,7 @@ PADDLE_DEFINE_EXPORTED_bool( "Enable serial execution for standalone executor, used for debug."); DECLARE_bool(use_mkldnn); +DECLARE_bool(check_nan_inf); namespace paddle { namespace framework { @@ -446,11 +448,19 @@ void build_op_func_list(const platform::Place& place, op_func_node.output_index = outs_name2id; VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope); +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable + // values, but only through special `float_status` to checks whether + // the operation is overflow. More about `float_status`, see: + // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue + if (FLAGS_check_nan_inf) { + framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place); + } +#endif + if (dynamic_cast(op) == nullptr) { // op is not a operatorwithkernel, so direcly run OperatorBase::Run() deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope); - VLOG(4) << "End run " << place << " " - << op_func_node.operator_base_->DebugStringEx(local_scope); } else { auto op_with_kernel = const_cast( static_cast(op)); @@ -593,6 +603,12 @@ void build_op_func_list(const platform::Place& place, << var_scope->GetNameById(p.second); } } + + // for debug nan/inf + if (FLAGS_check_nan_inf) { + VLOG(4) << "Check nan/inf"; + framework::details::CheckOpHasNanOrInf(*op, *runtime_scope, place); + } } VLOG(4) << "End run " << place << " " @@ -768,12 +784,7 @@ void ShrinkDownstreamMap(std::map>* downstream_map, // b: c // happens_before[i][j] means i should be executed before j - op_happens_before->resize(op_num); - for (size_t i = 0; i < op_num; ++i) { - (*op_happens_before)[i].resize(op_num); - std::fill( - (*op_happens_before)[i].begin(), (*op_happens_before)[i].end(), false); - } + op_happens_before->assign(op_num, std::vector(op_num, false)); // bfs to get all next ops auto bfs = [&](size_t op_idx) { @@ -883,6 +894,18 @@ std::map> build_op_downstream_map( } } } + // the original output of inplace op is also change. + if (!vec_instruction[op_idx].InplaceBackMap().empty()) { + auto& m = vec_instruction[op_idx].InplaceBackMap(); + for (auto& p : m) { + auto& var = p.second; + if (var2min_rw_op.count(var)) { + for (auto dep_op : var2min_rw_op[var]) { + op2dependences[op_idx].insert(dep_op); + } + } + } + } // step2: update 2 var2xxxx data structure for (auto& item : @@ -894,16 +917,6 @@ std::map> build_op_downstream_map( } } - for (auto& item : - vec_instruction[op_idx].Inputs()) { // for all inputs(read only) - for (auto var : item.second) { - if (remove_duplicate.count(var) == - 0) { // var in input list and in output list, so remove it. 
- update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); - } - } - } - // NOTE(zhiqiu): The inplace op with `transfer` also changes // original output after that so add original output as well // original: a->op->a @@ -914,8 +927,16 @@ std::map> build_op_downstream_map( for (auto& p : m) { auto var = p.second; var2recent_write_op[var] = op_idx; - // var in input list and in output list, so remove it. - if (remove_duplicate.count(var) == 0) { + var2min_rw_op[var] = {static_cast(op_idx)}; + remove_duplicate.insert(var); + } + } + + for (auto& item : + vec_instruction[op_idx].Inputs()) { // for all inputs(read only) + for (auto var : item.second) { + if (remove_duplicate.count(var) == + 0) { // var in input list and in output list, so remove it. update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); } } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 70a92f0ae28ae..31e27a07c665d 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -389,7 +389,8 @@ static bool IsCpuOp(const Instruction& instr) { // is supported heterogeneous place static bool IsSupportedHetePlace(const phi::Place& place) { - return platform::is_gpu_place(place) || platform::is_xpu_place(place); + return platform::is_gpu_place(place) || platform::is_npu_place(place) || + platform::is_xpu_place(place) || platform::is_ipu_place(place); } } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 3ef0a827c2480..2e6e9aa8427b0 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -19,53 +19,32 @@ namespace paddle { namespace framework { StandaloneExecutor::StandaloneExecutor(const platform::Place& place, - const ProgramDesc& startup_prog, - const ProgramDesc& main_prog, - Scope* scope) - : place_(place), - startup_prog_(startup_prog), - main_prog_(main_prog), - scope_(scope) { - // NOTE(zhiqiu): for startup_program, run once ? 
- if (startup_prog.Block(0).AllOps().size() > 0) { - auto core = GetInterpreterCore(startup_prog, {}, {}, false); - VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; - core->Run({}); - } -} - -paddle::framework::FetchList StandaloneExecutor::Run( - const std::vector& feed_names, - const std::vector& feed_tensors, - const std::vector& fetch_names) { - platform::RecordEvent record_event( - "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); - - auto core = GetInterpreterCore(main_prog_, feed_names, fetch_names, true); - - return core->Run(feed_names, feed_tensors); -} + const ProgramDesc& prog) + : place_(place), prog_(prog) {} paddle::framework::FetchList StandaloneExecutor::Run( + Scope* scope, const std::vector& feed_names, const std::vector& fetch_names) { platform::RecordEvent record_event( "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); - auto core = GetInterpreterCore(main_prog_, feed_names, fetch_names, false); + auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names, false); VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; return core->Run(feed_names); } framework::interpreter::CostInfo StandaloneExecutor::DryRun( + Scope* scope, const std::vector& feed_names, const std::vector& feed_tensors) { - auto core = GetInterpreterCore(main_prog_, feed_names, {}, true); + auto core = GetInterpreterCore(scope, prog_, feed_names, {}, true); return core->DryRun(feed_names, feed_tensors); } std::shared_ptr StandaloneExecutor::GetInterpreterCore( + Scope* scope, const ProgramDesc& prog, const std::vector& feed_names, const std::vector& fetch_names, @@ -79,6 +58,7 @@ std::shared_ptr StandaloneExecutor::GetInterpreterCore( for (auto& fetchname : fetch_names) { oss << fetchname << ","; } + oss << "scope:" << scope; auto iter = interpretercores_.find(oss.str()); @@ -89,13 +69,13 @@ std::shared_ptr StandaloneExecutor::GetInterpreterCore( std::shared_ptr core = nullptr; if (add_fetch_op) { - core = CreateInterpreterCore(place_, prog, scope_, fetch_names); + core = CreateInterpreterCore(place_, prog, scope, fetch_names); } else { core = std::make_shared( place_, prog.Block(0), /*skip_gc_vars=*/std::set(), - scope_); + scope); } interpretercores_.emplace(oss.str(), core); return core; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 7b54a855007be..e6d84d6f9a183 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -31,39 +31,32 @@ class InterpreterCore; class StandaloneExecutor { public: - StandaloneExecutor(const platform::Place& place, - const ProgramDesc& startup_prog, - const ProgramDesc& main_prog, - Scope* scope); + StandaloneExecutor(const platform::Place& place, const ProgramDesc& prog); ~StandaloneExecutor() {} - paddle::framework::FetchList Run( - const std::vector& feed_names, - const std::vector& feed_tensors, - const std::vector& fetch_names); - // NOTE(zhiqiu): feed_names are only used for caching interpretercore. // fetch_names are used for caching interpretercore and inserting fetch ops, // the latter can be moved to python side. 
- paddle::framework::FetchList Run(const std::vector& feed_names, + paddle::framework::FetchList Run(Scope* scope, + const std::vector& feed_names, const std::vector& fetch_names); framework::interpreter::CostInfo DryRun( + Scope* scope, const std::vector& feed_names, const std::vector& feed_tensors); private: std::shared_ptr GetInterpreterCore( + Scope* scope, const ProgramDesc& prog, const std::vector& feed_names, const std::vector& fetch_names, bool add_fetch_op); platform::Place place_; - const ProgramDesc& startup_prog_; - const ProgramDesc& main_prog_; - Scope* scope_; // not owned + const ProgramDesc& prog_; std::unordered_map> interpretercores_; diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index b7a7e4c0b546f..760a852baee68 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -21,23 +21,37 @@ namespace paddle { namespace framework { +namespace { +std::map>>* + d2h_ctxs = nullptr; +std::map>>* + h2d_ctxs = nullptr; +std::mutex ctx_mtx; +} // namespace StreamAnalyzer::StreamAnalyzer(const platform::Place& place) : place_(place) { - if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::EmplaceDeviceContexts( - &d2h_ctxs_, - {place}, - /*disable_setting_default_stream_for_allocator=*/true); - platform::EmplaceDeviceContexts( - &h2d_ctxs_, - {place}, - /*disable_setting_default_stream_for_allocator=*/true); -#else - PADDLE_THROW( - platform::errors::Unimplemented("CUDAPlace is not supported. Please " - "re-compile with WITH_GPU option.")); -#endif + if (platform::is_gpu_place(place) || platform::is_npu_place(place)) { + std::lock_guard lk(ctx_mtx); + if (d2h_ctxs == nullptr) { + d2h_ctxs = new std::map< + Place, + std::shared_future>>(); + h2d_ctxs = new std::map< + Place, + std::shared_future>>(); + } + if (d2h_ctxs->find(place) == d2h_ctxs->end()) { + platform::EmplaceDeviceContexts( + d2h_ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true); + platform::EmplaceDeviceContexts( + h2d_ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true); + } + d2h_ctx_ = (*d2h_ctxs)[place]; + h2d_ctx_ = (*h2d_ctxs)[place]; } } @@ -162,15 +176,15 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( const OpFuncNode& op_func_node) { auto& op_type = op_func_node.operator_base_->Type(); auto* dev_ctx = op_func_node.dev_ctx_; - // only gpu need update. xpu not need, because xpu memcpy op kernel is + // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is // synchronous. 
- if (platform::is_gpu_place(place_)) { + if (platform::is_gpu_place(place_) || platform::is_npu_place(place_)) { if (op_type == interpreter::kMemcpyD2H) { VLOG(3) << "Get dev_ctx from d2h_context_pool_"; - dev_ctx = d2h_ctxs_[place_].get().get(); + dev_ctx = d2h_ctx_.get().get(); } else if (op_type == interpreter::kMemcpyH2D) { VLOG(3) << "Get dev_ctx from h2d_context_pool_"; - dev_ctx = h2d_ctxs_[place_].get().get(); + dev_ctx = h2d_ctx_.get().get(); } } return dev_ctx; @@ -188,11 +202,21 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( */ bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr, const Instruction& next_instr) { - return platform::is_xpu_place(place_) || - (&cur_instr.DeviceContext() == &next_instr.DeviceContext() || - interpreter::IsCpuOp(cur_instr) || - interpreter::IsMemcpyD2H(cur_instr) || - interpreter::IsMemcpyH2D(next_instr)); + if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true; + + // xpu&ipu memcpy kerenl is synchronous. + if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_)) + return true; + + // npu d2h kernel is asynchronous. + if (platform::is_npu_place(place_)) { + return interpreter::IsCpuOp(cur_instr) || + interpreter::IsMemcpyH2D(next_instr); + } + // gpu or cpu + return interpreter::IsCpuOp(cur_instr) || + interpreter::IsMemcpyD2H(cur_instr) || + interpreter::IsMemcpyH2D(next_instr); } platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) { @@ -201,6 +225,8 @@ platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) { } else { if (platform::is_xpu_place(place_)) { return platform::kXPU; + } else if (platform::is_npu_place(place_)) { + return platform::kNPU; } return platform::kCUDA; } diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h index 61e37bbb686fc..4be8ffe6bb4ca 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/stream_analyzer.h @@ -53,9 +53,9 @@ class StreamAnalyzer { platform::DeviceType GetWaiterType(const Instruction& instr); - Place place_; - std::map>> d2h_ctxs_; - std::map>> h2d_ctxs_; + const Place place_; + std::shared_future> d2h_ctx_; + std::shared_future> h2d_ctx_; std::map> var_id2event_; }; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index c8a9950ae5efb..c0a9528c28126 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include -#include "boost/blank.hpp" #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_call_stack.h" @@ -24,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/utils/blank.h" namespace paddle { namespace framework { @@ -764,7 +764,7 @@ void OpDesc::RenameInput(const std::string &old_name, need_update_ = true; } -struct SetAttrDescVisitor : public boost::static_visitor { +struct SetAttrDescVisitor { explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {} mutable proto::OpDesc::Attr *attr_; void operator()(int v) const { attr_->set_i(v); } @@ -810,7 +810,7 @@ struct SetAttrDescVisitor : public boost::static_visitor { VectorToRepeated(v, attr_->mutable_float64s()); } - void operator()(boost::blank) const { + void operator()(paddle::blank) const { PADDLE_THROW(platform::errors::Unavailable( "Unsupported calling method of SetAttrDescVisitor object for " "`boosst::blank` type.")); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index d38efbff3165c..53b77d538b3ed 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -408,6 +408,12 @@ struct OpKernelRegistrarFunctorEx { REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel, paddle::framework::OpWithKernelTest, paddle::framework::OpKernelTestMaker); -REGISTER_OP_CPU_KERNEL( - op_with_kernel, - paddle::framework::OpKernelTest); +REGISTER_OP_CPU_KERNEL(op_with_kernel, + paddle::framework::OpKernelTest); REGISTER_OP_CUDA_KERNEL( op_with_kernel, @@ -264,10 +263,9 @@ TEST(OperatorRegistrar, CUDA) { } static int op_test_value = 0; - -using paddle::platform::CPUDeviceContext; using paddle::platform::CUDADeviceContext; using paddle::platform::DeviceContext; +using phi::CPUContext; namespace paddle { namespace framework { @@ -295,8 +293,7 @@ class OpMultiKernelTest : public paddle::framework::OpKernel { }; template -class OpMultiKernelTest - : public paddle::framework::OpKernel { +class OpMultiKernelTest : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { ++op_test_value; @@ -319,7 +316,7 @@ class OpMultiKernelTest2 : public paddle::framework::OpKernel { }; template -class OpMultiKernelTest2 +class OpMultiKernelTest2 : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { @@ -342,16 +339,14 @@ class OpMultiKernelTest2 REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, paddle::framework::OpWithMultiKernelTest, paddle::framework::OpKernelTestMaker); -REGISTER_OP_KERNEL( - op_with_multi_kernel, - CPU, - paddle::platform::CPUPlace, - paddle::framework::OpMultiKernelTest); -REGISTER_OP_KERNEL( - op_with_multi_kernel, - MKLDNN, - paddle::platform::CPUPlace, - paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL(op_with_multi_kernel, + CPU, + paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL(op_with_multi_kernel, + MKLDNN, + paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); REGISTER_OP_KERNEL( op_with_multi_kernel, CUDA, diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index c88b947edc686..579dd320d144f 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/macros.h" #include "paddle/utils/none.h" namespace paddle { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2be93f0dc9178..4f50996267b97 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1276,41 +1276,70 @@ bool OperatorWithKernel::SupportNPU() const { bool OperatorWithKernel::SupportsMKLDNN( const proto::VarType::Type data_type) const { - auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); - if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { - VLOG(6) << "Warning: " << type_ - << " don't find its MKLDNN Kernel in Fluid " - "Registered Kernels. And We don't " - "search its kernels in phi lib, " - "SupportsMKLDNN() return false."; - return false; + auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( + phi::TransToPhiKernelName(type_)); + auto has_phi_kernel = + std::any_of(phi_kernels.begin(), + phi_kernels.end(), + [](phi::KernelKeyMap::const_reference kern_pair) { + return kern_pair.first.backend() == phi::Backend::ONEDNN; + }); + if (has_phi_kernel) { + return true; + } else { + auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); + if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { + return false; + } else { + auto& op_kernels = op_kernel_iter->second; + return std::any_of( + op_kernels.begin(), + op_kernels.end(), + [data_type](OpKernelMap::const_reference kern_pair) { + return platform::is_cpu_place(kern_pair.first.place_) && + kern_pair.first.library_type_ == LibraryType::kMKLDNN && + kern_pair.first.data_type_ == data_type; + }); + } } - auto& op_kernels = op_kernel_iter->second; - return std::any_of(op_kernels.begin(), - op_kernels.end(), - [data_type](OpKernelMap::const_reference kern_pair) { - return platform::is_cpu_place(kern_pair.first.place_) && - kern_pair.first.library_type_ == - LibraryType::kMKLDNN && - kern_pair.first.data_type_ == data_type; - }); } bool OperatorWithKernel::SupportsKernelType( const OpKernelType& kernel_type) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); - bool support = - kernels_iter != all_op_kernels.end() && - kernels_iter->second.find(kernel_type) != kernels_iter->second.end(); -#if defined(PADDLE_WITH_XPU) + if (kernels_iter == all_op_kernels.end()) return false; + OpKernelMap& kernels = kernels_iter->second; + auto kernel_iter = kernels.find(kernel_type); + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) + if (paddle::platform::is_xpu_place(kernel_type.place_)) { + return kernel_iter != kernels.end() && + paddle::platform::is_xpu_support_op(type_, kernel_type) && + !paddle::platform::is_in_xpu_black_list(type_); + } +#endif + +#ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(kernel_type.place_)) { - support = support && - paddle::platform::is_xpu_support_op(type_, kernel_type) && - !paddle::platform::is_in_xpu_black_list(type_); + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, kernel_type); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + auto tmp_kernel_type = kernel_type; + tmp_kernel_type.library_type_ = LibraryType::kKP; + return kernels.find(tmp_kernel_type) != kernels.end(); + } + return 
kernel_iter != kernels.end() && + paddle::platform::is_xpu_support_op(type_, kernel_type) && + !paddle::platform::is_in_xpu_black_list(type_); } #endif - return support; + + return kernel_iter != kernels.end(); } bool OperatorWithKernel::CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, @@ -1622,9 +1651,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::EventRole::kInnerOp); if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; - // Do data transform before building KernelContext - // TODO(zhiqiu): support TransferInplaceVarsBack - PreparePhiData(exec_scope, *pt_kernel_, *kernel_signature_, runtime_ctx); if (enable_cache_runtime_context_ && !need_prepare_phi_data_ && !need_prepare_data_) { impl_ = @@ -2007,15 +2033,15 @@ Scope* OperatorWithKernel::PrepareData( } } - for (auto& var_name_item : Inputs()) { - bool should_skip_input = - no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0; - - std::vector& input_vars = ctx->inputs[var_name_item.first]; - - for (size_t i = 0; i < var_name_item.second.size(); ++i) { - auto& var_name = var_name_item.second[i]; - auto* var = input_vars[i]; + const auto& name_map = Inputs(); + auto prepare_input_data = [&](const std::string& in_name, + std::vector* in_vars, + const phi::TensorArgDef* in_def, + bool should_skip_input) -> void { + auto& name_vec = name_map.at(in_name); + for (size_t i = 0; i < in_vars->size(); ++i) { + const auto& var_name = name_vec[i]; + auto* var = in_vars->at(i); // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { @@ -2046,17 +2072,17 @@ Scope* OperatorWithKernel::PrepareData( new_scope = &scope.NewScope(); } auto* trans_var = new_scope->Var(var_name); - input_vars[i] = trans_var; + in_vars->at(i) = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); platform::MatchShapeToLayout( out, tensor_in->layout(), DataLayout::kNHWC); VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " "but kNHWC layout" - << var_name_item.first << " in Operator " << type_; + << in_name << " in Operator " << type_; } else { - VLOG(7) << "Skip scanning input " << var_name_item.first - << " in Operator " << type_; + VLOG(7) << "Skip scanning input " << in_name << " in Operator " + << type_; } #endif continue; @@ -2066,15 +2092,48 @@ Scope* OperatorWithKernel::PrepareData( continue; } - auto kernel_type_for_var = GetKernelTypeForVar( - var_name_item.first, *tensor_in, expected_kernel_key); + auto kernel_type_for_var = + GetKernelTypeForVar(in_name, *tensor_in, expected_kernel_key); + bool need_trans_dtype = + kernel_type_for_var.data_type_ != expected_kernel_key.data_type_; + bool need_trans_layout = NeedTransformLayout( + kernel_type_for_var.data_layout_, expected_kernel_key.data_layout_); + if (!need_trans_dtype && !need_trans_layout) { + if (!run_phi_kernel_ && + platform::places_are_same_class(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + continue; + } + } - if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { - continue; + std::unique_ptr new_expected_kernel_key = nullptr; + if (run_phi_kernel_ && in_def->backend != phi::Backend::ALL_BACKEND) { + auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); + if ((in_def->backend != tensor_backend && + (in_def->backend != phi::Backend::GPUDNN || + tensor_backend != phi::Backend::GPU) && + (in_def->backend != phi::Backend::KPS || + tensor_backend != phi::Backend::XPU)) || + tensor_in->place().GetType() == AllocationType::GPUPINNED) { + 
new_expected_kernel_key = std::make_unique( + expected_kernel_key.data_type_, + phi::TransToPhiPlace(in_def->backend), + expected_kernel_key.data_layout_, + expected_kernel_key.library_type_, + expected_kernel_key.customized_type_value_); + } + } + + if (!need_trans_dtype && !need_trans_layout) { + if (run_phi_kernel_ && new_expected_kernel_key == nullptr) { + continue; + } } VLOG(3) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; + << kernel_type_for_var << " to " + << (new_expected_kernel_key ? *new_expected_kernel_key + : expected_kernel_key); // In the inference scenerio, the scopes will be reused across the // batches, so the `new_scope` here will result in GPU memroy explosion @@ -2094,13 +2153,22 @@ Scope* OperatorWithKernel::PrepareData( // not do transfer scope caching, and cpu inference performance is not // impacted by test. enable_cache_transfer_scope_ = false; - if (!run_by_executor_ && - (platform::is_gpu_place(kernel_type_for_var.place_) || - platform::is_gpu_place(expected_kernel_key.place_))) { - new_scope = TryCreateTransferScope( - kernel_type_for_var, expected_kernel_key, &scope); - enable_cache_transfer_scope_ = true; + if (!run_by_executor_) { + if (new_expected_kernel_key) { + if ((platform::is_gpu_place(kernel_type_for_var.place_) || + platform::is_gpu_place(new_expected_kernel_key->place_))) { + new_scope = TryCreateTransferScope( + kernel_type_for_var, *new_expected_kernel_key, &scope); + enable_cache_transfer_scope_ = true; + } + } else if ((platform::is_gpu_place(kernel_type_for_var.place_) || + platform::is_gpu_place(expected_kernel_key.place_))) { + new_scope = TryCreateTransferScope( + kernel_type_for_var, expected_kernel_key, &scope); + enable_cache_transfer_scope_ = true; + } } + if (!new_scope) { new_scope = &scope.NewScope(); } @@ -2117,7 +2185,7 @@ Scope* OperatorWithKernel::PrepareData( // Create new var with the same name in transfer scopes auto* trans_var = new_scope->Var(var_name); - input_vars[i] = trans_var; + in_vars->at(i) = trans_var; // Find if inplace exists between input and output // If inplace exists, set the new created var to inplaced output, and @@ -2125,7 +2193,7 @@ Scope* OperatorWithKernel::PrepareData( for (auto& pair : Outputs()) { for (size_t j = 0; j < pair.second.size(); ++j) { if (pair.second[j] == var_name) { - VLOG(4) << "Found inplace between input(" << var_name_item.first + VLOG(4) << "Found inplace between input(" << in_name << ") and output(" << pair.first << "), the variable name is " << var_name; ctx->outputs[pair.first][j] = trans_var; @@ -2136,9 +2204,47 @@ Scope* OperatorWithKernel::PrepareData( // Do transfer Tensor out; - TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); + TransformData(new_expected_kernel_key ? 
*new_expected_kernel_key + : expected_kernel_key, + kernel_type_for_var, + *tensor_in, + &out); SetTensorToVariable(*var, out, trans_var); } + }; + + if (run_phi_kernel_) { + const auto& input_names = kernel_signature_->input_names; + const auto& input_defs = pt_kernel_->args_def().input_defs(); + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), + platform::errors::InvalidArgument( + "The size of inputs_args names (%d) must be equal to " + "the size of kernel input_defs (%d).", + input_names.size(), + input_defs.size())); + for (size_t i = 0; i < input_defs.size(); ++i) { + const auto& input_defs = pt_kernel_->args_def().input_defs(); + auto& in_def = input_defs.at(i); + std::string input_name = input_names[i]; + auto iter = ctx->inputs.find(input_name); + if (iter == ctx->inputs.end()) { + continue; + } + auto& ins_vector = iter->second; + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(input_name) > 0; + prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input); + } + } else { + for (auto& var_name_item : Inputs()) { + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0; + + std::vector& input_vars = ctx->inputs[var_name_item.first]; + prepare_input_data( + var_name_item.first, &input_vars, nullptr, should_skip_input); + } } // If pre_scope = &scope, it means that scope is cached and the op is not in @@ -2381,107 +2487,6 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( return (*arg_map_fn_)(arg_mapping_ctx); } -Scope* OperatorWithKernel::PreparePhiData( - const Scope& scope, - const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const { - const auto& input_names = pt_kernel_signature.input_names; - auto input_defs = pt_kernel.args_def().input_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), - input_defs.size(), - platform::errors::InvalidArgument( - "The size of inputs_args names (%d) must be equal to " - "the size of kernel input_defs (%d).", - input_names.size(), - input_defs.size())); - Scope* new_scope = nullptr; - auto& name_map = Inputs(); - const std::unordered_set* no_buffer_ins = nullptr; - if (info_) { - auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer(); - // Some op may not register NoNeedBufferVarsInferer - if (no_buffer_inferer) { - no_buffer_ins = &(no_buffer_inferer(Inputs(), Outputs(), Attrs())); - if (no_buffer_ins->empty()) no_buffer_ins = nullptr; - } - } - - for (size_t i = 0; i < input_defs.size(); ++i) { - auto& in_def = input_defs.at(i); - if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) { - continue; - } - auto& ins_vector = ctx->inputs.at(input_names[i]); - auto& name_vec = name_map.at(input_names[i]); - bool should_skip_input = - no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0; - - for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - // Only tensor can be tranfer to another device. - auto* var = ins_vector[offset]; - if (var == nullptr || !VarIsTensor(*var)) { - continue; - } - auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); - - // When no_buffer_ins then checking of Tensor::holder_ is - // not a thread safe. 
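Within the rewritten PrepareData, new_expected_kernel_key behaves as an optional override: it is only allocated when the phi kernel's declared input backend differs from the tensor's actual backend, and every later step (transfer-scope caching, TransformData) uses the override when present and the original expected key otherwise. A sketch of that pattern with placeholder types, not the real OpKernelType:

#include <iostream>
#include <memory>

struct KernelKey {
  int dtype;
  int place;
};

// The effective key is the override when one was built, else the default.
const KernelKey& EffectiveKey(const KernelKey& expected,
                              const std::unique_ptr<KernelKey>& override_key) {
  return override_key ? *override_key : expected;
}

int main() {
  KernelKey expected{/*dtype=*/0, /*place=*/0};
  std::unique_ptr<KernelKey> override_key;  // empty: backends already match
  std::cout << EffectiveKey(expected, override_key).place << "\n";  // 0

  // A mismatch between the kernel's declared backend and the tensor's place
  // produces an override pointing at the kernel's backend.
  override_key = std::make_unique<KernelKey>(KernelKey{0, 1});
  std::cout << EffectiveKey(expected, override_key).place << "\n";  // 1
  return 0;
}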
And for infershape scenario checks - // to be omitted are not really needed - if (should_skip_input == true) { - // TODO(YuanRisheng) : There need to supplement MKLDNN code later - continue; - } - - if (!tensor_in->IsInitialized()) { - continue; - } - - if (in_def.backend == phi::Backend::ALL_BACKEND) { - continue; - } - - auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); - if (in_def.backend == tensor_backend || - (in_def.backend == phi::Backend::GPUDNN && - tensor_backend == phi::Backend::GPU)) { - continue; - } - - auto expected_place = phi::TransToPhiPlace(in_def.backend); - VLOG(3) << "phi Transform Variable " << input_names[i] << " from " - << tensor_in->place() << " to " << expected_place; - - if (!new_scope) { - new_scope = &scope.NewScope(); - } - // For inference, if a gpu model has an op which could only run on CPU, - // each result of different input will be the same with the first one. - // The reason is that if a gpu tensor is the input of a cpu kernel, - // we will create a new cpu tensor in new scope. - // However, if enable_cache_runtime_context_, we get the cpu tensor each - // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr - // to trigger `new RuntimeContext()` in RunImpl(). - if (enable_cache_runtime_context_) { - pre_scope_ = nullptr; - } - - // Create new var with the same name in transfer scopes - auto* trans_var = new_scope->Var(name_vec[offset]); - ins_vector[offset] = trans_var; - - // Do transfer - Tensor out; - framework::TensorCopySync(*tensor_in, expected_place, &out); - SetTensorToVariable(*var, out, trans_var); - - need_prepare_phi_data_ = true; - } - } - - return new_scope; -} - void OperatorWithKernel::BuildPhiKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index d468ead659258..c3827f56c7197 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -38,7 +38,7 @@ limitations under the License. */ #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/kernel_factory.h" @@ -646,14 +646,6 @@ class OperatorWithKernel : public OperatorBase { phi::KernelKey ChoosePhiKernel(const ExecutionContext& ctx) const; void ChooseKernel(const ExecutionContext& ctx) const; - /** - * Transfer data place for phi kernel - * Is this really needed? 
- */ - Scope* PreparePhiData(const Scope& scope, - const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const; void BuildPhiKernelContext(const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index d4dfd165259a2..ba7a5956ae0fd 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -420,16 +420,13 @@ REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OP_CPU_KERNEL( indicate_lod_tensor_data_type_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); REGISTER_OP_CPU_KERNEL( indicate_selected_rows_data_type_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); REGISTER_OP_CPU_KERNEL( indicate_other_data_type_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); TEST(IndicateVarDataTypeTest, lodtensor) { paddle::framework::InitDevices(); @@ -599,16 +596,14 @@ REGISTER_OP_WITHOUT_GRADIENT(get_lod_level_test, paddle::framework::GetSetLoDLevelTestMaker); REGISTER_OP_CPU_KERNEL( get_lod_level_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); REGISTER_OP_WITHOUT_GRADIENT(set_lod_level_test, paddle::framework::SetLoDLevelTest, paddle::framework::GetSetLoDLevelTestMaker); REGISTER_OP_CPU_KERNEL( set_lod_level_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); void SetGetLoDLevelTestMain(std::string op_type) { paddle::framework::InitDevices({}); diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index 7cb9cf254fb1a..5b8e62d4f079d 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( cinn_cache_key SRCS cinn_cache_key.cc - DEPS boost graph graph_helper lod_tensor proto_desc) + DEPS graph graph_helper lod_tensor proto_desc) cc_library( build_cinn_pass SRCS build_cinn_pass.cc diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index fffacc59ba7bc..4e6aeaeb7ac6a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -548,6 +548,15 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use XPU device since it's not compiled with XPU," "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (platform::is_ipu_place(place)) { +#if defined(PADDLE_WITH_IPU) + gc.reset(new IPUGarbageCollector(place, max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use IPU device since it's not compiled with IPU," + "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (platform::is_custom_place(place)) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -972,37 +981,26 @@ void ParallelExecutor::BCastParamsToDevices( } } -FetchResultType ParallelExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - platform::RecordEvent record_run( - "ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1); - VLOG(3) << "enter ParallelExecutor Run"; -#ifdef PADDLE_WITH_CUDA - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_tensors.empty(), - true, - 
platform::errors::InvalidArgument( - "Cannot fetch data when using CUDA Graph.")); - PADDLE_ENFORCE_EQ( - member_->build_strategy_.allow_cuda_graph_capture_, - true, - platform::errors::InvalidArgument( - "You must turn on build_strategy.allow_cuda_graph_capture = True " - "to enable CUDA Graph capturing.")); - PADDLE_ENFORCE_EQ( - member_->places_[0], - platform::CUDAGraphCapturingPlace(), - platform::errors::InvalidArgument("The place to capture CUDAGraph is " - "not the same as the place to run.")); - } -#endif +FetchUnmergedList ParallelExecutor::Run( + const std::vector &fetch_tensors) { + PreludeToRun(fetch_tensors); + platform::RecordBlock b(0); -#ifdef WITH_GPERFTOOLS - if (gProfileStarted) { - ProfilerFlush(); - } -#endif + ResetHasFeedGuard reset_has_feed_guard(member_); + ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), + fetch_tensors, + member_->HasGarbageCollectors()); + + VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; + auto fetch_data = + member_->executor_->Run(fetch_tensors, /*return_merged=*/false); + return BOOST_GET(FetchUnmergedList, fetch_data); +} + +FetchList ParallelExecutor::RunAndMerge( + const std::vector &fetch_tensors) { + PreludeToRun(fetch_tensors); platform::RecordBlock b(0); ResetHasFeedGuard reset_has_feed_guard(member_); @@ -1011,9 +1009,10 @@ FetchResultType ParallelExecutor::Run( fetch_tensors, member_->HasGarbageCollectors()); - VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; - auto fetch_data = member_->executor_->Run(fetch_tensors, return_merged); - return fetch_data; + VLOG(3) << "ParallelExecutor begin to run member_->executor_->RunAndMerge"; + auto fetch_data = + member_->executor_->Run(fetch_tensors, /*return_merged=*/true); + return BOOST_GET(FetchList, fetch_data); } void ParallelExecutor::RunWithoutFetch( @@ -1440,6 +1439,38 @@ std::vector ParallelExecutor::CloneGraphToMultiDevices( return graphs; } +void ParallelExecutor::PreludeToRun( + const std::vector &fetch_tensors) { + platform::RecordEvent record_run( + "ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1); + VLOG(3) << "enter ParallelExecutor Run"; +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_tensors.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + PADDLE_ENFORCE_EQ( + member_->build_strategy_.allow_cuda_graph_capture_, + true, + platform::errors::InvalidArgument( + "You must turn on build_strategy.allow_cuda_graph_capture = True " + "to enable CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + member_->places_[0], + platform::CUDAGraphCapturingPlace(), + platform::errors::InvalidArgument("The place to capture CUDAGraph is " + "not the same as the place to run.")); + } +#endif + +#ifdef WITH_GPERFTOOLS + if (gProfileStarted) { + ProfilerFlush(); + } +#endif +} + void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { if (member_->build_strategy_.reduce_ == BuildStrategy::ReduceStrategy::kNoReduce) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 4cb9c0340b53c..a3b812a71a2b7 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -89,8 +89,8 @@ class ParallelExecutor { void FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors); - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged = true); + FetchUnmergedList Run(const std::vector &fetch_tensors); + 
FetchList RunAndMerge(const std::vector &fetch_tensors); void RunWithoutFetch(const std::vector &skip_eager_vars); @@ -126,6 +126,8 @@ class ParallelExecutor { std::vector CloneGraphToMultiDevices(ir::Graph *graph); + void PreludeToRun(const std::vector &fetch_tensors); + void PrepareNCCLCommunicator(Scope *global_scope); std::vector CompileGraphWithBuildStrategy( diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index fada192e55e14..2e56fea28e0b5 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -66,7 +66,7 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { platform::Place place = phi::TransToPhiPlace(kernel_key.backend(), false); DataLayout data_layout = kernel_key.layout(); LibraryType library_type = LibraryType::kPlain; - if (kernel_key.backend() == phi::Backend::MKLDNN) { + if (kernel_key.backend() == phi::Backend::ONEDNN) { library_type = LibraryType::kMKLDNN; } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; @@ -87,7 +87,7 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( backend = phi::Backend::GPUDNN; break; case LibraryType::kMKLDNN: - backend = phi::Backend::MKLDNN; + backend = phi::Backend::ONEDNN; break; case LibraryType::kKP: backend = phi::Backend::KPS; diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 535672f2e1288..6c8e825157973 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -66,7 +66,7 @@ struct ConvertToPhiContext { }; template <> -struct ConvertToPhiContext { +struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; diff --git a/paddle/fluid/framework/phi_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc index e8f8825006094..94ab77f310f99 100644 --- a/paddle/fluid/framework/phi_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -32,7 +32,7 @@ TEST(PhiUtils, TransPhiKernelKeyToOpKernelType) { #ifdef PADDLE_WITH_MKLDNN phi::KernelKey kernel_key_mkldnn( - phi::Backend::MKLDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); + phi::Backend::ONEDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key_mkldnn); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); @@ -76,7 +76,7 @@ TEST(PhiUtils, TransOpKernelTypeToPhiKernelKey) { paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_mkldnn); ASSERT_EQ(kernel_key_mkldnn.dtype(), phi::DataType::FLOAT32); ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::MKLDNN); - ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::MKLDNN); + ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::ONEDNN); #endif #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc index db2c6c1f991b7..340acf53efa9d 100644 --- a/paddle/fluid/framework/selected_rows_utils_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -53,7 +53,7 @@ TEST_F(SelectedRowsTester, complete_dims) { TEST_F(SelectedRowsTester, SerializeAndDeseralize) { phi::SelectedRows dst_tensor; - platform::CPUDeviceContext cpu_ctx(place_); + phi::CPUContext cpu_ctx(place_); std::ostringstream oss; SerializeToStream(oss, *selected_rows_, cpu_ctx); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 2fe2b87fcd4ae..dbb549efa2519 100644 --- 
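ParallelExecutor::Run no longer takes a return_merged flag; the shared CUDA-graph and profiler checks move into PreludeToRun, and two thin entry points fix the flag and the return type instead. A compact sketch of that API split, with stand-in types and bodies rather than the real executor:

#include <iostream>
#include <string>
#include <vector>

class ToyExecutor {
 public:
  std::vector<std::string> Run(const std::vector<std::string>& fetches) {
    PreludeToRun(fetches);
    return RunImpl(fetches, /*return_merged=*/false);
  }
  std::vector<std::string> RunAndMerge(
      const std::vector<std::string>& fetches) {
    PreludeToRun(fetches);
    return RunImpl(fetches, /*return_merged=*/true);
  }

 private:
  // Shared validation/profiling preamble that used to live at the top of Run.
  void PreludeToRun(const std::vector<std::string>& fetches) {
    std::cout << "checking " << fetches.size() << " fetch targets\n";
  }
  std::vector<std::string> RunImpl(const std::vector<std::string>& fetches,
                                   bool return_merged) {
    std::cout << (return_merged ? "merged" : "unmerged") << " run\n";
    return fetches;
  }
};

int main() {
  ToyExecutor exec;
  exec.Run({"loss"});
  exec.RunAndMerge({"loss"});
  return 0;
}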
a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -689,7 +689,7 @@ inline void AnyImpl(Predicate predicate, } template -class AnyVisitor : public boost::static_visitor { +class AnyVisitor : public std::unary_function { private: const framework::Tensor& tensor_; Predicate predicate_; @@ -774,7 +774,7 @@ class AnyVisitor : public boost::static_visitor { }; template -class AnyOutVisitor : public boost::static_visitor<> { +class AnyOutVisitor : public std::unary_function { private: const framework::Tensor& tensor_; mutable framework::Tensor* out_; @@ -843,7 +843,7 @@ inline void AllImpl(Predicate predicate, } template -class AllOutVisitor : public boost::static_visitor<> { +class AllOutVisitor : public std::unary_function { private: const framework::Tensor& tensor_; mutable framework::Tensor* out_; @@ -942,7 +942,7 @@ static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) { } #endif -struct BothFalseVisitor : public boost::static_visitor<> { +struct BothFalseVisitor : public std::unary_function { const framework::Tensor& in_; mutable framework::Tensor* out_; BothFalseVisitor(const framework::Tensor& in, framework::Tensor* out) @@ -1253,7 +1253,7 @@ void TensorFromStream(std::istream& is, is.seekg(seekg, is.cur); void* buf; - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || @@ -1336,7 +1336,7 @@ void TensorFromStream(std::istream& is, std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); tensor->Resize(phi::make_ddim(dims)); void* buf; - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 20fab1d20b0c0..74454a5a09b7a 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -24,7 +24,7 @@ namespace framework { TEST(TensorCopy, Tensor) { Tensor src_tensor; Tensor dst_tensor; - platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + phi::CPUContext cpu_ctx((platform::CPUPlace())); int* src_ptr = src_tensor.mutable_data(phi::make_ddim({3, 3}), platform::CPUPlace()); @@ -164,7 +164,7 @@ TEST(TensorFromVector, Tensor) { // Copy to CPU Tensor cpu_tensor.Resize(phi::make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); - paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + phi::CPUContext cpu_ctx(*cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); // Copy to GPUTensor @@ -255,20 +255,23 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src; -bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); -for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); -} +TEST(TensorToVector, Tensor_bool) { +{ + paddle::framework::Tensor src; + bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } -paddle::platform::CPUPlace place; -std::vector dst; -paddle::framework::TensorToVector(src, &dst); + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); -for (int i = 
0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } } -} // namespace framework + #ifdef PADDLE_WITH_CUDA { std::vector src_vec = { @@ -325,7 +328,7 @@ for (int i = 0; i < 3 * 3; ++i) { } } #endif -} // namespace paddle +} TEST(TensorFromDLPack, Tensor) { { @@ -334,7 +337,7 @@ TEST(TensorFromDLPack, Tensor) { cpu_tensor.Resize(phi::make_ddim({3, 3})); paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext cpu_ctx(cpu_place); + phi::CPUContext cpu_ctx(cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); paddle::framework::DLPackTensor dlpack_tensor(cpu_tensor, 1); @@ -360,7 +363,7 @@ TEST(TensorFromDLPack, Tensor) { // Copy to CPU Tensor cpu_tensor.Resize(phi::make_ddim({3, 3})); paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext cpu_ctx(cpu_place); + phi::CPUContext cpu_ctx(cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); // Copy to GPUTensor @@ -502,7 +505,7 @@ TEST(Tensor, FromAndToStream) { { framework::Tensor dst_tensor; auto place = new platform::CPUPlace(); - platform::CPUDeviceContext cpu_ctx(*place); + phi::CPUContext cpu_ctx(*place); std::ostringstream oss; TensorToStream(oss, src_tensor, cpu_ctx); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 1a805ccd76e44..7a60d5db0dfa9 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -37,7 +37,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/port.h" #ifdef PADDLE_WITH_PSLIB -#include "proto/ps.pb.h" +#include "proto/the_one_ps.pb.h" #endif namespace paddle { diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h index 6c283f4d32e57..a06f92f32d28c 100644 --- a/paddle/fluid/framework/tuple.h +++ b/paddle/fluid/framework/tuple.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index ca9d6ec44a8d9..3bcad63f21a84 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -22,9 +22,9 @@ limitations under the License. 
*/ #include #include -#include "boost/blank.hpp" #include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/variant.h" + +#include "paddle/utils/blank.h" #include "paddle/utils/small_vector.h" #include "paddle/utils/variant.h" @@ -42,7 +42,7 @@ class InferNoNeedBufferVarsFN; using VariableNameMap = std::map>; using VariableValueMap = std::map>; -using Attribute = paddle::variant; #ifdef PADDLE_WITH_ASCEND_CL -using NPUAttribute = paddle::variant( - platform::DeviceContextPool::Instance().Get(place_)); - compute_dev_ctx->Wait(); + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto comm_stream = platform::BKCLCommContext::Instance() + .Get(ring_id, place_) + ->dev_context() + ->stream(); + auto event = compute_events_[ring_id].get(); + + // compute_stream-->event-->comm_stream + PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event, compute_stream)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(comm_stream, event)); } void BKCLParallelContext::WaitComm(int ring_id) { @@ -230,9 +248,18 @@ void BKCLParallelContext::WaitComm(int ring_id) { "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - auto comm_dev_ctx = - platform::BKCLCommContext::Instance().Get(ring_id, place_)->dev_context(); - comm_dev_ctx->Wait(); + auto comm_stream = platform::BKCLCommContext::Instance() + .Get(ring_id, place_) + ->dev_context() + ->stream(); + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto event = compute_events_[ring_id].get(); + + // comm_stream-->event-->compute_stream + PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event, comm_stream)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(compute_stream, event)); } void BKCLParallelContext::SynchronizeCompute() { diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index 7ba1358959161..6a938924b9780 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -19,6 +19,7 @@ #include #include "paddle/fluid/imperative/parallel_context.h" +#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" #include "xpu/bkcl.h" namespace paddle { @@ -52,6 +53,13 @@ class BKCLParallelContext : public ParallelContext { void WaitComm(int ring_id) override; void SynchronizeCompute() override; + + private: + // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] + std::vector> compute_events_; + + // used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream + std::vector> comm_events_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index dd263f0f8f2fe..b6c21bead4182 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -46,8 +46,8 @@ void GLOOParallelContext::Init() { int port = std::stoi(addr[1]); gloo_wrapper->SetHttpStore(host, port, "worker"); gloo_wrapper->Init(); - device_ = std::unique_ptr( - new platform::CPUDeviceContext(platform::CPUPlace())); + device_ = std::unique_ptr( + new phi::CPUContext(platform::CPUPlace())); device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CPUPlace()) .get()); @@ -200,7 +200,7 @@ void GLOOParallelContext::Broadcast(framework::Variable *src, int ring_id) { paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext( int ring_id) { - // return the CPUDeviceContext + // return the 
CPUContext return device_.get(); } diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 85aacc0d3f77b..5290e3d1315a4 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -64,7 +64,7 @@ class GLOOParallelContext : public ParallelContext { void AllReduce(const phi::SelectedRows& src, phi::SelectedRows* dst); private: - std::unique_ptr device_; + std::unique_ptr device_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index ba60c834f79ae..f6883fe6c6a92 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -79,15 +79,16 @@ static void MoveOrCopyVar(framework::Variable* dst, } template -class TensorAddFunctor : public boost::static_visitor<> { +class TensorAddFunctor + : public std::unary_function { public: TensorAddFunctor(int64_t numel, const T* x, T* y) : numel_(numel), x_(x), y_(y) {} void operator()(const platform::CPUPlace& place) const { - platform::CPUDeviceContext* ctx = dynamic_cast( + phi::CPUContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); - auto blas = phi::funcs::GetBlas(*ctx); + auto blas = phi::funcs::GetBlas(*ctx); blas.AXPY(numel_, 1., x_, y_); } @@ -438,7 +439,7 @@ void TensorAdd(const VarType& src, VarType* dst) { place)); #endif } else if (platform::is_cpu_place(place)) { - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); } } @@ -455,7 +456,7 @@ void TensorAdd(const VarType& src, VarType* dst) { place)); #endif } else if (platform::is_cpu_place(place)) { - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); } } @@ -498,8 +499,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); } else { #endif - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -550,8 +551,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); } else { #endif - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -613,8 +614,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); } else { #endif - PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, float); - PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, double); + PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float); + PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, double); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc index 669a4af99f31f..10a4a2e69d540 100644 --- a/paddle/fluid/imperative/layout_autotune.cc +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -131,6 +131,8 @@ paddle::imperative::NameVarMap 
DealLightlyLayoutSensitive( transposer = std::make_shared>(op_type); } else if (op_type == "arg_max") { transposer = std::make_shared>(op_type); + } else if (op_type == "concat") { + transposer = std::make_shared>(op_type); } else if (op_type.find("elementwise_") != std::string::npos) { transposer = std::make_shared>(op_type); } else { diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 50d3e2b6ac139..fa7261b6d52b6 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -374,8 +374,9 @@ class ArgmaxOpTransformer bool keep_dims = BOOST_GET_CONST(bool, (*attrs)["keepdims"]); if (keep_dims) { if (var_layout != DataLayout::UNDEFINED) { - std::vector perm_nhwc = {0, 2, 3, 1}; - std::vector perm_nchw = {0, 3, 1, 2}; + std::vector perm_nhwc = {0, 3, 1, 2}; + std::vector perm_nchw = {0, 2, 3, 1}; + auto perm = var_layout == DataLayout::NHWC ? perm_nhwc : perm_nchw; switch (AttrTypeID((*attrs)["axis"])) { case paddle::framework::proto::AttrType::INT: { @@ -400,5 +401,51 @@ class ArgmaxOpTransformer } }; +template +class ConcatOpTransformer + : public LightlyLayoutSensitiveOpTransformer { + public: + explicit ConcatOpTransformer(const std::string& type) + : LightlyLayoutSensitiveOpTransformer(type) {} + + paddle::imperative::NameVarMap Apply( + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + VLOG(3) << "Optimze lightly layout sensitive op " << this->Type(); + auto& in_var = ins.at("X")[0]; + auto var_layout = paddle::imperative::GetDataLayout(in_var); + bool need_tranppose = false; + for (auto& pair : ins) { + for (auto& var : pair.second) { + if (var != nullptr && + (paddle::imperative::GetDataLayout(var) != var_layout)) { + need_tranppose = true; + break; + } + } + } + + if (need_tranppose) { + return LightlyLayoutSensitiveOpTransformer::Apply( + ins, outs, attrs, tracer); + } + + if (var_layout != DataLayout::UNDEFINED) { + std::vector perm_nhwc = {0, 3, 1, 2}; + std::vector perm_nchw = {0, 2, 3, 1}; + auto perm = var_layout == DataLayout::NHWC ? 
perm_nhwc : perm_nchw; + auto axis = BOOST_GET_CONST(int, (*attrs)["axis"]); + (*attrs)["axis"] = static_cast(perm[axis]); + } + auto axis = BOOST_GET_CONST(int, (*attrs)["axis"]); + VLOG(3) << "Optimze lightly layout sensitive op asdfasdfasdf axis" << axis; + + this->SetVarsLayout(outs, var_layout); + return ins; + } +}; + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 7c0243caf6abf..f445632de8c5d 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -98,6 +98,7 @@ static void GetGraphInfoBetweenTargets( auto &grad_node = output_target->GradVarBase()->GradNode(); if (visited.count(grad_node.get()) == 0) { for (auto &op : *grad_node) { + VLOG(10) << "Pushed op: " << op.Type(); q.emplace(&op, grad_node.get()); } } @@ -141,6 +142,8 @@ static void GetGraphInfoBetweenTargets( for (auto &pending_node : node->GradPendingNodes()) { for (auto &pending_op : *pending_node) { preceding_ops[&pending_op].insert(op); + VLOG(10) << "Find preceding op of: " << pending_op.Type() + << " is: " << op->Type(); } if (visited.count(pending_node.get()) == 0) { visited.insert(pending_node.get()); @@ -175,6 +178,7 @@ static void GetGraphInfoBetweenTargets( std::queue> op_queue; std::unordered_set, HashPair> op_base_visited; for (auto &endpoint_op : endpoint_ops) { + VLOG(10) << "Emplaced endpoint op: " << endpoint_op->Type(); op_queue.emplace(endpoint_op, nullptr); op_base_visited.emplace(endpoint_op, nullptr); } @@ -186,14 +190,18 @@ static void GetGraphInfoBetweenTargets( op_queue.pop(); + VLOG(10) << "Get op: " << op->Type(); + bool is_valid = false; for (auto &output_pair : op->GetOutsMap()) { if (!output_pair.second.IsGrad()) { + VLOG(10) << "Continueded output for : " << op->Type(); continue; } for (auto &out_var : output_pair.second) { if (out_var && target_vars.count(out_var.get()) > 0) { + VLOG(10) << "Find target output for : " << op->Type(); is_valid = true; break; } @@ -211,11 +219,13 @@ static void GetGraphInfoBetweenTargets( is_valid = false; for (auto &input_pair : op->GetInsMap()) { if (!input_pair.second.IsGrad()) { + VLOG(10) << "Continueded input for : " << op->Type(); continue; } for (auto &in_var : input_pair.second) { if (in_var && no_grad_var_grad.count(in_var.get()) == 0) { + VLOG(10) << "Find not no grad var in input for : " << op->Type(); target_vars.insert(in_var.get()); is_valid = true; } @@ -240,7 +250,10 @@ static void GetGraphInfoBetweenTargets( auto iter = preceding_ops.find(op); if (iter != preceding_ops.end()) { for (auto &preceding_op : iter->second) { + VLOG(10) << "Scan preceding op: " << preceding_op->Type() << " for " + << op->Type(); if (op_base_visited.count(std::make_pair(preceding_op, op)) == 0) { + VLOG(10) << "Emplace op: " << preceding_op->Type(); op_queue.emplace(preceding_op, op); op_base_visited.emplace(preceding_op, op); } @@ -648,6 +661,7 @@ PartialGradTask::PartialGradTask( platform::errors::Unimplemented( "only_inputs=False is not supported yet")); + VLOG(10) << "no_grad_vars size: " << no_grad_vars.size(); for (auto &var : no_grad_vars) { if (var && var->GradVarBase()) { no_grad_var_grad_.insert(var->GradVarBase()->SharedVar().get()); @@ -853,6 +867,7 @@ std::vector> PartialGradTask::Run() { } for (auto &pending_op : iter->second) { + VLOG(10) << "Find pending op" << pending_op->Type(); auto dep_iter = op_deps_.find(pending_op); PADDLE_ENFORCE_EQ( dep_iter != op_deps_.end(), @@ -862,6 
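The Argmax/Concat transformers above remap an integer axis attribute through a small permutation table once the variable's layout has been autotuned away from the layout the attribute was written for. As a standalone sanity check of the usual NCHW-to-NHWC correspondence (the exact tables picked in the diff depend on the transpose direction, so treat these constants as illustrative only):

#include <cassert>
#include <vector>

int main() {
  // Positions of the NCHW dimensions inside an NHWC tensor: the channel axis
  // 1 moves to index 3, height 2 to 1, width 3 to 2.
  const std::vector<int> nchw_to_nhwc = {0, 3, 1, 2};
  assert(nchw_to_nhwc[0] == 0);  // batch stays first
  assert(nchw_to_nhwc[1] == 3);  // channels become the last axis
  assert(nchw_to_nhwc[2] == 1);  // height
  assert(nchw_to_nhwc[3] == 2);  // width
  return 0;
}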
+877,7 @@ std::vector> PartialGradTask::Run() { if (--(dep_iter->second) == 0) { q.push(pending_op); } + VLOG(10) << "Pending op deps: " << dep_iter->second; } } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 950a66d5e6d68..029c01a245b1e 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -394,6 +394,16 @@ PreparedOp PrepareImpl( kernel_iter = kernels.find(expected_kernel_key); } #endif +#ifdef PADDLE_WITH_IPU + if (kernel_iter == kernels.end() && + paddle::platform::is_ipu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing IPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } +#endif #ifdef PADDLE_WITH_MLU if (kernel_iter == kernels.end() && paddle::platform::is_mlu_place(expected_kernel_key.place_)) { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index f45b72055ec4e..c0ff0914401b5 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -82,7 +82,7 @@ std::shared_ptr> PrepareData( auto& template_var = name_pair.second[i]; SetForwardDataTypeOfGradVar(template_var); const auto* tensor = GetTensorFromVar(template_var->Var()); - if (tensor && tensor->IsInitialized()) { + if (tensor && tensor->IsInitialized() && (tensor->memory_size() != 0)) { auto kernel_type_for_var = op.GetKernelTypeForVar( name_pair.first, *tensor, expected_kernel_key); if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { @@ -91,7 +91,8 @@ std::shared_ptr> PrepareData( VLOG(3) << "Transform Variable " << GetNameFromVar(template_var) << " from " << kernel_type_for_var << " to " << expected_kernel_key; - + VLOG(3) << GetNameFromVar(template_var) + << " memory size is: " << tensor->memory_size(); if (CheckCachedKey(template_var, expected_kernel_key)) { VLOG(3) << "Hit variable_wrapper cache: key=" << expected_kernel_key; @@ -257,29 +258,35 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, auto& output_defs = phi_kernel.args_def().output_defs(); auto& attr_defs = phi_kernel.args_def().attribute_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), - input_defs.size(), - platform::errors::InvalidArgument( - "the size of inputs_args names (%d) must be equal to " - "the size of kernel input_defs (%d).", - input_names.size(), - input_defs.size())); - - PADDLE_ENFORCE_EQ(output_names.size(), - output_defs.size(), - platform::errors::InvalidArgument( - "the size of outputs_args names (%d) must be equal to " - "the size of kernel output_defs (%d).", - output_names.size(), - output_defs.size())); - - PADDLE_ENFORCE_EQ(attr_names.size(), - attr_defs.size(), - platform::errors::InvalidArgument( - "the size of attribute_args names (%d) must be equal " - "to the size of kernel attribute_defs (%d).", - attr_names.size(), - attr_defs.size())); + PADDLE_ENFORCE_EQ( + input_names.size(), + input_defs.size(), + platform::errors::InvalidArgument( + "Op %s: the size of inputs_args names (%d) must be equal to " + "the size of kernel input_defs (%d).", + kernel_signature.name, + input_names.size(), + input_defs.size())); + + PADDLE_ENFORCE_EQ( + output_names.size(), + output_defs.size(), + platform::errors::InvalidArgument( + "Op %s: the size of outputs_args names (%d) must be equal to " + "the size of 
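The prepared_operator.cc change adds IPU to the places that fall back to a CPU kernel when no device-specific kernel is found, mirroring the existing XPU/MLU branches. The lookup-with-fallback pattern reduced to a toy string-keyed map (hypothetical keys, not the real OpKernelMap):

#include <iostream>
#include <map>
#include <string>

// key: "<op_type>/<place>", value: an opaque kernel id.
using ToyKernelMap = std::map<std::string, int>;

int FindKernelWithFallback(const ToyKernelMap& kernels,
                           const std::string& op_type,
                           const std::string& place) {
  auto it = kernels.find(op_type + "/" + place);
  if (it == kernels.end() && place != "CPU") {
    std::cout << "missing " << place << " kernel for " << op_type
              << ", falling back to CPU\n";
    it = kernels.find(op_type + "/CPU");
  }
  return it == kernels.end() ? -1 : it->second;
}

int main() {
  ToyKernelMap kernels{{"relu/CPU", 1}, {"relu/GPU", 2}};
  std::cout << FindKernelWithFallback(kernels, "relu", "IPU") << "\n";  // 1
  return 0;
}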
kernel output_defs (%d).", + kernel_signature.name, + output_names.size(), + output_defs.size())); + + PADDLE_ENFORCE_EQ( + attr_names.size(), + attr_defs.size(), + platform::errors::InvalidArgument( + "Op %s: the size of attribute_args names (%d) must be equal " + "to the size of kernel attribute_defs (%d).", + kernel_signature.name, + attr_names.size(), + attr_defs.size())); for (size_t i = 0; i < input_names.size(); ++i) { auto it = ins.find(input_names[i]); @@ -628,7 +635,8 @@ void PreparePhiData(const phi::Kernel& phi_kernel, for (size_t offset = 0; offset < ins_vector.size(); ++offset) { auto& var = ins_vector[offset]; const auto* tensor_in = GetTensorFromVar(var->Var()); - if (tensor_in && tensor_in->IsInitialized()) { + if (tensor_in && tensor_in->IsInitialized() && + (tensor_in->memory_size() != 0)) { if (in_def.backend == phi::Backend::ALL_BACKEND) { continue; } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index f06ed80a940f0..468263e7be7ea 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -21,6 +21,9 @@ #include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" +#ifdef PADDLE_WITH_XPU_BKCL +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#endif #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { @@ -53,12 +56,11 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { } framework::VisitDataTypeForHIP( dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + DivNRanksForAllReduce(tensor, nranks, context)); #else - framework::VisitDataType(dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + framework::VisitDataType( + dtype_, + DivNRanksForAllReduce(tensor, nranks, context)); #endif VLOG(4) << "after div 2" << *tensor; } else if (platform::is_xpu_place(tensor->place())) { @@ -328,11 +330,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { - ConcatTensorsWithType( - static_cast(context), - dense_tensors_, - &dense_contents_, - dtype_); + ConcatTensorsWithType(static_cast(context), + dense_tensors_, + &dense_contents_, + dtype_); } else { PADDLE_THROW(platform::errors::Unimplemented( "Concat grad tensor not supported on place (%s)", place)); @@ -390,11 +391,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) { "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { - SplitTensorsWithType( - static_cast(context), - &dense_contents_, - &dense_tensors_, - dtype_); + SplitTensorsWithType(static_cast(context), + &dense_contents_, + &dense_tensors_, + dtype_); } else { PADDLE_THROW(platform::errors::Unimplemented( "Split grad tensor not supported on place (%s)", place)); @@ -434,10 +434,6 @@ Reducer::Reducer(const std::vector> &vars, VLOG(3) << "Start construct the Reducer ..."; nrings_ = parallel_ctx->GetNRings(); nranks_ = parallel_ctx->GetNRanks(); -#ifdef PADDLE_WITH_XPU_BKCL - comm_pool_.reset(new ::ThreadPool(1)); - comm_op_count_ = 0; -#endif // initialize groups InitializeGroups(group_indices); for (size_t global_var_index = 0; global_var_index < vars_.size(); @@ -856,8 +852,23 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { #ifdef PADDLE_WITH_XPU_BKCL 
if (platform::is_xpu_place(group_tensor.place())) { - // TODO(liuyuhui) support XPU set constant - VLOG(3) << "XPU doesn't support set_constant"; + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + if (HasGrad(var_index)) { + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = + var_base->MutableVar()->GetMutable(); + group_tensor.ShareDataWith(*tensor).Resize( + {static_cast(length)}); + } else { + group_tensor.Resize({static_cast(length)}); + int r = xpu::constant(dev_ctx->x_context(), + reinterpret_cast(group_tensor.data()), + group_tensor.numel(), + 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(dev_ctx->stream())); + } } #elif defined(PADDLE_WITH_CNCL) if (platform::is_mlu_place(group_tensor.place())) { @@ -951,33 +962,7 @@ void Reducer::MarkGroupReady(size_t group_index) { // so we expose WaitCompute() interface and call // it here. parallel_ctx_->WaitCompute(run_order); -#ifdef PADDLE_WITH_XPU_BKCL - { - std::lock_guard lock(mutex_); - comm_op_count_ += 1; // lock - } - // TODO(liuyuhui): Add try catch to deal with exception later, - // otherwise the main thread will continue to run when an exception is - // thrown in comm_pool_. - auto next_group = next_group_; - comm_pool_->enqueue([this, run_order, next_group, &group] { - auto dev_id = place_.device; - platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group, next_group); - { - std::lock_guard lock(mutex_); - comm_op_count_ -= 1; // lock - cv_.notify_all(); - } - }); -#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ - defined(PADDLE_WITH_CNCL) FusedAllReduceSchedule(run_order, group, next_group_); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not compiled with BKCL or NCCL or CNCL or GLOO.")); -#endif } } @@ -1000,17 +985,6 @@ void Reducer::FusedAllReduceSchedule(const int run_order, // group.dense_tensors ---> group.dense_contents_ group.ConcatTensors(dev_context); -// NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only support -// default stream for communicating, so there exist some problems in -// synchronization. And need to add a WaitComm there. -// TODO(liuyuhui): If BKCL support non-blocking communication, it should be -// fixed as multi gpus card training. 
-#ifdef PADDLE_WITH_XPU_BKCL - if (platform::is_xpu_place(group.dense_tensors_[0].place())) { - parallel_ctx_->WaitComm(run_order); - } -#endif - group.DivNRanks(dev_context, nranks_); // Start allreduce parallel_ctx_->AllReduceByStream( @@ -1138,12 +1112,6 @@ bool Reducer::HasGrad(size_t var_index) { void Reducer::FinalizeBackward() { groups_need_finalize_ = false; grad_need_hooks_ = false; -#ifdef PADDLE_WITH_XPU_BKCL - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return comm_op_count_ == 0; }); - } -#endif // Must prevent compute_stream_ starting until all comm streams have finished for (int i = 0; i < nrings_; ++i) { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 26a5c5adfd666..07eb9ae6a8e5e 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -140,6 +140,15 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use NPU device since it's not compiled with NPU," "Please recompile or reinstall Paddle with NPU support.")); +#endif + } else if (platform::is_ipu_place(place)) { +#if defined(PADDLE_WITH_IPU) + gc.reset(new framework::IPUGarbageCollector(place, 0)); + VLOG(10) << "Created GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use IPU device since it's not compiled with IPU," + "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (platform::is_mlu_place(place)) { #if defined(PADDLE_WITH_MLU) @@ -152,8 +161,14 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( #endif } else if (platform::is_custom_place(place)) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) - gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0)); - VLOG(10) << "Created GarbageCollector at " << place; + if (framework::IsFastEagerDeletionModeEnabled()) { + gc.reset( + new framework::CustomDeviceUnsafeFastGarbageCollector(place, 0)); + VLOG(10) << "Created UnsafeFastGarbageCollector at " << place; + } else { + gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0)); + VLOG(10) << "Created GarbageCollector at " << place; + } #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use CustomDevice since it's not compiled with " diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 4e991a3013875..7f2daa942b057 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -40,6 +40,10 @@ get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(utils_modules stringpiece pretty_log string_helper benchmark) +if(WITH_CUSTOM_DEVICE) + set(fluid_modules ${fluid_modules} phi_capi) +endif() + add_subdirectory(api) # Create static inference library if needed @@ -119,6 +123,8 @@ cc_library( get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_inference_shared ${os_dependency_modules}) if(WIN32) + set_property(TARGET paddle_inference_shared + PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) target_link_libraries(paddle_inference_shared gflags) endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 4b7bed65bab77..67f0e3212db43 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -49,10 +49,10 @@ 
function(inference_analysis_test_build TARGET) SRCS ${analysis_test_SRCS} DEPS + ${analysis_test_EXTRA_DEPS} analysis pass - ${GLOB_PASS_LIB} - ${analysis_test_EXTRA_DEPS}) + ${GLOB_PASS_LIB}) endif() endfunction() @@ -80,10 +80,10 @@ function(inference_analysis_test TARGET) SRCS ${analysis_test_SRCS} DEPS + ${analysis_test_EXTRA_DEPS} analysis pass - ${GLOB_PASS_LIB} - ${analysis_test_EXTRA_DEPS}) + ${GLOB_PASS_LIB}) inference_base_test_run(${TARGET} COMMAND ${TARGET} ARGS ${analysis_test_ARGS}) endif() @@ -109,4 +109,9 @@ elseif(WIN32) paddle_inference_api ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) + if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(test_analyzer) + endif() endif() diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 016df40c86a2d..717737749a96b 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -35,7 +35,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/phi/common/data_type.h" namespace paddle { @@ -331,6 +331,9 @@ struct Argument { // mixed precision related DECL_ARGUMENT_FIELD(model_precision, ModelPrecision, int); + DECL_ARGUMENT_FIELD(mixed_black_list, + MixedBlackList, + std::unordered_set); private: std::unordered_set valid_fields_; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4aeaefa3c49c3..3c04638003cdd 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -87,6 +87,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); pass->Set("model_precision", new int(argument->model_precision())); + pass->Set( + "mixed_black_list", + new std::unordered_set(argument->mixed_black_list())); if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 4ba17aa126dc6..e2108278b15c5 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -234,7 +234,7 @@ void LiteSubgraphPass::SetUpEngine( framework::Scope* scope, const std::vector& params) { std::ostringstream os; - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; for (const auto& param : params) { VLOG(3) << "Serialize param: " << param; PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 7a9c5b889d146..d39eadc7cc8f1 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,26 +13,117 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/subgraph_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace inference { namespace analysis { +namespace { + +bool IsFloat(framework::proto::VarType::Type t) { + if (t == framework::proto::VarType::FP16 || + t == framework::proto::VarType::FP32 || + t == framework::proto::VarType::FP64 || + t == framework::proto::VarType::BF16) + return true; + return false; +} + +// if in mixed model precision, we should make all tensorrt_engine's output +// floats dtype to float32 dtype. +void OutputProcess(framework::ir::Graph *graph, + const std::unordered_set &trt_outputs, + phi::Backend backend, + phi::DataType precision, + const std::unordered_set &blacklist) { + framework::BlockDesc *block_desc{nullptr}; + int suffix = 0; + std::unordered_map + var_to_cast_op_map; + + framework::proto::VarType::Type to_type; + if (precision == phi::DataType::FLOAT16) { + to_type = framework::proto::VarType::FP16; + } else if (precision == phi::DataType::BFLOAT16) { + to_type = framework::proto::VarType::BF16; + } else if (precision == phi::DataType::FLOAT32) { + return; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "mixed_precision currently not supported dtype %d, we now only support " + "fp16 and bf16.", + static_cast(precision))); + } + + for (auto *op_node : framework::ir::TopologySortOperations(*graph)) { + if (!op_node->IsOp()) continue; + auto op_type = op_node->Op()->Type(); + if (op_type == "feed") block_desc = op_node->Op()->Block(); + if (op_type != "tensorrt_engine") continue; + for (auto *var_node : op_node->outputs) { + if (!trt_outputs.count(var_node)) continue; + if (!var_node->Var()->Persistable() && + IsFloat(var_node->Var()->GetDataType()) && + var_node->Var()->GetDataType() != framework::proto::VarType::FP32) { + for (auto *next_op : var_node->outputs) { + // if next_op support mixed_precision, we need to add cast op. 
+ if (OpSupportPrecision( + phi::TransToPhiKernelName(next_op->Op()->Type()), + backend, + precision, + blacklist)) { + AddCastOp(graph, + var_node, + next_op, + framework::proto::VarType::FP32, + to_type, + &suffix, + block_desc, + &var_to_cast_op_map); + var_node->Var()->SetDataType(framework::proto::VarType::FP32); + } + } + } + } + } +} + +} // namespace using framework::ir::Node; void analysis::TensorRtSubgraphPass::ApplyImpl( framework::ir::Graph *graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph); + + auto model_precision = + static_cast(Get("model_precision")); + if (model_precision == phi::DataType::BFLOAT16) { + LOG(WARNING) + << "Paddle-TRT not support bf16 mixed precison, just fallback."; + return; + } + auto enable_int8 = Get("enable_int8"); auto use_calib_mode = Get("use_calib_mode"); bool no_calib_int8 = enable_int8 && !(use_calib_mode); @@ -181,15 +272,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp( } } + auto model_precision = + static_cast(Get("model_precision")); + auto mixed_black_list = + Get>("mixed_black_list"); + std::set output_names; std::set output_names_with_id; std::map origin_name_output_dims; + std::unordered_set trt_outputs; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); origin_name_output_dims[x->Name()] = x->Var()->GetShape().size(); + trt_outputs.insert(x); } + OutputProcess( + graph, trt_outputs, phi::Backend::GPU, model_precision, mixed_black_list); + std::unordered_map output_name_map; std::unordered_map graph_var_map; @@ -285,6 +386,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime); op_desc->SetAttr("shape_range_info_path", shape_range_info_path); op_desc->SetAttr("use_inspector", Get("use_inspector")); + op_desc->SetAttr("model_precision", Get("model_precision")); // we record all inputs' shapes in attr to check if they are consistent // with the real inputs' shapes retrieved from scope when trt runs. 
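Note: the hunks above thread model_precision and the mixed-precision op blacklist from the analysis Argument into the TensorRT subgraph pass, keep tensorrt_engine float outputs in FP32 via OutputProcess, and stamp the precision onto the generated tensorrt_engine op. A hedged usage sketch of driving this from the inference API follows; it assumes a model already converted to mixed precision, and the model paths and op names in the blacklist are illustrative only.

#include <memory>

#include "paddle_inference_api.h"

std::shared_ptr<paddle_infer::Predictor> BuildMixedPrecisionPredictor() {
  paddle_infer::Config config;  // alias of paddle::AnalysisConfig
  config.SetModel("model/inference.pdmodel", "model/inference.pdiparams");
  config.EnableUseGpu(256 /*MB*/, 0 /*device id*/);
  config.EnableTensorRtEngine(1 << 30 /*workspace bytes*/,
                              1 /*max batch*/,
                              3 /*min subgraph size*/,
                              paddle_infer::PrecisionType::kHalf,
                              false /*use static engine*/,
                              false /*use calib mode*/);
  // Ops listed here keep FP32 kernels; this list must match the blacklist
  // used when the model was converted to mixed precision.
  config.Exp_SetBlackListOpsForMixedModel({"roi_align", "layer_norm"});
  return paddle_infer::CreatePredictor(config);
}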
@@ -404,7 +506,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( min_input_shape, max_input_shape, opt_input_shape, - disable_trt_plugin_fp16); + disable_trt_plugin_fp16, + static_cast(Get("model_precision"))); trt_engine->SetUseOSS(Get("use_varseqlen")); trt_engine->SetWithInterleaved(Get("with_interleaved")); trt_engine->SetTransformerPosid( diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 6b6651678f85e..48d2cefe4a720 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -118,6 +119,15 @@ bool WeightsShouldNotConvert(ir::Node* var_node) { return false; } +inline bool IsFloatVarType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP16 || + type == framework::proto::VarType::FP32 || + type == framework::proto::VarType::BF16 || + type == framework::proto::VarType::FP64) + return true; + return false; +} + void ConvertTensorDtype(framework::ir::Graph* graph, const std::unordered_set& blacklist, bool keep_io_types, @@ -145,8 +155,6 @@ void ConvertTensorDtype(framework::ir::Graph* graph, if (!op_node->IsOp()) continue; auto op_type = op_node->Op()->Type(); auto phi_op_type = phi::TransToPhiKernelName(op_type); - // LOG(INFO) << "process op " << op_type << ", corresponding phi type is " - // << phi_op_type; // 1. set input dtype. if (op_type == "feed") { block_desc = op_node->Op()->Block(); @@ -174,12 +182,14 @@ void ConvertTensorDtype(framework::ir::Graph* graph, ++num_low_precision; auto inputs = op_node->inputs; for (auto* in_node : inputs) { + if (in_node->IsCtrlVar()) continue; auto* in_var = in_node->Var(); if (in_var->Persistable() && in_var->GetDataType() == framework::proto::VarType::FP32) { if (WeightsShouldNotConvert(in_node)) continue; in_var->SetDataType(to_type); } else if (!in_var->Persistable() && + IsFloatVarType(in_var->GetDataType()) && in_var->GetDataType() != to_type) { AddCastOp(graph, in_node, @@ -192,6 +202,7 @@ void ConvertTensorDtype(framework::ir::Graph* graph, } } for (auto* out_node : op_node->outputs) { + if (out_node->IsCtrlVar()) continue; auto* out_var = out_node->Var(); if (out_var->GetDataType() == framework::proto::VarType::FP32) { if (OutShouldNotConvert(out_node)) continue; @@ -201,8 +212,9 @@ void ConvertTensorDtype(framework::ir::Graph* graph, } else { auto inputs = op_node->inputs; for (auto* in_node : inputs) { + if (in_node->IsCtrlVar()) continue; auto* in_var = in_node->Var(); - if (!in_var->Persistable() && + if (!in_var->Persistable() && IsFloatVarType(in_var->GetDataType()) && in_var->GetDataType() != framework::proto::VarType::FP32) { AddCastOp(graph, in_node, @@ -223,6 +235,7 @@ void ConvertTensorDtype(framework::ir::Graph* graph, // trt pass should explicitle add cast op is input is bf16/tf32, etc. 
if (op_node->Name() == "tensorrt_engine") continue; for (auto* in_node : op_node->inputs) { + if (in_node->IsCtrlVar()) continue; auto* in_var = in_node->Var(); if (in_var->GetDataType() == to_type) { AddCastOp(graph, @@ -241,6 +254,7 @@ void ConvertTensorDtype(framework::ir::Graph* graph, // 4. if output_op's dtype is not compatible to output dtype, then just insert // cast. for (auto* node : output_nodes) { + if (node->IsCtrlVar()) continue; auto var = node->Var(); if (keep_io_types && var->GetDataType() == to_type) { // fp16/bf16 -> fp32. @@ -365,7 +379,7 @@ void ConvertToMixedPrecision(const std::string& model_file, [](framework::Scope* scope, const std::vector& params) -> std::string { std::ostringstream os; - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; for (const auto& param : params) { VLOG(3) << "Serialize param: " << param; PADDLE_ENFORCE_NOT_NULL( @@ -379,27 +393,21 @@ void ConvertToMixedPrecision(const std::string& model_file, }; std::unordered_set weights_should_be_fp32; - for (auto* node : paddle::framework::ir::TopologySortOperations(*graph)) { - if (!node->IsOp()) continue; - auto* op_desc = node->Op(); - if (op_desc->Type() == "feed" || op_desc->Type() == "fetch") continue; - - if (op_desc->Type() == "batch_norm") { - auto vecs = op_desc->Input("Bias"); - for (auto s : vecs) { - weights_should_be_fp32.insert(s); - } - vecs = op_desc->Input("Mean"); - for (auto s : vecs) { - weights_should_be_fp32.insert(s); - } - vecs = op_desc->Input("Scale"); - for (auto s : vecs) { - weights_should_be_fp32.insert(s); - } - vecs = op_desc->Input("Variance"); - for (auto s : vecs) { - weights_should_be_fp32.insert(s); + for (auto* node : graph->Nodes()) { + if (!(node->IsVar() && !node->IsCtrlVar())) continue; + if (node->Var()->GetType() == + paddle::framework::proto::VarType::SELECTED_ROWS || + node->Var()->GetType() == + paddle::framework::proto::VarType::LOD_TENSOR || + node->Var()->GetType() == + paddle::framework::proto::VarType::LOD_TENSOR_ARRAY || + node->Var()->GetType() == paddle::framework::proto::VarType::STRINGS || + node->Var()->GetType() == paddle::framework::proto::VarType::VOCAB) { + if (node->Var()->Persistable() && + node->Var()->GetDataType() == + paddle::framework::proto::VarType::FP32) { + VLOG(2) << "weights keep to fp32: " << node->Name(); + weights_should_be_fp32.insert(node->Name()); } } } diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 0d55b9c66416a..3aff5d5536a23 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -55,6 +55,9 @@ set(paddle_inference_api_deps if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) endif() +if(WITH_CUSTOM_DEVICE) + set(paddle_inference_api_deps ${paddle_inference_api_deps} phi_capi) +endif() cc_library( paddle_inference_api @@ -99,6 +102,12 @@ cc_test( SRCS api_tester.cc DEPS paddle_inference_api) +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. 
+ copy_onnx(test_paddle_inference_api) +endif() + if(WITH_TESTING) if(NOT APPLE AND NOT WIN32) if(WITH_GPU) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 75a5d9ee4f55b..ae90618f5207c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -256,6 +256,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + // Mixed related. + CP_MEMBER(mixed_black_list_); + CP_MEMBER(enable_memory_optim_); // TensorRT related. CP_MEMBER(use_tensorrt_); @@ -871,6 +874,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << ipu_available_memory_proportion_; ss << ipu_enable_half_partial_; + for (auto &op : mixed_black_list_) ss << op.c_str(); return ss.str(); } @@ -1188,4 +1192,10 @@ bool AnalysisConfig::tuned_tensorrt_dynamic_shape() { bool AnalysisConfig::trt_allow_build_at_runtime() { return trt_allow_build_at_runtime_; } + +void AnalysisConfig::Exp_SetBlackListOpsForMixedModel( + const std::unordered_set &black_list) { + mixed_black_list_ = black_list; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 8a2083ea226b4..541c53c8dae64 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1216,7 +1216,9 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetScopeNotOwned(scope_.get()); + // mixed precison. argument_.SetModelPrecision(static_cast(model_precision_)); + argument_.SetMixedBlackList(config_.mixed_black_list_); } // NOTE All the members in AnalysisConfig should be copied to Argument. @@ -1360,6 +1362,10 @@ CreatePaddlePredictor( config.SetInValid(); auto predictor_p = dynamic_cast(predictor.get()); +#ifdef PADDLE_WITH_TENSORRT + paddle::framework::ir::patterns::KeyCounter::Instance().CleanCounter(); +#endif + if (!predictor_p->Init(nullptr)) { return nullptr; } @@ -2083,6 +2089,8 @@ USE_TRT_CONVERTER(top_k) USE_TRT_CONVERTER(top_k_v2) USE_TRT_CONVERTER(squeeze2) USE_TRT_CONVERTER(unsqueeze2) +USE_TRT_CONVERTER(fill_constant) +USE_TRT_CONVERTER(fused_token_prune) #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) USE_TRT_CONVERTER(sparse_fc) USE_TRT_CONVERTER(sparse_multihead_matmul) diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index d5897e3c4f2a7..054b4668c4cc6 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -156,3 +156,7 @@ std::shared_ptr MakeCipher(const std::string &config_file) { #endif } // namespace paddle + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 2acd96b3fb97c..02d5f91d630ce 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -38,3 +38,9 @@ cc_test( zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. 
+ copy_onnx(zero_copy_tensor_test) +endif() diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 4040d09c4519e..7bb384b27381d 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -179,13 +179,6 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { -#ifdef PADDLE_WITH_ONNXRUNTIME - if (is_ort_tensor_) { - ORTCopyFromCpu(data); - return; - } -#endif - EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, @@ -731,112 +724,6 @@ void Tensor::SetOrtBuffer(const std::shared_ptr> buffer) { buffer_ = buffer; } -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - float *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - int64_t *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - int32_t *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - uint8_t *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - int8_t *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - float16 *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor(memory_info, - static_cast(data), - size * sizeof(float16), - shape, - shape_len, - ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); -} - -template -void Tensor::ORTCopyFromCpu(const T *data) { - auto binding = binding_.lock(); - PADDLE_ENFORCE_NOT_NULL(binding, - paddle::platform::errors::PreconditionNotMet( - "input tensor [%s] no binding ptr", name_)); - const char *device_name = place_ == PlaceType::kCPU ? 
"Cpu" : "Cuda"; - Ort::MemoryInfo memory_info( - device_name, OrtDeviceAllocator, device_, OrtMemTypeDefault); - size_t size = std::accumulate( - begin(shape_), end(shape_), 1UL, std::multiplies()); - auto buffer = buffer_.lock(); - size_t buffer_size = size * sizeof(T); - if (buffer_size > buffer->size()) { - buffer->resize(buffer_size); - } - std::memcpy(static_cast(buffer->data()), data, buffer_size); - - auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; - if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Found undefined data type for onnxruntime, only supports " - "float16/float32/float64/int8/uint8/int32/int64.")); - } - - auto ort_value = Ort::Value::CreateTensor(memory_info, - buffer->data(), - buffer_size, - shape_.data(), - shape_.size(), - onnx_dtype); - binding->BindInput(name_.c_str(), ort_value); -} - template void Tensor::ORTCopyToCpu(T *data) const { auto binding = binding_.lock(); @@ -857,13 +744,6 @@ void Tensor::ORTCopyToCpu(T *data) const { } } -template void Tensor::ORTCopyFromCpu(const float *data); -template void Tensor::ORTCopyFromCpu(const int64_t *data); -template void Tensor::ORTCopyFromCpu(const int32_t *data); -template void Tensor::ORTCopyFromCpu(const uint8_t *data); -template void Tensor::ORTCopyFromCpu(const int8_t *data); -template void Tensor::ORTCopyFromCpu(const float16 *data); - template void Tensor::ORTCopyToCpu(float *data) const; template void Tensor::ORTCopyToCpu(int32_t *data) const; template void Tensor::ORTCopyToCpu(uint8_t *data) const; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index bca2cde0fc2c6..cef7402e6c061 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -142,7 +142,7 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( scales_[var_name] = scales_[input_var_name]; } compute_scale = false; - } else if (op->Type() == "slice") { + } else if (op->Type() == "slice" || op->Type() == "shape") { auto input_var_name = op->Input("Input")[0]; PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index d4fa78518e149..bfe6c5a94776a 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -45,6 +45,9 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["slice"]["Input"] = ScaleAlgo::KL; rules_["slice"]["Out"] = ScaleAlgo::NONE; + rules_["shape"]["Input"] = ScaleAlgo::KL; + rules_["shape"]["Out"] = ScaleAlgo::NONE; + rules_["fc"]["Input"] = ScaleAlgo::KL; rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T; rules_["fc"]["Bias"] = ScaleAlgo::NONE; @@ -62,6 +65,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["elementwise_mul"]["Y"] = ScaleAlgo::KL; 
rules_["elementwise_mul"]["Out"] = ScaleAlgo::KL; + rules_["elementwise_sub"]["X"] = ScaleAlgo::KL; + rules_["elementwise_sub"]["Y"] = ScaleAlgo::KL; + rules_["elementwise_sub"]["Out"] = ScaleAlgo::KL; + // Reshape2 does not perform calculation on the data and shapes are not // changed. Scale is calculated on input data and assign to Quantize and // Dequantize scale. diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index 83919ad13967d..5313db6442986 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -24,11 +24,10 @@ #include #include -#include "paddle/fluid//platform/device/gpu/gpu_types.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" @@ -97,6 +96,7 @@ bool ONNXRuntimePredictor::Init() { } else { place_ = paddle::platform::CPUPlace(); } + scope_.reset(new paddle::framework::Scope()); char *onnx_proto = nullptr; int out_size; @@ -147,6 +147,8 @@ bool ONNXRuntimePredictor::Init() { Ort::Allocator allocator(session_, memory_info); size_t n_inputs = session_.GetInputCount(); + framework::proto::VarType::Type proto_type = + framework::proto::VarType::LOD_TENSOR; for (size_t i = 0; i < n_inputs; ++i) { auto input_name = session_.GetInputName(i, allocator); auto type_info = session_.GetInputTypeInfo(i); @@ -155,6 +157,10 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + + auto *ptr = scope_->Var(input_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(input_name); } @@ -249,13 +255,13 @@ bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), - true, - platform::errors::PreconditionNotMet( - "The in variable named %s is not found in the " - "ONNXPredictor.", - name)); - std::unique_ptr res(new ZeroCopyTensor(nullptr, this)); + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()), this)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -264,16 +270,6 @@ std::unique_ptr ONNXRuntimePredictor::GetInputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } - res->SetOrtMark(true); - res->SetOrtBinding(binding_); - auto iter = input_buffers_.find(name); - if (iter == input_buffers_.end()) { - std::vector i_vector; - input_buffers_[name] = std::make_shared>(i_vector); - res->SetOrtBuffer(input_buffers_[name]); - } else { - res->SetOrtBuffer(iter->second); - } return res; } @@ -306,6 +302,24 @@ std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( return res; } +Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, + const char *device_name) { + 
Ort::MemoryInfo memory_info( + device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); + std::vector shape = phi::vectorize(tensor->dims()); + return Ort::Value::CreateTensor(memory_info, + static_cast(tensor->data()), + size, + shape.data(), + shape.size(), + desc.dtype); +} + bool ONNXRuntimePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { @@ -315,7 +329,13 @@ bool ONNXRuntimePredictor::Run(const std::vector &inputs, bool ONNXRuntimePredictor::ZeroCopyRun() { try { - const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; + const char *device_name = platform::is_cpu_place(place_) ? "Cpu" : "Cuda"; + std::vector inputs; + inputs.reserve(input_desc_.size()); + for (auto desc : input_desc_) { + inputs.push_back(GetOrtValue(desc, device_name)); + binding_->BindInput(desc.name.c_str(), inputs.back()); + } for (auto output : output_desc_) { Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, @@ -333,8 +353,10 @@ bool ONNXRuntimePredictor::ZeroCopyRun() { } std::unique_ptr ONNXRuntimePredictor::Clone(void *stream) { - LOG(ERROR) << "Not support Clone(), Please create new Predictor"; - return nullptr; + std::lock_guard lk(clone_mutex_); + auto *x = new ONNXRuntimePredictor(config_); + x->Init(); + return std::unique_ptr(x); } uint64_t ONNXRuntimePredictor::TryShrinkMemory() { diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 27ce4529a8fe8..b8f0ad0a52941 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -21,8 +21,6 @@ #include "onnxruntime_c_api.h" // NOLINT #include "onnxruntime_cxx_api.h" // NOLINT -#include "paddle/fluid/framework/naive_executor.h" -#include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" @@ -94,7 +92,7 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// \param[in] AnalysisConfig config /// explicit ONNXRuntimePredictor(const AnalysisConfig &config) - : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") { + : env_(ORT_LOGGING_LEVEL_WARNING, "onnx"), config_(config) { predictor_id_ = inference::GetUniqueId(); } /// @@ -176,6 +174,8 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// std::unique_ptr Clone(void *stream = nullptr) override; + std::shared_ptr scope_; + protected: const void *GetDeviceContexts() const override; @@ -191,14 +191,24 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// bool FindONNXDesc(const std::string &name, bool is_input); - private: - AnalysisConfig config_; + /// \brief get the Ort Value(input Tensor). 
+ /// + /// \param[in] desc ONNXDesce(name、shape、dtype) + /// + /// \param[in] device_name "cpu" or "gpu" of device + /// + /// \return get a Ort::Value + /// + Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); + private: // ONNXRuntime Ort::Env env_; Ort::Session session_{nullptr}; std::shared_ptr binding_; + AnalysisConfig config_; + std::mutex clone_mutex_; platform::Place place_; std::vector input_desc_; std::vector output_desc_; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 6de23e930836a..08d0e073babc1 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -914,6 +914,14 @@ struct PD_INFER_DECL AnalysisConfig { const DistConfig& dist_config() const { return dist_config_; } + /// + /// \brief Set a list of operators that do not support mixed precision. This + /// interface is in the experimental stage and may change in the future. Note + /// that the blacklist must be the same as the model conversion blacklist. + /// + void Exp_SetBlackListOpsForMixedModel( + const std::unordered_set& black_list); + protected: // Update the config. void Update(); @@ -926,6 +934,9 @@ struct PD_INFER_DECL AnalysisConfig { mutable std::string prog_file_; mutable std::string params_file_; + // Mixed precision. + std::unordered_set mixed_black_list_; + // GPU related. bool use_gpu_{false}; int gpu_device_id_{0}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 73c216290dd88..3642a28790aec 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -160,6 +160,10 @@ const std::vector kGpuLowerPrecisionPasses{ const std::vector kTrtLowerPrecisionPasses{ // "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", + "trt_map_matmul_v2_to_mul_pass", + "trt_map_matmul_v2_to_matmul_pass", + "trt_map_matmul_to_mul_pass", + "fc_fuse_pass", "tensorrt_subgraph_pass", }; @@ -298,6 +302,7 @@ void CpuPassStrategy::EnableMKLDNN() { "softplus_activation_mkldnn_fuse_pass", // "shuffle_channel_mkldnn_detect_pass", // "elt_act_mkldnn_fuse_pass", // + "matmul_activation_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. 
// https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after diff --git a/paddle/fluid/inference/capi/CMakeLists.txt b/paddle/fluid/inference/capi/CMakeLists.txt index 73ba41607aae8..c6ee6bab3c776 100644 --- a/paddle/fluid/inference/capi/CMakeLists.txt +++ b/paddle/fluid/inference/capi/CMakeLists.txt @@ -20,7 +20,7 @@ cc_library( SRCS ${C_API_SRCS} DEPS paddle_inference) -if(NOT ON_INFER) +if(NOT ON_INFER AND NOT WIN32) return() endif() diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt index e35e14a0c0241..089a766b91cfe 100644 --- a/paddle/fluid/inference/capi_exp/CMakeLists.txt +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -20,7 +20,7 @@ cc_library( SRCS ${C_API_SRCS} DEPS paddle_inference) -if(NOT ON_INFER) +if(NOT ON_INFER AND NOT WIN32) return() endif() diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt index 7aa010cb0066c..3f4992b8946ec 100644 --- a/paddle/fluid/inference/lite/CMakeLists.txt +++ b/paddle/fluid/inference/lite/CMakeLists.txt @@ -5,7 +5,7 @@ endif() cc_library( lite_op_teller SRCS op_teller.cc - DEPS ${LITE_DEPS} framework_proto device_context boost xxhash) + DEPS ${LITE_DEPS} framework_proto device_context xxhash) cc_library( lite_engine SRCS engine.cc @@ -13,7 +13,7 @@ cc_library( cc_library( lite_tensor_utils SRCS tensor_utils.cc - DEPS memcpy ${LITE_DEPS} framework_proto boost device_context ${XPU_DEPS}) + DEPS memcpy ${LITE_DEPS} framework_proto device_context ${XPU_DEPS}) cc_test( test_lite_engine SRCS test_engine_lite.cc diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index dee83f70ba2a2..45b9d222c4c3e 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -81,7 +81,7 @@ void make_fake_model(std::string* model, std::string* param) { ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); #endif // Prepare variables. 
std::vector repetitive_params{"x", "y"}; diff --git a/paddle/fluid/inference/paddle_inference_custom_device.map b/paddle/fluid/inference/paddle_inference_custom_device.map index 52bc2870482e2..d78860e0a2070 100644 --- a/paddle/fluid/inference/paddle_inference_custom_device.map +++ b/paddle/fluid/inference/paddle_inference_custom_device.map @@ -5,6 +5,7 @@ *profile*; *phi*; *FLAGS_*; + PD_*; local: *; }; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 0f1350459ef22..7239b506d33f6 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -4,18 +4,18 @@ if(WIN32) nv_library( tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc - DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost + DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context paddle_inference_api) else() nv_library( tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc - DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost) + DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) endif() nv_library( tensorrt_op_teller SRCS op_teller.cc - DEPS framework_proto device_context boost) + DEPS framework_proto device_context) nv_test( test_tensorrt SRCS test_tensorrt.cc @@ -24,5 +24,12 @@ nv_test( test_tensorrt_engine SRCS test_engine.cc test_dynamic_engine.cc DEPS dynload_cuda tensorrt_engine tensorrt_plugin) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(test_tensorrt_engine) +endif() + add_subdirectory(plugin) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index c999c009605ee..519daba2747d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -68,7 +68,9 @@ list( c_allreduce_op.cc top_k_op.cc squeeze2_op.cc - unsqueeze2_op.cc) + unsqueeze2_op.cc + fill_constant_op.cc + fused_token_prune_op.cc) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc) @@ -85,3 +87,9 @@ nv_test( SRCS test_op_converter.cc DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. 
+ copy_onnx(test_op_converter) +endif() diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc index 44283f4e0d7e9..017fa8800b458 100644 --- a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -50,22 +50,26 @@ class AffineChannelOpConverter : public OpConverter { auto* scale_v = scope.FindVar(scale_name); auto* scale_t = scale_v->GetMutable(); - float* scale_ptr = engine_->GetWeightCPUData(scale_name, scale_t); + float* scale_ptr = const_cast(static_cast( + engine_->GetFp32TrtWeight(scale_name, *scale_t).get().values)); auto* bias_v = scope.FindVar(bias_name); auto* bias_t = bias_v->GetMutable(); - float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t); + float* bias_ptr = const_cast(static_cast( + engine_->GetFp32TrtWeight(bias_name, *bias_t).get().values)); // tensorrt scalend layer only support spatial dims >= 2, // so nhwc is not availabe (spatial dims == 0) const int channel_axis = engine_->with_dynamic_shape(); - TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, - static_cast(scale_ptr), - (size_t)idim.d[channel_axis]}; - TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, - static_cast(bias_ptr), - (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight scale_weights{ + nvinfer1::DataType::kFLOAT, + static_cast(scale_ptr), + static_cast(idim.d[channel_axis])}; + TensorRTEngine::Weight bias_weights{ + nvinfer1::DataType::kFLOAT, + static_cast(bias_ptr), + static_cast(idim.d[channel_axis])}; TensorRTEngine::Weight power_weights{ nvinfer1::DataType::kFLOAT, nullptr, 0}; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 376dfa32e879f..c5dae16bc3cac 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -169,7 +169,7 @@ class BatchNormOpConverter : public OpConverter { engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); if (x_dim.nbDims < 3 + dynamic_shape_offset) { - layer->getOutput(0)->setName("batch_norm_out"); + layer->getOutput(0)->setName(("BN: ScaleNd: " + output_name).c_str()); layer->setName(("BN: ScaleNd: (Output: " + output_name + ")").c_str()); nvinfer1::Dims squeeze_shape; squeeze_shape.nbDims = x_dim.nbDims; diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 244078dc344a2..c47f6d03cd543 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace framework { @@ -48,7 +50,7 @@ void ConvertConv2d(TensorRTEngine* engine, platform::errors::NotFound("Can not find %s presistale var in scope.", filter_var_name)); auto* Y_t = Y_v->GetMutable(); - float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); if (enable_int8) { @@ -57,7 +59,6 @@ void ConvertConv2d(TensorRTEngine* engine, engine->SetTensorDynamicRange(X, in_scale); #endif } - weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t); PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL, @@ -104,21 +105,19 @@ void ConvertConv2d(TensorRTEngine* engine, nv_post_paddings.d[1] = paddings[3]; } - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; - float* bias_data = nullptr; - size_t bias_size = 0; + auto weight = engine->GetTrtWeight(op_desc.Input("Filter").front(), *Y_t); + + TensorRTEngine::Weight bias; + bias.SetDataType(weight.get().type); + bias.SetCount(0); + bias.SetValues(nullptr); if (op_desc.Type() == "conv2d_fusion") { auto* bias_tensor = scope.GetVar(op_desc.Input("Bias").front()); auto* bias_tensor_data = bias_tensor->GetMutable(); - bias_data = engine->GetWeightCPUData(op_desc.Input("Bias").front(), - bias_tensor_data); - bias_size = static_cast(bias_tensor_data->numel()); + bias = + engine->GetTrtWeight(op_desc.Input("Bias").front(), *bias_tensor_data); } - TensorRTEngine::Weight bias{ - nvinfer1::DataType::kFLOAT, static_cast(bias_data), bias_size}; // In conv2d_transpose and depthwise_conv2d_transpose, // output channels = filter_dims[1] * groups auto* layer = (op_desc.Type() == "conv2d_transpose" || diff --git a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc index 7308d44bf8320..4ffc805654727 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc @@ -48,14 +48,12 @@ void ConvertConv3d(TensorRTEngine* engine, platform::errors::NotFound("Can not find %s presistale var in scope.", filter_var_name)); auto* Y_t = Y_v->GetMutable(); - float* weight_data = nullptr; bool enable_int8 = op_desc.HasAttr("enable_int8"); if (enable_int8) { float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")); engine->SetTensorDynamicRange(X, in_scale); } - weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t); PADDLE_ENFORCE_EQ(Y_t->dims().size(), 5UL, @@ -85,14 +83,12 @@ void ConvertConv3d(TensorRTEngine* engine, nvinfer1::Dims3 nv_strides(strides[0], strides[1], strides[2]); nvinfer1::Dims3 nv_paddings(paddings[0], paddings[1], paddings[2]); - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; + auto weight = engine->GetTrtWeight(op_desc.Input("Filter").front(), *Y_t); float* bias_data = nullptr; size_t bias_size = 0; TensorRTEngine::Weight bias{ - nvinfer1::DataType::kFLOAT, static_cast(bias_data), bias_size}; + weight.get().type, static_cast(bias_data), bias_size}; // In conv3d_transpose output channels = filter_dims[1] * groups auto* layer = (op_desc.Type() == "conv3d_transpose") ? 
fadd_layer(X, n_input * groups, nv_ksize, weight, bias) diff --git a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc index f0a82bebc7ca9..8cf7f6528e595 100644 --- a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc @@ -49,8 +49,6 @@ class DeformableConvOpConverter : public OpConverter { auto* filter_var = scope.FindVar(filter_name); auto* filter_tensor = filter_var->GetMutable(); - float* filter_data = engine_->GetWeightCPUData(filter_name, filter_tensor); - const int c_o = filter_tensor->dims()[0]; const int c_i = filter_tensor->dims()[1]; const int k_h = filter_tensor->dims()[2]; @@ -73,15 +71,20 @@ class DeformableConvOpConverter : public OpConverter { weights.count = filter_tensor->numel(); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); if (with_fp16) { - auto half_filter_data = new half[filter_tensor->numel()]; - for (int i = 0; i < filter_tensor->numel(); i++) { - half_filter_data[i] = static_cast(filter_data[i]); + auto filter_weight = engine_->GetTrtWeight(filter_name, *filter_tensor); + if (filter_weight.get().type == nvinfer1::DataType::kFLOAT) { + auto half_filter_data = new half[filter_tensor->numel()]; + for (int i = 0; i < filter_tensor->numel(); i++) { + half_filter_data[i] = static_cast( + static_cast(filter_weight.get().values)[i]); + } + weights.type = nvinfer1::DataType::kHALF; + weights.values = half_filter_data; + } else if (filter_weight.get().type == nvinfer1::DataType::kHALF) { + weights = filter_weight.get(); } - weights.type = nvinfer1::DataType::kHALF; - weights.values = half_filter_data; } else { - weights.type = nvinfer1::DataType::kFLOAT; - weights.values = filter_data; + weights = engine_->GetFp32TrtWeight(filter_name, *filter_tensor).get(); } auto* deformable_conv_plugin = new plugin::DeformableConvPlugin( with_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 2d342a6f7040d..7fd89dd731a8e 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -33,27 +33,42 @@ class ElementwiseTensorOpConverter : public OpConverter { if (Y_v) { // Y is weight auto* Y_t = Y_v->GetMutable(); - float* weight_data = - engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t); std::vector dims_y = phi::vectorize(Y_t->dims()); - TensorRTEngine::Weight y_weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; + auto y_weight = engine_->GetTrtWeight(op_desc.Input("Y").front(), *Y_t); + nvinfer1::Dims trt_dims_y; trt_dims_y.nbDims = dims_y.size(); for (int i = 0; i < trt_dims_y.nbDims; i++) { trt_dims_y.d[i] = dims_y[i]; } + // this is the special case when dims_y includes batch dimension! + // we need remove batch dimension! + if (!engine_->with_dynamic_shape() && + trt_dims_y.nbDims == (X->getDimensions().nbDims + 1)) { + trt_dims_y.nbDims--; + PADDLE_ENFORCE_EQ(trt_dims_y.d[0], + 1, + platform::errors::InvalidArgument( + "Elementwise type(%s) op's Y is a weight " + "including batch dimension. 
Please " + "check if the 0th dimension equals 1.", + op_type_)); + for (int i = 0; i < trt_dims_y.nbDims; i++) { + trt_dims_y.d[i] = trt_dims_y.d[i + 1]; + } + } Y = TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims_y, y_weight.get()) ->getOutput(0); } else { Y = engine_->GetITensor(op_desc.Input("Y").front()); } - + bool swap_xy = false; + // Swap X and Y if (X->getDimensions().nbDims < Y->getDimensions().nbDims) { auto* tmp = X; X = Y; Y = tmp; + swap_xy = true; } nvinfer1::Dims dims_x = X->getDimensions(); nvinfer1::Dims dims_y = Y->getDimensions(); @@ -117,6 +132,13 @@ class ElementwiseTensorOpConverter : public OpConverter { reshape_y_tensor = Y; } + // We should swap X and Y back, because some operators do not have symmetry + if (swap_xy) { + auto* tmp = reshape_y_tensor; + reshape_y_tensor = X; + X = tmp; + } + auto op_pair = ops.find(op_type_); PADDLE_ENFORCE_NE(op_pair, ops.end(), diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 1a1f72388e40e..5020b97627753 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -10,8 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/utils.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h" +#include "paddle/phi/core/ddim.h" namespace paddle { namespace framework { @@ -73,27 +76,39 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { // input_embs[0]: word_embedding // input_embs[1]: pos_embedding // input_embs[2]: sent_embedding - std::vector input_embs; + std::vector input_embs; std::vector emb_sizes; // get the presistable var's data - auto get_persistable_data = [&](const std::string& var_name, - framework::DDim* dims) -> float* { + auto GetWeight = [&](const std::string& var_name, + framework::DDim* dim) -> TensorRTEngine::Weight { auto* temp_var = scope.FindVar(var_name); auto* temp_tensor = temp_var->GetMutable(); - (*dims) = temp_tensor->dims(); + *dim = temp_tensor->dims(); + auto weight = engine_->GetTrtWeight(var_name, *temp_tensor); + return weight; + }; - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); - return temp_data; + auto GetFp32Weight = [&](const std::string& var_name, + framework::DDim* dim) -> TensorRTEngine::Weight { + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + *dim = temp_tensor->dims(); + auto weight = engine_->GetFp32TrtWeight(var_name, *temp_tensor); + return weight; }; int hidden = 0; for (int i = 0; i < input_num; i++) { framework::DDim emb_dims; - float* emb_data = get_persistable_data(emb_names[i], &emb_dims); - int64_t emb_size = phi::product(emb_dims); - input_embs.push_back(emb_data); - emb_sizes.push_back(emb_size); + TensorRTEngine::Weight weight; + if (flag_varseqlen) { + weight = GetWeight(emb_names[i], &emb_dims); + } else { + weight = GetFp32Weight(emb_names[i], &emb_dims); + } + input_embs.push_back(weight.get()); + emb_sizes.push_back(weight.get().count); PADDLE_ENFORCE_EQ( emb_dims.size(), 2, @@ -103,11 +118,15 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { } framework::DDim bias_dims, scale_dims; + 
TensorRTEngine::Weight bias_weight, scale_weight; + if (flag_varseqlen) { + bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims); + scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims); + } else { + bias_weight = GetFp32Weight(op_desc.Input("Bias").front(), &bias_dims); + scale_weight = GetFp32Weight(op_desc.Input("Scale").front(), &scale_dims); + } - auto* bias = - get_persistable_data(op_desc.Input("Bias").front(), &bias_dims); - auto* scale = - get_persistable_data(op_desc.Input("Scale").front(), &scale_dims); int64_t bias_size = phi::product(bias_dims); int64_t scale_size = phi::product(scale_dims); nvinfer1::ILayer* layer = nullptr; @@ -134,24 +153,24 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { "But Precision::KFloat32 is setted.")); const std::vector fields{ {"bert_embeddings_layernorm_beta", - bias, - nvinfer1::PluginFieldType::kFLOAT32, + bias_weight.get().values, + GetPluginFieldType(bias_weight.get().type), static_cast(bias_size)}, {"bert_embeddings_layernorm_gamma", - scale, - nvinfer1::PluginFieldType::kFLOAT32, + scale_weight.get().values, + GetPluginFieldType(scale_weight.get().type), static_cast(scale_size)}, {"bert_embeddings_word_embeddings", - input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, + input_embs[0].values, + GetPluginFieldType(input_embs[0].type), static_cast(emb_sizes[0])}, {"bert_embeddings_token_type_embeddings", - input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, + input_embs[2].values, + GetPluginFieldType(input_embs[2].type), static_cast(emb_sizes[2])}, {"bert_embeddings_position_embeddings", - input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, + input_embs[1].values, + GetPluginFieldType(input_embs[1].type), static_cast(emb_sizes[1])}, {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, }; @@ -235,15 +254,23 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin = new plugin::EmbEltwiseLayernormPluginDynamic(input_embs, - bias, - scale, - emb_sizes, - bias_size, - scale_size, - hidden, - eps, - with_fp16); + std::vector input_embs_data; + for (size_t i = 0; i < input_embs.size(); ++i) { + input_embs_data.push_back(const_cast( + static_cast(input_embs[i].values))); + } + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs_data, + const_cast( + static_cast(bias_weight.get().values)), + const_cast( + static_cast(scale_weight.get().values)), + emb_sizes, + bias_size, + scale_size, + hidden, + eps, + with_fp16); layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput( diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index ce6644cad4200..0d61dc6d0ea96 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -27,6 +27,16 @@ class OpDesc; namespace paddle { namespace inference { namespace tensorrt { +namespace { +template +void tranpose_weight(const T* src, T* dst, int m, int n) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + dst[j * m + i] = src[i * n + j]; + } + } +} +} // namespace /* * FC converter convert a MUL op in Fluid to a FC layer in TRT. @@ -156,9 +166,7 @@ class FcOpConverter : public OpConverter { op_desc.HasAttr("activation_type") ? 
BOOST_GET_CONST(std::string, op_desc.GetAttr("activation_type")) : ""; - // This may trigger a GPU->CPU copy, because TRT's weight can only be - // assigned from CPU memory, which can't be avoided. - float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); bool support_int8 = false; if (op_desc.HasAttr("support_int8")) { @@ -173,7 +181,6 @@ class FcOpConverter : public OpConverter { } engine_->SetTensorDynamicRange(X, in_scale); } - weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t); PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL, @@ -183,13 +190,6 @@ class FcOpConverter : public OpConverter { Y_t->dims().size())); // a matrix int m = Y_t->dims()[0]; int n = Y_t->dims()[1]; - auto tranpose_weight = [](const float* src, float* dst, int m, int n) { - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - dst[j * m + i] = src[i * n + j]; - } - } - }; auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, @@ -283,11 +283,36 @@ class FcOpConverter : public OpConverter { transpose_y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y")); } int weight_w, weight_h; + auto weight = engine_->GetTrtWeight(op_desc.Input(w_name).front(), *Y_t); + if (!transpose_y) { - std::vector weight_data_tmp; - weight_data_tmp.reserve(Y_t->numel()); - memcpy(weight_data_tmp.data(), weight_data, Y_t->numel() * sizeof(float)); - tranpose_weight(weight_data_tmp.data(), weight_data, m, n); + if (weight.get().type == nvinfer1::DataType::kFLOAT) { + std::vector weight_data_tmp; + weight_data_tmp.reserve(Y_t->numel()); + memcpy(weight_data_tmp.data(), + weight.get().values, + Y_t->numel() * sizeof(float)); + tranpose_weight( + weight_data_tmp.data(), + const_cast(static_cast(weight.get().values)), + m, + n); + } else if (weight.get().type == nvinfer1::DataType::kHALF) { + std::vector weight_data_tmp; + weight_data_tmp.reserve(Y_t->numel()); + memcpy(weight_data_tmp.data(), + weight.get().values, + Y_t->numel() * sizeof(float16)); + tranpose_weight(weight_data_tmp.data(), + const_cast( + static_cast(weight.get().values)), + m, + n); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Paddle-TRT fc convert not supporte dtype, now only support fp32 " + "and fp16.")); + } weight_w = n; weight_h = m; } else { @@ -295,112 +320,36 @@ class FcOpConverter : public OpConverter { weight_h = n; } size_t n_output = weight_w; - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; weight.dims.assign({weight_w, weight_h}); - float* bias_data = nullptr; - int bias_num = 0; + TensorRTEngine::Weight bias{weight.get().type, nullptr, 0}; if (with_bias) { auto* b_v = scope.GetVar(op_desc.Input("Bias").front()); auto* b_t = b_v->GetMutable(); - bias_data = engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t); - bias_num = b_t->numel(); + bias = engine_->GetTrtWeight(op_desc.Input("Bias").front(), *b_t); } - TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, - static_cast(bias_data), - static_cast(bias_num)}; // Running the TRT Static Shape mode: x_num_col_dims-1 if (!engine_->with_dynamic_shape()) { x_num_col_dims--; } - // If use tensorrt'oss, the x_dim and x_num_col_dims need change, and can - // not add Shuffle layer in ernie's multihead. 
- if (x_dim.nbDims == 4 && x_num_col_dims == 1) { - if (enable_int8 || support_int8) { - // add conv1x1 layer - nvinfer1::DimsHW nv_ksize(1, 1); - auto* fc_layer_int8 = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, - *X, - n_output, - nv_ksize, - weight.get(), - bias.get()); - if (activation_type == "relu") { - fc_layer_int8->setName( - ("ernie_fc_op_int8: Convolution (Output: " + output_name + ")") - .c_str()); - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("out_threshold"), - true, - platform::errors::InvalidArgument( - "must have out threshold in fc layers in int8 mode")); - float out_scale = 0; - if (enable_int8) { - out_scale = - BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); - } else { - out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Out")); - } - engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), - out_scale); - nvinfer1::IActivationLayer* relu_layer_int8 = - TRT_ENGINE_ADD_LAYER(engine_, - Activation, - *(fc_layer_int8->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer_int8, - "relu_after_ernie_fc_int8", - {output_name}, - test_mode); - } else { - RreplenishLayerAndOutput(fc_layer_int8, - "ernie_fc_op_int8: Convolution", - {output_name}, - test_mode); - } - } else { - // add fc layer - auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *X, n_output, weight.get(), bias.get()); - if (activation_type == "relu") { - fc_layer_float->setName( - ("ernie_fc_op_float: (Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer_float = - TRT_ENGINE_ADD_LAYER(engine_, - Activation, - *(fc_layer_float->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer_float, - "relu_after_ernie_fc_float", - {output_name}, - test_mode); - } else { - RreplenishLayerAndOutput( - fc_layer_float, "ernie_fc_op_float", {output_name}, test_mode); - } - } - } else { // need reshape input before and after fc - PADDLE_ENFORCE_GT( - x_dim.nbDims, - x_num_col_dims, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_dim.nbDims > x_num_col_dims, but " - "x_dim.nbDims : %d, x_num_col_dims : %d.", - x_dim.nbDims, - x_num_col_dims)); - auto* reshape_before_fc_layer = - reshape_before_fc(X, x_dim, x_num_col_dims, output_name); - auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); - if (enable_int8 || support_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } - regist_fc(reshape_itensor, n_output, weight, bias); + PADDLE_ENFORCE_GT( + x_dim.nbDims, + x_num_col_dims, + platform::errors::InvalidArgument( + "Params and input dims mismatch. Paddle-TRT FC " + "converter expects x_dim.nbDims > x_num_col_dims, but " + "x_dim.nbDims : %d, x_num_col_dims : %d.", + x_dim.nbDims, + x_num_col_dims)); + // need reshape input before and after fc + auto* reshape_before_fc_layer = + reshape_before_fc(X, x_dim, x_num_col_dims, output_name); + auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); + if (enable_int8 || support_int8) { + engine_->SetTensorDynamicRange(reshape_itensor, in_scale); } + regist_fc(reshape_itensor, n_output, weight, bias); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc new file mode 100644 index 0000000000000..53eb3f2c89732 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class FillConstantOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(4) + << "convert a fluid fill_constant op to tensorrt fill_constant layer"; + + framework::OpDesc op_desc(op, nullptr); + int dtype = BOOST_GET_CONST(int, op_desc.GetAttr("dtype")); + std::string str_value = + BOOST_GET_CONST(std::string, op_desc.GetAttr("str_value")); + std::vector shape = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("shape")); + std::unique_ptr out_tensor(new framework::Tensor()); + out_tensor->Resize(phi::make_ddim(shape)); + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; + void* trt_data = nullptr; + size_t trt_num; + if (dtype == 2 || dtype == 3) { // int,int64 + auto* tmp_ptr = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); i++) + tmp_ptr[i] = std::stoi(str_value); + trt_dtype = nvinfer1::DataType::kINT32; + trt_data = static_cast(tmp_ptr); + } else if (dtype == 5) { // float + auto* tmp_ptr = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); i++) + tmp_ptr[i] = std::stof(str_value); + trt_data = static_cast(tmp_ptr); + } + + trt_num = static_cast(out_tensor->numel()); + engine_->SetWeights("fill_constant_value", std::move(out_tensor)); + TensorRTEngine::Weight weight{trt_dtype, trt_data, trt_num}; + + nvinfer1::Dims trt_in_shape; + trt_in_shape.nbDims = shape.size(); + for (size_t i = 0; i < shape.size(); i++) trt_in_shape.d[i] = shape[i]; + nvinfer1::ILayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_in_shape, weight.get()); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "fill_constant", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fill_constant, FillConstantOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc new file mode 100644 index 0000000000000..bab04ac16aac9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
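A hedged sketch of the dtype handling in the new fill_constant converter: Paddle dtype codes 2 and 3 (int32/int64) are materialized as an int32 buffer and code 5 (float32) as a float buffer, with str_value parsed once and broadcast to every element. The helper below uses plain vectors instead of framework::Tensor and is illustrative only.

#include <cstdint>
#include <string>
#include <vector>

// Number of elements implied by a static shape attribute.
inline int64_t NumElements(const std::vector<int64_t>& shape) {
  int64_t n = 1;
  for (auto d : shape) n *= d;
  return n;
}

// dtype codes follow Paddle's proto: 2 = int32, 3 = int64, 5 = float32.
void FillConstantBuffers(int dtype, const std::string& str_value,
                         const std::vector<int64_t>& shape,
                         std::vector<int32_t>* int_buf,
                         std::vector<float>* float_buf) {
  const int64_t numel = NumElements(shape);
  if (dtype == 2 || dtype == 3) {
    int_buf->assign(numel, std::stoi(str_value));  // int64 is narrowed to int32 for TRT
  } else if (dtype == 5) {
    float_buf->assign(numel, std::stof(str_value));
  }
}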
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class FusedTokenPruneOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; + + auto* Attn = engine_->GetITensor(op_desc.Input("Attn").front()); + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Mask = engine_->GetITensor(op_desc.Input("Mask").front()); + auto* NewMask = engine_->GetITensor(op_desc.Input("NewMask").front()); + bool keep_first_token = + op_desc.HasAttr("keep_first_token") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("keep_first_token")) + : true; + bool keep_order = op_desc.HasAttr("keep_order") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("keep_order")) + : false; + + std::vector itensors = {Attn, X, Mask, NewMask}; + + auto output_name = op_desc.Output("SlimmedX")[0]; + auto out_inds_name = op_desc.Output("CLSInds")[0]; + if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + + if (engine_->precision() == AnalysisConfig::Precision::kInt8) { + with_fp16 = true; + } + plugin::FusedTokenPrunePluginDynamic* plugin = + new plugin::FusedTokenPrunePluginDynamic( + with_fp16, keep_first_token, keep_order); + layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin); +#else + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); +#endif + } else { + PADDLE_THROW(platform::errors::Fatal( + "You are running the Ernie(Bert) model in static shape mode, which " + "is not supported for the time being.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) interface to set " + "the shape information to run the dynamic shape mode.")); + } + RreplenishLayerAndOutput( + layer, "fused_token_prune", {output_name, out_inds_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fused_token_prune, FusedTokenPruneOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc index f5a2026ff6fdf..1b45264475354 100644 --- a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc @@ -12,6 +12,7 @@ limitations under the License. 
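The converter above falls back to defaults when keep_first_token and keep_order are absent; a hedged sketch of that optional-attribute pattern, with a plain map standing in for OpDesc::HasAttr/GetAttr.

#include <string>
#include <unordered_map>

// attrs stands in for the op's attribute map; the real code goes through
// op_desc.HasAttr() and BOOST_GET_CONST(..., op_desc.GetAttr(...)).
bool GetBoolAttrOr(const std::unordered_map<std::string, bool>& attrs,
                   const std::string& name, bool default_value) {
  auto it = attrs.find(name);
  return it == attrs.end() ? default_value : it->second;
}

// Usage mirroring the converter:
//   bool keep_first_token = GetBoolAttrOr(attrs, "keep_first_token", true);
//   bool keep_order       = GetBoolAttrOr(attrs, "keep_order", false);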
*/ #include #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { namespace framework { @@ -44,30 +45,20 @@ class GroupNormOpConverter : public OpConverter { std::string bias_name = op_desc.Input("Bias").front(); // get the presistable var's data - auto get_persistable_data = [&](const std::string& var_name, - framework::DDim* dims) -> float* { + auto GetWeight = [&](const std::string& var_name, + framework::DDim* dims) -> TensorRTEngine::Weight { auto* temp_var = scope.FindVar(var_name); auto* temp_tensor = temp_var->GetMutable(); (*dims) = temp_tensor->dims(); - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); - return temp_data; + auto weight = engine_->GetTrtWeight(var_name, *temp_tensor); + return weight; }; framework::DDim scale_dims; framework::DDim bias_dims; - float* scale_data = get_persistable_data(scale_name, &scale_dims); - float* bias_data = get_persistable_data(bias_name, &bias_dims); - - int64_t scale_numel = phi::product(scale_dims); - int64_t bias_numel = phi::product(bias_dims); - - TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, - static_cast(scale_data), - static_cast(scale_numel)}; - TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, - static_cast(bias_data), - static_cast(bias_numel)}; + auto scale_weights = GetWeight(scale_name, &scale_dims); + auto bias_weights = GetWeight(bias_name, &bias_dims); nvinfer1::Dims scale_nv_dims; nvinfer1::Dims bias_nv_dims; diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index a82101e29f571..c899f4f6e777e 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -49,20 +49,10 @@ class LayerNormOpConverter : public OpConverter { auto* Bias_t = Bias_v->GetMutable(); auto* Scale_t = Scale_v->GetMutable(); - std::unique_ptr bias_tensor( - new framework::LoDTensor()); - std::unique_ptr scale_tensor( - new framework::LoDTensor()); - - bias_tensor->Resize(Bias_t->dims()); - scale_tensor->Resize(Scale_t->dims()); - - platform::CPUPlace cpu_place; - paddle::framework::TensorCopySync((*Bias_t), cpu_place, &(*bias_tensor)); - paddle::framework::TensorCopySync((*Scale_t), cpu_place, &(*scale_tensor)); - - auto* bias_data = bias_tensor->mutable_data(platform::CPUPlace()); - auto* scale_data = scale_tensor->mutable_data(platform::CPUPlace()); + auto bias_weight = + engine_->GetFp32TrtWeight(op_desc.Input("Bias").front(), *Bias_t); + auto scale_weight = + engine_->GetFp32TrtWeight(op_desc.Input("Scale").front(), *Scale_t); nvinfer1::ILayer* layernorm_layer = nullptr; if (engine_->with_dynamic_shape()) { @@ -73,14 +63,15 @@ class LayerNormOpConverter : public OpConverter { std::vector mean_shape{input_num}; std::vector variance_shape{input_num}; plugin::LayerNormPluginDynamic* plugin = - new plugin::LayerNormPluginDynamic(bias_data, - bias_tensor->numel(), - scale_data, - scale_tensor->numel(), - begin_norm_axis, - eps, - mean_shape, - variance_shape); + new plugin::LayerNormPluginDynamic( + static_cast(bias_weight.get().values), + bias_weight.get().count, + static_cast(scale_weight.get().values), + scale_weight.get().count, + begin_norm_axis, + eps, + mean_shape, + variance_shape); layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin); } else { int input_num = 1; @@ -89,23 +80,20 @@ class LayerNormOpConverter : public OpConverter { } std::vector 
mean_shape{input_num}; std::vector variance_shape{input_num}; - plugin::LayerNormPlugin* plugin = - new plugin::LayerNormPlugin(bias_data, - bias_tensor->numel(), - scale_data, - scale_tensor->numel(), - begin_norm_axis, - eps, - mean_shape, - variance_shape); + plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin( + static_cast(bias_weight.get().values), + bias_weight.get().count, + static_cast(scale_weight.get().values), + scale_weight.get().count, + begin_norm_axis, + eps, + mean_shape, + variance_shape); layernorm_layer = engine_->AddPlugin( &X, 1, reinterpret_cast(plugin)); } auto output_name = op_desc.Output("Y").front(); - engine_->SetWeights(op_desc.Input("Bias").front(), std::move(bias_tensor)); - engine_->SetWeights(op_desc.Input("Scale").front(), - std::move(scale_tensor)); RreplenishLayerAndOutput( layernorm_layer, "layer_norm", {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index d30dc5eb35b15..8bc44cc6ab9d2 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -48,9 +48,11 @@ class MultiheadMatMulOpConverter : public OpConverter { in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")); engine_->SetTensorDynamicRange(input, in_scale); } - weight_data = engine_->GetWeightCPUData(weight_name, weight_t); + weight_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(weight_name, *weight_t).get().values)); - float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t); + float* bias_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(bias_name, *bias_t).get().values)); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); memcpy( diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d179e8bb34c16..0eb2bc0875fdf 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -230,10 +230,54 @@ class OpConverter { const framework::Scope& scope, TensorRTEngine* engine) { std::unique_lock lk(mut_); + for (int i = 0; i < block.ops_size(); i++) { + SetEngine(engine); + const auto& op = block.ops(i); + framework::OpDesc op_desc(op, nullptr); + framework::Variable* X_v = nullptr; + std::string X_name; + // inputs : string -> std::vector + auto inputs = op_desc.Inputs(); + if (inputs.count("X")) { + X_name = op_desc.Input("X")[0]; + } else if (inputs.count("Input")) { + X_name = op_desc.Input("Input")[0]; + } else if (inputs.count("Y")) { + X_name = op_desc.Input("Y")[0]; + } + X_v = scope.FindVar(X_name); + // If this weight is shared between ops, it needn't to be convtered to + // itensor once again + if (engine->GetITensorMap()->count(X_name)) { + continue; + } + if (X_v) { + ConvertWeight2ITensor(scope, X_name); + } + } for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); ConvertOp(op, parameters, scope, engine); } + for (int i = 0; i < engine->network()->getNbLayers(); i++) { + auto layer = engine->network()->getLayer(i); + if (layer->getType() == nvinfer1::LayerType::kSHUFFLE) { + auto* input_tensor = layer->getInput(0); + auto* output_tensor = layer->getOutput(0); + auto output_tensor_name = output_tensor->getName(); + auto input_tensor_name = input_tensor->getName(); + if (engine->DynamicRangeIsSet(input_tensor) && + 
!engine->DynamicRangeIsSet(output_tensor)) { + float output_scale = engine->GetTensorDynamicRange(input_tensor); + VLOG(1) << "Set output tensor scale = " << output_scale + << " for tensor in TensorRT: " << output_tensor_name << "."; + engine->SetTensorDynamicRange(output_tensor, output_scale); + } else { + VLOG(1) << "Failed to get input tensor scale for tensor in TensorRT: " + << input_tensor_name << "."; + } + } + } } // The scope here should be inited with the parameter vars. @@ -273,8 +317,8 @@ class OpConverter { continue; } std::vector input_shape; - input_shape.push_back(-1); - for (size_t i = 1; i < ranks; i++) { + // input_shape.push_back(-1); + for (size_t i = 0; i < ranks; i++) { if (min_input_shape[i] != max_input_shape[i]) { input_shape.push_back(-1); } else { @@ -299,6 +343,8 @@ class OpConverter { FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), Vec2TRT_Dims(var_shape, input)); + VLOG(1) << "Set trt input [" << input << "] type is " + << var->Proto()->type().lod_tensor().tensor().data_type(); } } PADDLE_ENFORCE_EQ(all_dynamic_shape_set, @@ -402,6 +448,17 @@ class OpConverter { return c; } + nvinfer1::ITensor* FloorDiv(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *a, + *b, + nvinfer1::ElementWiseOperation::kFLOOR_DIV) + ->getOutput(0); + return c; + } + nvinfer1::ITensor* Act(nvinfer1::ITensor* a, nvinfer1::ActivationType act_type) { nvinfer1::ITensor* c = @@ -422,22 +479,27 @@ class OpConverter { ->getOutput(0); return tensor; } - - // Create and add Multi-D constant float layer - nvinfer1::ITensor* AddConstantLayer(const float* data, + template + // Create and add Multi-D constant float/int32 layer + nvinfer1::ITensor* AddConstantLayer(const T* data, const std::vector& weight_dims, const std::string& weight_name) { - std::unique_ptr tmp_tensor(new framework::Tensor()); int data_size = std::accumulate( weight_dims.begin(), weight_dims.end(), 1, std::multiplies()); + std::unique_ptr tmp_tensor(new framework::Tensor()); tmp_tensor->Resize({data_size}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); for (int i = 0; i < data_size; i++) { tmp_data[i] = data[i]; } engine_->SetWeights(weight_name, std::move(tmp_tensor)); - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; + if (std::is_integral::value) { + trt_dtype = nvinfer1::DataType::kINT32; + } + + TensorRTEngine::Weight weight{trt_dtype, static_cast(tmp_data), static_cast(data_size)}; nvinfer1::Dims trt_dims; @@ -449,44 +511,26 @@ class OpConverter { return const_layer->getOutput(0); } - // Create and add 1D constant float layer - nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, + // Create and add 1D constant float/int32 layer + template + nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, const std::string& weight_name = "", bool scalar = false) { std::unique_ptr tmp_tensor(new framework::Tensor()); int data_size = data.size(); tmp_tensor->Resize({data_size}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); for (int i = 0; i < data_size; i++) { tmp_data[i] = data[i]; } engine_->SetWeights(weight_name, std::move(tmp_tensor)); - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(tmp_data), - static_cast(data_size)}; - nvinfer1::Dims 
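The new pass above forwards an int8 dynamic range across Shuffle layers, since a reshape or transpose does not change a tensor's value range. A hedged, framework-free sketch of that propagation; LayerInfo and the scale map are stand-ins for the TRT network and quant_dynamic_range_.

#include <string>
#include <unordered_map>
#include <vector>

struct LayerInfo {
  bool is_shuffle;
  std::string input;   // name of the layer's input tensor
  std::string output;  // name of the layer's output tensor
};

// scales maps tensor name -> dynamic range; only tensors with a set range appear.
void PropagateShuffleScales(const std::vector<LayerInfo>& layers,
                            std::unordered_map<std::string, float>* scales) {
  for (const auto& l : layers) {
    if (!l.is_shuffle) continue;
    auto in = scales->find(l.input);
    if (in != scales->end() && scales->count(l.output) == 0) {
      (*scales)[l.output] = in->second;  // a shuffle keeps the value range unchanged
    }
  }
}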
input_shape; - input_shape.nbDims = scalar ? 0 : 1; - input_shape.d[0] = data_size; - auto const_layer = - TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); - return const_layer->getOutput(0); - } - - // Create and add 1D constant layer - nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, - const std::string& weight_name = "", - bool scalar = false) { - std::unique_ptr tmp_tensor(new framework::Tensor()); - int data_size = data.size(); - tmp_tensor->Resize({data_size}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = data[i]; + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; + if (std::is_integral::value) { + trt_dtype = nvinfer1::DataType::kINT32; } - engine_->SetWeights(weight_name, std::move(tmp_tensor)); - TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32, + TensorRTEngine::Weight weight{trt_dtype, static_cast(tmp_data), static_cast(data_size)}; nvinfer1::Dims input_shape; @@ -513,6 +557,35 @@ class OpConverter { return Add1DConstantLayer(tmp_data, weight_name, scalar); } + // For cases when input is not middle-tensor , but persistable tensor + // you should call this. + nvinfer1::ITensor* ConvertWeight2ITensor(const framework::Scope& scope, + const std::string& name) { + auto* var_v = scope.FindVar(name); + auto* var_t = var_v->GetMutable(); + auto weight = engine_->GetTrtWeight(name, *var_t); + + // Now we have create weights, then we need create a itensor + auto var_dims = var_t->dims(); + nvinfer1::Dims trt_in_shape; + trt_in_shape.nbDims = var_t->dims().size(); + for (int64_t i = 0; i < trt_in_shape.nbDims; i++) { + trt_in_shape.d[i] = var_dims[i]; + } + // In fact , this is not always right, because we can't determine if the 0th + // dimension is batch. Just for run chenqu's model + if (!engine_->with_dynamic_shape()) { + trt_in_shape.nbDims--; + for (int i = 0; i < trt_in_shape.nbDims; i++) { + trt_in_shape.d[i] = trt_in_shape.d[i + 1]; + } + } + nvinfer1::ILayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_in_shape, weight.get()); + engine_->SetITensor(name, layer->getOutput(0)); + return layer->getOutput(0); + } + void RreplenishLayerAndOutput( nvinfer1::ILayer* layer, const std::string& layer_type, diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index 78dd812e035db..5bfa1170fa109 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -81,7 +81,8 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto* temp_tensor = temp_var->GetMutable(); (*dims) = temp_tensor->dims(); - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); + auto* temp_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(var_name, *temp_tensor).get().values)); return temp_data; }; diff --git a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc index d09df4a4f2818..7b89b62dc8b66 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
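The constant-layer helpers are now templated and pick the TRT element type from the C++ type; a hedged sketch of that trait-based selection, with DType standing in for nvinfer1::DataType.

#include <type_traits>

enum class DType { kFLOAT, kINT32 };  // stand-in for nvinfer1::DataType

template <typename T>
constexpr DType TrtDTypeOf() {
  return std::is_integral<T>::value ? DType::kINT32 : DType::kFLOAT;
}

static_assert(TrtDTypeOf<int>() == DType::kINT32, "integral types map to kINT32");
static_assert(TrtDTypeOf<float>() == DType::kFLOAT, "floating types map to kFLOAT");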
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.h" namespace paddle { @@ -43,7 +44,8 @@ class PrelnResidualBiasOpConverter : public OpConverter { auto* temp_var = scope.FindVar(var_name); auto* temp_tensor = temp_var->GetMutable(); (*dims) = temp_tensor->dims(); - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); + auto* temp_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(var_name, *temp_tensor).get().values)); return temp_data; }; framework::DDim bias_dims, scale_dims, ele_bias_dims; diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc index 7824a9b23dc5e..bc9b317920755 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -49,7 +49,8 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { auto* temp_tensor = temp_var->GetMutable(); (*dims) = temp_tensor->dims(); - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); + auto* temp_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(var_name, *temp_tensor).get().values)); return temp_data; }; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 3195833c0e570..38b01eff6fb19 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -43,28 +43,21 @@ class PReluOpConverter : public OpConverter { auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]); auto* alpha_tensor = alpha_var->GetMutable(); + auto alpha_weight = + engine_->GetFp32TrtWeight(op_desc.Input("Alpha")[0], *alpha_tensor); + platform::CPUPlace cpu_place; - std::unique_ptr alpha_tensor_temp( - new framework::LoDTensor()); - alpha_tensor_temp->Resize(alpha_tensor->dims()); - paddle::framework::TensorCopySync( - *alpha_tensor, cpu_place, alpha_tensor_temp.get()); - float* alpha_data = alpha_tensor_temp->mutable_data(cpu_place); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic( - alpha_data, alpha_tensor_temp->numel(), mode, data_format); + static_cast(alpha_weight.get().values), + alpha_tensor->numel(), + mode, + data_format); layer = engine_->AddDynamicPlugin(&input, input_num, plugin); } else { #if IS_TRT_VERSION_GE(7000) - float* alpha_weight_data = - engine_->GetWeightCPUData(op_desc.Input("Alpha")[0], alpha_tensor); - TensorRTEngine::Weight alpha_weight{ - nvinfer1::DataType::kFLOAT, - static_cast(alpha_weight_data), - static_cast(alpha_tensor->numel())}; - nvinfer1::Dims dims; dims.nbDims = 0; // jump batch dim @@ -83,13 +76,13 @@ class PReluOpConverter : public OpConverter { engine_, ParametricReLU, *input, *alpha_layer_output); #else plugin::PReluPlugin* plugin = new plugin::PReluPlugin( - alpha_data, alpha_tensor_temp->numel(), mode, data_format); + static_cast(alpha_weight.get().values), + alpha_tensor->numel(), + mode, + data_format); layer = engine_->AddPlugin(&input, input_num, plugin); #endif } - // keep alpha tensor to avoid release it's memory - engine_->SetWeights(op_desc.Input("Alpha")[0], - std::move(alpha_tensor_temp)); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode); diff --git 
a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc index 00ee5503cc2e2..eec881eae8e18 100644 --- a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc @@ -35,14 +35,29 @@ class ReshapeOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + std::vector shape = BOOST_GET_CONST(std::vector, op_desc.GetAttr("shape")); int nbDims_num = shape.size(); nvinfer1::Dims reshape_dim; - if (engine_->with_dynamic_shape()) { // running the TRT Dynamic Shape mode - reshape_dim.nbDims = nbDims_num; - for (int i = 0; i < nbDims_num; ++i) { - reshape_dim.d[i] = shape[i]; + nvinfer1::ITensor* real_shape_tensor = nullptr; + std::vector concat_inputs; + bool one_input = false; + if (engine_->with_dynamic_shape()) { + if (op_desc.Inputs().find("ShapeTensor") != op_desc.Inputs().end() && + op_desc.Input("ShapeTensor").size() > 0) { + for (auto name : op_desc.Input("ShapeTensor")) + concat_inputs.push_back(engine_->GetITensor(name)); + real_shape_tensor = Concat(concat_inputs); + } else if (op_desc.Inputs().find("Shape") != op_desc.Inputs().end() && + op_desc.Input("Shape").size() > 0) { + real_shape_tensor = engine_->GetITensor(op_desc.Input("Shape")[0]); + } else { + reshape_dim.nbDims = nbDims_num; + for (int i = 0; i < nbDims_num; ++i) { + reshape_dim.d[i] = shape[i]; + } + one_input = true; } } else { // running the TRT Static Shape mode reshape_dim.nbDims = nbDims_num - 1; @@ -51,7 +66,10 @@ class ReshapeOpConverter : public OpConverter { } } auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); - layer->setReshapeDimensions(reshape_dim); + if (!engine_->with_dynamic_shape() || one_input) + layer->setReshapeDimensions(reshape_dim); + else + layer->setInput(1, *real_shape_tensor); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "reshape", {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 9ed72610dc179..cf95a4d9b55e0 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
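In dynamic-shape mode the reshape converter now prefers a runtime shape tensor over the static attribute; a hedged sketch of that three-way decision (in the real code the ShapeTensor inputs are concatenated and fed to the Shuffle layer via setInput(1, ...)).

#include <string>
#include <vector>

enum class ReshapeSource { kShapeTensorList, kShapeTensor, kStaticAttr };

// shape_tensor_inputs: names bound to the op's "ShapeTensor" slot (per-dim tensors).
// shape_inputs: names bound to the op's "Shape" slot (a single 1-D shape tensor).
ReshapeSource PickReshapeSource(const std::vector<std::string>& shape_tensor_inputs,
                                const std::vector<std::string>& shape_inputs) {
  if (!shape_tensor_inputs.empty()) return ReshapeSource::kShapeTensorList;  // concat, then setInput
  if (!shape_inputs.empty()) return ReshapeSource::kShapeTensor;             // use it directly
  return ReshapeSource::kStaticAttr;  // fall back to setReshapeDimensions(shape attr)
}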
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/utils.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h" namespace paddle { @@ -34,22 +36,6 @@ class SkipLayerNormOpConverter : public OpConverter { inputs.push_back(input1); inputs.push_back(input2); - auto get_persistable_data = [&](const std::string& arg_name, - framework::DDim* dims) -> float* { - std::string var_name = op_desc.Input(arg_name).front(); - auto* temp_var = scope.FindVar(var_name); - auto* temp_tensor = temp_var->GetMutable(); - (*dims) = temp_tensor->dims(); - - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); - return temp_data; - }; - - framework::DDim bias_dims, scale_dims; - auto* bias = get_persistable_data("Bias", &bias_dims); - auto* scale = get_persistable_data("Scale", &scale_dims); - int bias_size = phi::product(bias_dims); - int scale_size = phi::product(scale_dims); bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; @@ -57,6 +43,18 @@ class SkipLayerNormOpConverter : public OpConverter { engine_->tensorrt_transformer_posid() != "" && engine_->tensorrt_transformer_maskid() != ""; if (flag_varseqlen) { + auto GetWeight = + [&](const std::string& arg_name) -> TensorRTEngine::Weight { + std::string var_name = op_desc.Input(arg_name).front(); + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + auto weight = engine_->GetTrtWeight(var_name, *temp_tensor); + return weight; + }; + + auto bias_weight = GetWeight("Bias").get(); + auto scale_weight = GetWeight("Scale").get(); + if (engine_->with_interleaved()) { VLOG(4) << "fused skip_layernorm op: use_varseqlen and with_interleaved"; @@ -72,11 +70,14 @@ class SkipLayerNormOpConverter : public OpConverter { platform::errors::InvalidArgument( "fail to get creator of CustomSkipLayerNormPluginDynamic")); const std::vector fields{ - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"beta", + bias_weight.values, + GetPluginFieldType(bias_weight.type), + static_cast(bias_weight.count)}, { "gamma", - scale, - nvinfer1::PluginFieldType::kFLOAT32, - scale_size }}; + scale_weight.values, + GetPluginFieldType(scale_weight.type), + static_cast(scale_weight.count) }}; nvinfer1::PluginFieldCollection* pluginPtr = static_cast( malloc(sizeof(*pluginPtr) + @@ -119,8 +120,14 @@ class SkipLayerNormOpConverter : public OpConverter { const std::vector fields{ {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + {"beta", + bias_weight.values, + GetPluginFieldType(bias_weight.type), + static_cast(bias_weight.count)}, + {"gamma", + scale_weight.values, + GetPluginFieldType(scale_weight.type), + static_cast(scale_weight.count)}, }; nvinfer1::PluginFieldCollection* pluginPtr = static_cast( @@ -143,12 +150,29 @@ class SkipLayerNormOpConverter : public OpConverter { layer = plugin_layer; } } else { + auto GetFp32Weight = + [&](const std::string& arg_name) -> TensorRTEngine::Weight { + std::string var_name = op_desc.Input(arg_name).front(); + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + auto weight = engine_->GetFp32TrtWeight(var_name, *temp_tensor); + return weight; + }; + + auto bias_weight = 
GetFp32Weight("Bias").get(); + auto scale_weight = GetFp32Weight("Scale").get(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SkipLayerNormPluginDynamic* plugin = new plugin::SkipLayerNormPluginDynamic( - bias, scale, bias_size, scale_size, eps, with_fp16); + static_cast(bias_weight.values), + static_cast(scale_weight.values), + bias_weight.count, + scale_weight.count, + eps, + with_fp16); layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index bcf5e638126e2..4f85e4f07cc4e 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -166,6 +166,29 @@ class SliceOpConverter : public OpConverter { } layer = TRT_ENGINE_ADD_LAYER( engine_, Slice, *input, trt_start_dims, trt_size_dims, trt_step_dims); + nvinfer1::Dims real_trt_size_dims; + real_trt_size_dims.nbDims = 0; + + if (decrease_axises.size() > 0) { + for (size_t i = 0; i < decrease_axises.size(); i++) { + decrease_axises[i]--; + } + for (int i = 0; i < trt_size_dims.nbDims; i++) { + if (decrease_axises.end() != + std::find(decrease_axises.begin(), decrease_axises.end(), i)) + continue; + real_trt_size_dims.d[real_trt_size_dims.nbDims] = trt_size_dims.d[i]; + real_trt_size_dims.nbDims++; + } + if (real_trt_size_dims.nbDims == 0) { + real_trt_size_dims.nbDims = 1; + real_trt_size_dims.d[0] = 1; + } + auto reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); + reshape_layer->setReshapeDimensions(real_trt_size_dims); + layer = static_cast(reshape_layer); + } #else bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); diff --git a/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc index 6974e5a77006e..33801e969172a 100644 --- a/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc @@ -154,7 +154,10 @@ class SparseFcOpConverter : public OpConverter { } engine_->SetTensorDynamicRange(X, in_scale); } - weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t); + weight_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(op_desc.Input(w_name).front(), *Y_t) + .get() + .values)); PADDLE_ENFORCE_EQ( Y_t->dims().size(), @@ -321,7 +324,10 @@ class SparseFcOpConverter : public OpConverter { if (with_bias) { auto* b_v = scope.GetVar(op_desc.Input("Bias").front()); auto* b_t = b_v->GetMutable(); - bias_data = engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t); + bias_data = weight_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(op_desc.Input("Bias").front(), *b_t) + .get() + .values)); bias_num = b_t->numel(); } // Running the TRT Static Shape mode: x_num_col_dims-1 diff --git a/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc index 7f54f97d34933..4a8d15ef0dbac 100644 --- a/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc @@ -64,9 +64,11 @@ class SparseMultiheadMatMulOpConverter : public OpConverter { in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")); engine_->SetTensorDynamicRange(input, in_scale); } - weight_data = 
engine_->GetWeightCPUData(weight_name, weight_t); + weight_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(weight_name, *weight_t).get().values)); - float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t); + float* bias_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(bias_name, *bias_t).get().values)); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); memcpy( diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc index a2fe32b75f3de..d770ef5478abb 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc @@ -62,7 +62,7 @@ void IOConverterTester(const platform::DeviceContext& ctx) { TEST(EngineIOConverterTester, DefaultCPU) { platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); IOConverterTester(ctx); } diff --git a/paddle/fluid/inference/tensorrt/convert/utils.h b/paddle/fluid/inference/tensorrt/convert/utils.h new file mode 100644 index 0000000000000..1415e67fbeccd --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/utils.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/inference/tensorrt/engine.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +inline nvinfer1::PluginFieldType GetPluginFieldType(nvinfer1::DataType type) { + switch (type) { +#if IS_TRT_VERSION_GE(7000) + case nvinfer1::DataType::kBOOL: + return nvinfer1::PluginFieldType::kCHAR; +#endif + case nvinfer1::DataType::kFLOAT: + return nvinfer1::PluginFieldType::kFLOAT32; + case nvinfer1::DataType::kHALF: + return nvinfer1::PluginFieldType::kFLOAT16; + case nvinfer1::DataType::kINT32: + return nvinfer1::PluginFieldType::kINT32; + case nvinfer1::DataType::kINT8: + return nvinfer1::PluginFieldType::kINT8; + default: + return nvinfer1::PluginFieldType::kUNKNOWN; + } +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 82c51311a03d5..a4d373e83b355 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -19,15 +19,46 @@ limitations under the License. 
*/ #include +#include "NvInferRuntimeCommon.h" #include "cuda_runtime_api.h" // NOLINT #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace inference { namespace tensorrt { +void TensorRTEngine::Weight::SetDataType(phi::DataType type) { + nvinfer1::DataType nv_type; + switch (type) { + case phi::DataType::FLOAT32: + nv_type = nvinfer1::DataType::kFLOAT; + break; + case phi::DataType::FLOAT16: + nv_type = nvinfer1::DataType::kHALF; + break; + case phi::DataType::INT32: + nv_type = nvinfer1::DataType::kINT32; + break; + case phi::DataType::INT8: + nv_type = nvinfer1::DataType::kINT8; + break; +#if IS_TRT_VERSION_GE(7000) + case phi::DataType::BOOL: + nv_type = nvinfer1::DataType::kBOOL; + break; +#endif + default: + paddle::platform::errors::InvalidArgument( + "Paddle-TRT loads weighths failed, found not supported data type %s.", + type); + break; + } + w_.type = nv_type; +} + int TensorRTEngine::runtime_batch_ = 1; void TensorRTEngine::InitNetwork() { @@ -197,6 +228,18 @@ void TensorRTEngine::FreezeNetwork() { } } + // If model is mixed precision, then we should cast all float output to + // float32 precision. Otherwise, we can not confirm the output precision of + // the trt engine. + if (model_precision_ != phi::DataType::FLOAT32) { + for (int i = 0; i < network()->getNbOutputs(); ++i) { + network()->getOutput(i)->setAllowedFormats( + static_cast( + 1 << static_cast(nvinfer1::TensorFormat::kLINEAR))); + network()->getOutput(i)->setType(nvinfer1::DataType::kFLOAT); + } + } + if (use_dla_) { if (!enable_int8 && !enable_fp16) { LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " @@ -390,12 +433,76 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { return itensor_map_[name]; } +std::unordered_map + *TensorRTEngine::GetITensorMap() { + return &itensor_map_; +} + void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { runtime_batch_ = batch_size; } -float *TensorRTEngine::GetWeightCPUData(const std::string &name, - framework::Tensor *weight_tensor) { +TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight( + const std::string &name, const framework::Tensor &weight_tensor) { + static int name_suffix_counter = 0; + std::string name_suffix = std::to_string(name_suffix_counter); + std::string splitter = "__"; + std::string name_with_suffix = name + splitter + name_suffix; + platform::CPUPlace cpu_place; + PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), + 0, + platform::errors::AlreadyExists( + "The weight named %s is set into the weight map " + "twice in TRT OP converter.", + name_with_suffix)); + weight_map[name_with_suffix].reset(new framework::Tensor()); + weight_map[name_with_suffix]->Resize(weight_tensor.dims()); + + TensorRTEngine::Weight weight; + weight.SetCount(weight_tensor.numel()); + weight.SetDataType(nvinfer1::DataType::kFLOAT); + // weight_tensor.dims().; + + // if trt not support dtype, we need to cast to fp32. 
+ if (weight_tensor.dtype() == phi::DataType::BFLOAT16) { + framework::Tensor bf16_tensor; + bf16_tensor.clear(); + paddle::framework::TensorCopySync( + weight_tensor, platform::CPUPlace(), &bf16_tensor); + weight_map[name_with_suffix]->set_type( + paddle::experimental::DataType::FLOAT32); + weight_map[name_with_suffix]->Resize(weight_tensor.dims()); + auto *fp32_data = + weight_map[name_with_suffix]->mutable_data(platform::CPUPlace()); + auto *bf16_data = bf16_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor.numel(); i++) { + fp32_data[i] = static_cast(bf16_data[i]); + } + } else if (weight_tensor.dtype() == phi::DataType::FLOAT16) { + framework::Tensor fp16_tensor; + fp16_tensor.clear(); + paddle::framework::TensorCopySync( + weight_tensor, platform::CPUPlace(), &fp16_tensor); + weight_map[name_with_suffix]->set_type( + paddle::experimental::DataType::FLOAT32); + weight_map[name_with_suffix]->Resize(weight_tensor.dims()); + auto *fp32_data = + weight_map[name_with_suffix]->mutable_data(platform::CPUPlace()); + auto *fp16_data = fp16_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor.numel(); i++) { + fp32_data[i] = static_cast(fp16_data[i]); + } + } else { + paddle::framework::TensorCopySync( + weight_tensor, cpu_place, weight_map[name_with_suffix].get()); + } + weight.SetValues(weight_map[name_with_suffix]->data()); + name_suffix_counter += 1; + return weight; +} + +TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( + const std::string &name, const framework::Tensor &weight_tensor) { static int name_suffix_counter = 0; std::string name_suffix = std::to_string(name_suffix_counter); std::string splitter = "__"; @@ -407,14 +514,53 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name, "The weight named %s is set into the weight map " "twice in TRT OP converter.", name_with_suffix)); + weight_map[name_with_suffix].reset(new framework::Tensor()); - weight_map[name_with_suffix]->Resize(weight_tensor->dims()); - paddle::framework::TensorCopySync( - *weight_tensor, cpu_place, weight_map[name_with_suffix].get()); - float *weight_data = - weight_map[name_with_suffix]->mutable_data(cpu_place); + weight_map[name_with_suffix]->Resize(weight_tensor.dims()); + + TensorRTEngine::Weight weight; + weight.SetCount(weight_tensor.numel()); + + // if trt not support dtype, we need to cast to fp32. 
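GetFp32TrtWeight widens half-precision weights to float before TRT sees them. A hedged standalone sketch of the bfloat16 case: Paddle's bfloat16 type converts implicitly, while this version spells out the bit-level widening (bfloat16 keeps the high 16 bits of an IEEE-754 float32).

#include <cstdint>
#include <cstring>
#include <vector>

inline float Bfloat16ToFloat(uint16_t v) {
  uint32_t bits = static_cast<uint32_t>(v) << 16;  // restore the full float32 layout
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}

std::vector<float> WidenBf16ToFp32(const std::vector<uint16_t>& bf16) {
  std::vector<float> fp32(bf16.size());
  for (size_t i = 0; i < bf16.size(); ++i) fp32[i] = Bfloat16ToFloat(bf16[i]);
  return fp32;
}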
+ if (weight_tensor.dtype() == phi::DataType::BFLOAT16) { + framework::Tensor bf16_tensor; + bf16_tensor.clear(); + paddle::framework::TensorCopySync( + weight_tensor, platform::CPUPlace(), &bf16_tensor); + weight_map[name_with_suffix]->set_type( + paddle::experimental::DataType::FLOAT32); + auto *fp32_data = + weight_map[name_with_suffix]->mutable_data(platform::CPUPlace()); + auto *bf16_data = bf16_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor.numel(); i++) { + fp32_data[i] = static_cast(bf16_data[i]); + } + weight.SetDataType(phi::DataType::FLOAT32); + weight.SetValues(fp32_data); + } else if (weight_tensor.dtype() == phi::DataType::INT64) { + framework::Tensor int64_tensor; + int64_tensor.clear(); + paddle::framework::TensorCopySync( + weight_tensor, platform::CPUPlace(), &int64_tensor); + weight_map[name_with_suffix]->set_type( + paddle::experimental::DataType::INT32); + auto *int32_data = + weight_map[name_with_suffix]->mutable_data(platform::CPUPlace()); + auto *int64_data = int64_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor.numel(); i++) { + int32_data[i] = int64_data[i]; + } + weight.SetDataType(phi::DataType::FLOAT32); + weight.SetValues(int32_data); + } else { + paddle::framework::TensorCopySync( + weight_tensor, cpu_place, weight_map[name_with_suffix].get()); + weight.SetDataType(weight_tensor.dtype()); + weight.SetValues(weight_map[name_with_suffix]->data()); + } + name_suffix_counter += 1; - return weight_data; + return weight; } int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 8d28d1c05ea14..73506eb8f6244 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -25,6 +25,8 @@ limitations under the License. */ #include #include +#include "NvInferRuntimeCommon.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" @@ -34,6 +36,7 @@ limitations under the License. 
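GetTrtWeight keeps the original dtype when TRT supports it and otherwise converts; int64 weights are narrowed to int32 because the engine consumes 32-bit integer weights. A hedged sketch of that narrowing (the converter above likewise does a plain element-wise assignment).

#include <cstdint>
#include <vector>

std::vector<int32_t> NarrowInt64ToInt32(const std::vector<int64_t>& src) {
  std::vector<int32_t> dst(src.size());
  for (size_t i = 0; i < src.size(); ++i) {
    dst[i] = static_cast<int32_t>(src[i]);  // values outside int32 range will wrap
  }
  return dst;
}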
*/ #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/data_type.h" #include "paddle/utils/any.h" namespace paddle { @@ -187,6 +190,14 @@ class TensorRTEngine { } const nvinfer1::Weights& get() { return w_; } + void SetDataType(nvinfer1::DataType type) { w_.type = type; } + + void SetDataType(phi::DataType type); + + void SetValues(const void* values) { w_.values = values; } + + void SetCount(int64_t num) { w_.count = num; } + std::vector dims; private: @@ -203,6 +214,7 @@ class TensorRTEngine { const ShapeMapType max_input_shape = {}, const ShapeMapType optim_input_shape = {}, bool disable_trt_plugin_fp16 = false, + phi::DataType model_precision = phi::DataType::FLOAT32, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), @@ -213,6 +225,7 @@ class TensorRTEngine { max_input_shape_(max_input_shape), optim_input_shape_(optim_input_shape), disable_trt_plugin_fp16_(disable_trt_plugin_fp16), + model_precision_(model_precision), logger_(logger) { if (min_input_shape_.size() != 0 && max_input_shape_.size() != 0 && optim_input_shape_.size() != 0) { @@ -268,6 +281,7 @@ class TensorRTEngine { void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); // Get an ITensor called name. nvinfer1::ITensor* GetITensor(const std::string& name); + std::unordered_map* GetITensorMap(); nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::IExecutionContext* context() { @@ -406,8 +420,21 @@ class TensorRTEngine { quant_dynamic_range_[tensor] = range; } - float* GetWeightCPUData(const std::string& name, - framework::Tensor* weight_tensor); + // Get fp32 trt weight. If src weight is not fp32, we will cast. + Weight GetFp32TrtWeight(const std::string& name, + const framework::Tensor& weight_tensor); + + // if the src weight type is fp16, then return fp16 trt weight, etc. + Weight GetTrtWeight(const std::string& name, + const framework::Tensor& weight_tensor); + + float GetTensorDynamicRange(nvinfer1::ITensor* tensor) { + return quant_dynamic_range_[tensor]; + } + + bool DynamicRangeIsSet(nvinfer1::ITensor* tensor) { + return quant_dynamic_range_.count(tensor); + } // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. 
@@ -424,7 +451,14 @@ class TensorRTEngine { static int suffix_counter = 0; std::string suffix = std::to_string(suffix_counter); std::string splitter = "__"; - weight_map[w_name + splitter + suffix] = std::move(w_tensor); + std::string name_with_suffix = w_name + splitter + suffix; + PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), + 0, + platform::errors::AlreadyExists( + "The weight named %s is set into the weight map " + "twice in TRT OP converter.", + name_with_suffix)); + weight_map[name_with_suffix] = std::move(w_tensor); suffix_counter += 1; } @@ -652,6 +686,7 @@ class TensorRTEngine { ShapeMapType max_input_shape_; ShapeMapType optim_input_shape_; bool disable_trt_plugin_fp16_{false}; + phi::DataType model_precision_{phi::DataType::FLOAT32}; bool use_varseqlen_{false}; bool use_dla_{false}; int dla_core_{0}; @@ -739,6 +774,7 @@ class TRTEngineManager { const std::map> max_input_shape = {}, const std::map> optim_input_shape = {}, bool disable_trt_plugin_fp16 = false, + phi::DataType model_precision = phi::DataType::FLOAT32, nvinfer1::ILogger& logger = NaiveLogger::Global()) { auto* p = new TensorRTEngine(max_batch, max_workspace, @@ -749,6 +785,7 @@ class TRTEngineManager { max_input_shape, optim_input_shape, disable_trt_plugin_fp16, + model_precision, logger); engines_[name].reset(p); return p; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 1ee748afe507c..894e44fda9496 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -169,6 +169,7 @@ struct SimpleOpTypeSetTeller : public Teller { "transformer_input_convert", "recover_padding", "remove_padding", + "fill_constant", "squeeze2", "unsqueeze2"}; std::unordered_set teller_set{ @@ -274,8 +275,10 @@ struct SimpleOpTypeSetTeller : public Teller { "transformer_input_convert", "recover_padding", "remove_padding", + "fill_constant", "squeeze2", - "unsqueeze2"}; + "unsqueeze2", + "fused_token_prune"}; }; bool OpTeller::Tell(const framework::ir::Node* node, @@ -325,6 +328,28 @@ bool OpTeller::Tell(const framework::ir::Node* node, #endif } + // In static shape mode in TRT, we can't allow that op's input is a + // 1D-tensor So we filter it here. Some op like elementwise having "Y" too, + // but that is dealt with in the specified op, here just the common case + if (!with_dynamic_shape) { + std::string X_name; + auto inputs = desc.Inputs(); + if (inputs.count("X")) { + X_name = desc.Input("X")[0]; + } else if (inputs.count("Input")) { + X_name = desc.Input("Input")[0]; + } + auto* block = desc.Block(); + if (block) { + auto* x_var_desc = block->FindVar(X_name); + // Can't get feed op's TensorDesc + if (op_type != "feed" && x_var_desc && !x_var_desc->Persistable()) { + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() == 1) return false; + } + } + } + if (op_type == "pool2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); @@ -1217,14 +1242,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, if (desc.HasAttr("decrease_axis")) { std::vector decrease_axis = BOOST_GET_CONST(std::vector, desc.GetAttr("decrease_axis")); - if (with_dynamic_shape) { - if (decrease_axis.size() > 1) { - return false; - } - } else { - if (decrease_axis.size() > 0) { - VLOG(3) << "Invalid slice decrease_axis. 
decrease_axis.size() > 0" - "is not supported in TensorRT"; + if (!with_dynamic_shape) { + if (decrease_axis.end() != + std::find(decrease_axis.begin(), decrease_axis.end(), 0)) { return false; } } @@ -1314,14 +1334,19 @@ bool OpTeller::Tell(const framework::ir::Node* node, auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); const auto x_shape = x_var_desc->GetShape(); const auto y_shape = y_var_desc->GetShape(); - if (x_shape.size() == 1 && y_shape.size() == 1) { - VLOG(3) << "Now trt may not support two 1d tensor elementwise op."; + + // The case when x_shape.size() == 1 is dealt with in common case + if (!with_dynamic_shape && (!y_var_desc->Persistable()) && + y_shape.size() == 1) { + VLOG(3) << "Static shape in trt not support y is a 1D intermediate " + "tensor in " + "elementwise op."; return false; } - if (x_var_desc->Persistable()) { - VLOG(3) << "Input X is a parameter which is not supported for " - "elementwise_add/elementwise_mul in tensorrt, swap x and " - "y will work"; + if (x_var_desc->Persistable() && !with_dynamic_shape) { + VLOG(3) + << "Input X is a parameter which is not supported for " + "elementwise in tensorrt's static shape, swap x and y will work"; return false; } } @@ -1425,6 +1450,27 @@ bool OpTeller::Tell(const framework::ir::Node* node, } } + if (op_type == "fill_constant") { + auto fill_constant_inputs = desc.Inputs(); + if (fill_constant_inputs.find("ValueTensor") != + fill_constant_inputs.end()) { + if (desc.Input("ValueTensor").size()) return false; + } + if (fill_constant_inputs.find("ShapeTensor") != + fill_constant_inputs.end()) { + if (desc.Input("ShapeTensor").size()) return false; + } + if (fill_constant_inputs.find("ShapeTensorList") != + fill_constant_inputs.end()) { + if (desc.Input("ShapeTensorList").size()) return false; + } + int dtype = BOOST_GET_CONST(int, desc.GetAttr("dtype")); + // only support int32, int64, float32 + if (!(dtype == 2 || dtype == 3 || dtype == 5)) { + return false; + } + } + if (op_type == "instance_norm") { if (with_dynamic_shape) { VLOG(3) << "trt instance_norm op does not support dynamic shape "; @@ -1778,6 +1824,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "reshape" || op_type == "reshape2") { + if (with_dynamic_shape) { + return true; + } if (!desc.HasAttr("shape")) { return false; } @@ -1917,6 +1966,21 @@ bool OpTeller::Tell(const framework::ir::Node* node, } #endif + // conv2d_transpose, conv3d_transpose, depthwise_conv2d_transpose + if (op_type.find("d_transpose") > 0) { + // trt doen't support output_padding, + // output_padding is set when stride > 1 + if (desc.HasAttr("output_padding")) { + const std::vector output_padding = + BOOST_GET_CONST(std::vector, desc.GetAttr("output_padding")); + if (output_padding.size() > 0) { + int max_padding = + *std::max_element(output_padding.begin(), output_padding.end()); + if (max_padding > 0) return false; + } + } + } + if (op_type == "conv3d" || op_type == "conv3d_transpose") { if (desc.HasAttr("padding_algorithm")) { std::string padding_algorithm = @@ -1997,6 +2061,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "cast") { +// trt 6015 result in Windows ppyolo_mbv3 TRT fp32 diff +#if !IS_TRT_VERSION_GE(7000) + return false; +#endif int in_dtype = BOOST_GET_CONST(int, desc.GetAttr("in_dtype")); int out_dtype = BOOST_GET_CONST(int, desc.GetAttr("out_dtype")); if ((in_dtype == 4 || in_dtype == 5) && out_dtype == 4) { diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt 
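The teller now rejects conv*_transpose ops that carry a positive output_padding, which the TRT deconvolution layer cannot express; a hedged sketch of that predicate.

#include <algorithm>
#include <vector>

// Returns true when TRT can take the op, i.e. no positive output_padding entry.
bool OutputPaddingSupported(const std::vector<int>& output_padding) {
  if (output_padding.empty()) return true;
  return *std::max_element(output_padding.begin(), output_padding.end()) <= 0;
}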
b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index cd65316fb4a63..90344fc0adae8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -29,7 +29,8 @@ list( remove_padding_plugin.cu recover_padding_plugin.cu c_allreduce_op_plugin.cu - preln_residual_bias_plugin.cu) + preln_residual_bias_plugin.cu + fused_token_prune_op_plugin.cu) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND TRT_FILES spmm_plugin.cu) @@ -44,3 +45,10 @@ nv_test( test_split_plugin SRCS test_split_plugin.cc DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) + +if(NOT WIN32) + nv_test( + test_fused_token_prune_plugin + SRCS test_fused_token_prune_plugin.cc + DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) +endif() diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu new file mode 100644 index 0000000000000..c10ab7277e788 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -0,0 +1,531 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "cub/cub.cuh" + +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" +#include "paddle/fluid/operators/fused_token_prune_op.cu.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) + +template +__global__ void ElementwiseMask(const T* a, + const T* b, + T* res, + int num_elements) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_elements) return; + const T zero = 0; + res[tid] = b[tid] >= zero ? 
a[tid] : zero; +#endif +} + +template +__global__ void FillZero(T* data, int len) { + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= len) return; + const T zero = 0; + data[tid] = zero; +} + +__global__ void FillIndex(int32_t* indices, int num_raws, int num_cols) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_raws * num_cols) return; + + int col = tid % num_cols; + int raw = tid / num_cols; + + indices[tid] = col; +} + +template +__global__ void MaximumFirst(T* mat, int num_raws, int num_cols, T max_value) { + auto raw = blockIdx.x * blockDim.x + threadIdx.x; + if (raw >= num_raws) return; + mat[raw * num_cols] = max_value; +} + +__global__ void FillOffsets(int* offsets, int num_raws, int num_cols) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid > num_raws) return; + + offsets[tid] = tid * num_cols; +} + +template +__global__ void Slice( + const T* src, T* dst, int num_raws, int src_num_cols, int dst_num_cols) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_raws * dst_num_cols) return; + int raw = tid / dst_num_cols; + int col = tid % dst_num_cols; + dst[tid] = src[raw * src_num_cols + col]; +} + +template +__global__ void ReduceSum2( + const T* src, T* dst, int bsz, int nb_head, int max_seq_len) { + int tid = threadIdx.x; + int bid = blockIdx.x; + int num_blocks_per_head = ((max_seq_len / blockDim.x) * max_seq_len); + int batch = bid / (nb_head * num_blocks_per_head); + int col = bid % max_seq_len; + int head = (bid / num_blocks_per_head) % nb_head; + + extern __shared__ T res_float[]; + res_float[tid] = + src[batch * (nb_head * max_seq_len * max_seq_len) + + head * (max_seq_len * max_seq_len) + col + tid * max_seq_len]; + __syncthreads(); + + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { + if (tid < offset) { + res_float[tid] += res_float[tid + offset]; + } + __syncthreads(); + if (offset % 2 == 1 && tid == offset - 2) { + res_float[tid] += res_float[tid + 1]; + } + } + + if (tid == 0) { + auto* dst_addr = dst + batch * max_seq_len + col; + atomicAdd(dst_addr, res_float[0]); + } +} + +template <> +__global__ void ReduceSum2( + const half* src, half* dst, int bsz, int nb_head, int max_seq_len) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int tid = threadIdx.x; + int bid = blockIdx.x; + int num_blocks_per_head = ((max_seq_len / blockDim.x) * max_seq_len); + int batch = bid / (nb_head * num_blocks_per_head); + int col = bid % max_seq_len; + int head = (bid / num_blocks_per_head) % nb_head; + + extern __shared__ half res_half[]; + res_half[tid] = + src[batch * (nb_head * max_seq_len * max_seq_len) + + head * (max_seq_len * max_seq_len) + col + tid * max_seq_len]; + __syncthreads(); + + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { + if (tid < offset) { + res_half[tid] += res_half[tid + offset]; + } + __syncthreads(); + if (offset % 2 == 1 && tid == offset - 2) { + res_half[tid] += res_half[tid + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + platform::fastAtomicAdd( + reinterpret_cast(dst), + static_cast(batch * max_seq_len + col), + static_cast(bsz * max_seq_len), + static_cast(res_half[0])); + } +#endif +} + +template +__global__ void TakeAlongAxis(const T* src, + T* dst, + int32_t* indices, + int num_raws, + int src_num_cols, + int dst_num_cols, + int num_elements) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_raws * dst_num_cols) return; + + int raw = tid / dst_num_cols; + int col = tid % dst_num_cols; + for (int i = 0; i < num_elements; ++i) { + 
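    // Each thread copies one selected token: all num_elements components of the
    // source column chosen by indices[tid] (within row `raw`) are written into
    // output slot tid, i.e. a gather along the sequence axis.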
dst[tid * num_elements + i] = + *(src + (raw * src_num_cols + indices[tid]) * num_elements + i); + } +} + +nvinfer1::DimsExprs FusedTokenPrunePluginDynamic::getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { + auto x_dims = inputs[1], new_mask_dims = inputs[3]; + if (output_index == 0) { + nvinfer1::DimsExprs ret = x_dims; + ret.d[1] = new_mask_dims.d[2]; + return ret; + } else { + nvinfer1::DimsExprs ret; + ret.nbDims = 2; + ret.d[0] = new_mask_dims.d[0]; + ret.d[1] = new_mask_dims.d[2]; + return ret; + } +} + +bool FusedTokenPrunePluginDynamic::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, + platform::errors::InvalidArgument( + "The input of swish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, + nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, + nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); +#else + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); +#endif + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } else if (pos <= 4) { + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + return in.type == prev.type && in.format == prev.format; + } else { + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + return in.type == nvinfer1::DataType::kINT32 && in.format == prev.format; + } +} + +nvinfer1::DataType FusedTokenPrunePluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + if (index == 0) { + return input_types[1]; + } else if (index == 1) { + return nvinfer1::DataType::kINT32; + } +} + +size_t FusedTokenPrunePluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, + int nb_inputs, + const nvinfer1::PluginTensorDesc* outputs, + int nb_outputs) const TRT_NOEXCEPT { + auto attn_dims = inputs[0].dims; + auto x_dims = inputs[1].dims; + auto new_mask_dims = inputs[3].dims; + auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1], + max_seq_len = attn_dims.d[2]; + + int slimmed_x_len = new_mask_dims.d[2]; + int total = bsz * nb_head * max_seq_len * max_seq_len; + size_t size = total * sizeof(float); + size += bsz * max_seq_len * sizeof(float); + size += bsz * max_seq_len * sizeof(int32_t); + size += bsz * max_seq_len * sizeof(float); + size += bsz * max_seq_len * sizeof(int32_t); + size += (bsz + 1) * sizeof(int); + size += bsz * slimmed_x_len * sizeof(int32_t); + return size; +} + +template +int FusedTokenPrunePluginDynamic::enqueueImpl( + const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace_ptr, + cudaStream_t stream, + int device_id, + T max_value) { + // Dims + auto attn_dims = input_desc[0].dims; + auto x_dims = input_desc[1].dims; + auto new_mask_dims = input_desc[3].dims; + + auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1], + max_seq_len = attn_dims.d[2]; + auto c = 
x_dims.d[2]; + auto slimmed_x_len = new_mask_dims.d[2]; + + // Inputs + const T* attn_data = static_cast(inputs[0]); + const T* x_data = static_cast(inputs[1]); + const T* mask_data = static_cast(inputs[2]); + + // Outputs + T* output_data = static_cast(outputs[0]); + int32_t* output_indices_data = static_cast(outputs[1]); + + int total = bsz * nb_head * max_seq_len * max_seq_len; + int block = operators::ComputeBlockSize(max_seq_len); + int grid = operators::CeilDivide(total, block); + + // Workspace for intermediate variable + char* workspace = static_cast(workspace_ptr); + T* attn_tmp_data = reinterpret_cast(workspace); + size_t offset = total * sizeof(T); + T* attn_accu_data = reinterpret_cast(workspace + offset); + offset += bsz * max_seq_len * sizeof(T); + int32_t* attn_accu_indices_data = + reinterpret_cast(workspace + offset); + offset += bsz * max_seq_len * sizeof(int32_t); + T* sort_attn_accu_data = reinterpret_cast(workspace + offset); + offset += bsz * max_seq_len * sizeof(T); + int32_t* sort_attn_accu_indices_data = + reinterpret_cast(workspace + offset); + offset += bsz * max_seq_len * sizeof(int32_t); + int* offsets_data = reinterpret_cast(workspace + offset); + offset += (bsz + 1) * sizeof(int); + int32_t* slimmed_sort_attn_accu_indices_data = + reinterpret_cast(workspace + offset); + + // 1. Filter attn by mask + ElementwiseMask + <<>>(attn_data, mask_data, attn_tmp_data, total); + + total = bsz * max_seq_len; + block = operators::ComputeBlockSize(max_seq_len); + grid = operators::CeilDivide(total, block); + FillZero<<>>(attn_accu_data, total); + + // 2. Reduce sum + total = bsz * nb_head * max_seq_len * max_seq_len; + int block_tmp = max_seq_len; + while (block_tmp > 1024) + block_tmp /= 2; // if max seq len > 1024, it must be 2^n + block = + block_tmp; // make sure max_seq_len is an integral multiple of block_size + grid = operators::CeilDivide(total, block); + ReduceSum2<<>>( + attn_tmp_data, attn_accu_data, bsz, nb_head, max_seq_len); + + // 3. Prepare token indices + total = bsz * max_seq_len; + block = operators::ComputeBlockSize(max_seq_len); + grid = operators::CeilDivide(total, block); + + FillIndex<<>>( + attn_accu_indices_data, bsz, max_seq_len); + + // 4. Sort token indices by attn + if (keep_first_token_) { + MaximumFirst + <<>>(attn_accu_data, bsz, max_seq_len, max_value); + } + size_t temp_storage_bytes = -1; + int num_items = bsz * max_seq_len; + int num_segments = bsz; + FillOffsets<<>>(offsets_data, bsz, max_seq_len); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortPairsDescending( + nullptr, + temp_storage_bytes, + attn_accu_data, + sort_attn_accu_data, + attn_accu_indices_data, + sort_attn_accu_indices_data, + num_items, + num_segments, + offsets_data, + offsets_data + 1, + 0, + sizeof(T) * 8, + stream)); + int64_t temp_size = temp_storage_bytes; + framework::Tensor temp_storage; + auto* temp_storage_data = temp_storage.mutable_data( + {temp_size}, platform::CUDAPlace(device_id)); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortPairsDescending( + temp_storage_data, + temp_storage_bytes, + attn_accu_data, + sort_attn_accu_data, + attn_accu_indices_data, + sort_attn_accu_indices_data, + num_items, + num_segments, + offsets_data, + offsets_data + 1, + 0, + sizeof(T) * 8, + stream)); + // 5. 
Slice + total = bsz * slimmed_x_len; + block = operators::ComputeBlockSize(slimmed_x_len); + grid = operators::CeilDivide(total, block); + + Slice + <<>>(sort_attn_accu_indices_data, + slimmed_sort_attn_accu_indices_data, + bsz, + max_seq_len, + slimmed_x_len); + + if (keep_order_) { + // 6. reorder + num_items = bsz * slimmed_x_len; + FillOffsets<<>>(offsets_data, bsz, slimmed_x_len); + temp_storage_bytes = -1; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortKeys( + nullptr, + temp_storage_bytes, + slimmed_sort_attn_accu_indices_data, + output_indices_data, + num_items, + num_segments, + offsets_data, + offsets_data + 1, + 0, + sizeof(int32_t) * 8, + stream)); + + temp_size = temp_storage_bytes; + temp_storage.Resize({temp_size}); + temp_storage_data = + temp_storage.mutable_data(platform::CUDAPlace(device_id)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortKeys( + temp_storage_data, + temp_storage_bytes, + slimmed_sort_attn_accu_indices_data, + output_indices_data, + num_items, + num_segments, + offsets_data, + offsets_data + 1, + 0, + sizeof(int32_t) * 8, + stream)); + + TakeAlongAxis<<>>(x_data, + output_data, + output_indices_data, + bsz, + max_seq_len, + slimmed_x_len, + c); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(output_indices_data, + slimmed_sort_attn_accu_indices_data, + bsz * slimmed_x_len * sizeof(int32_t), + cudaMemcpyDeviceToDevice)); + TakeAlongAxis + <<>>(x_data, + output_data, + slimmed_sort_attn_accu_indices_data, + bsz, + max_seq_len, + slimmed_x_len, + c); + } + + return cudaGetLastError() != cudaSuccess; +} + +int FusedTokenPrunePluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + auto input_type = input_desc[0].type; + auto attn_dims = input_desc[0].dims; + auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1], + max_seq_len = attn_dims.d[2]; + int device_id; + cudaGetDevice(&device_id); + + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp32"; + + float max = std::numeric_limits::max(); + + return enqueueImpl(input_desc, + output_desc, + inputs, + outputs, + workspace, + stream, + device_id, + max); + + } else if (input_type == nvinfer1::DataType::kHALF) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE + VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp16"; + + half max = 65504.0; + + return enqueueImpl(input_desc, + output_desc, + inputs, + outputs, + workspace, + stream, + device_id, + max); + +#else + PADDLE_THROW(platform::errors::Fatal( + "The Ernie(Bert) TensorRT Plugin should be " + "complied with CUDA version >= 10.0 when running with fp16. 
" + "Please recomplie it or try to use fp32 by set " + "config.SetTRTDynamicShapeInfo(min_input_shape, " + "max_input_shape, opt_input_shape, true")); +#endif + } else { + PADDLE_THROW( + platform::errors::Fatal("The FusedTokenPrune TRT Plugin's input type " + "should be float or half.")); + } +} + +#endif +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h new file mode 100644 index 0000000000000..fcd91522ca39c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h @@ -0,0 +1,159 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) + +class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT { + public: + explicit FusedTokenPrunePluginDynamic(bool with_fp16, + bool keep_first_token, + bool keep_order) + : keep_first_token_(keep_first_token), keep_order_(keep_order) { + with_fp16_ = with_fp16; + } + FusedTokenPrunePluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &with_fp16_); + DeserializeValue(&serial_data, &serial_length, &keep_first_token_); + DeserializeValue(&serial_data, &serial_length, &keep_order_); + } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + return new FusedTokenPrunePluginDynamic( + with_fp16_, keep_first_token_, keep_order_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "fused_token_prune_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 2; } + int initialize() TRT_NOEXCEPT override { return 0; } + + size_t getSerializationSize() const TRT_NOEXCEPT override { + return SerializedSize(with_fp16_) + SerializedSize(keep_first_token_) + + SerializedSize(keep_order_); + } + void serialize(void* buffer) const TRT_NOEXCEPT override { + SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, keep_first_token_); + SerializeValue(&buffer, keep_order_); + } + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT + TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nb_inputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nb_outputs) TRT_NOEXCEPT override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nb_inputs, + const 
nvinfer1::PluginTensorDesc* outputs, + int nb_outputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const + TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + template + int enqueueImpl(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream, + int device_id, + T max_value); + bool keep_first_token_; + bool keep_order_; +}; + +class FusedTokenPrunePluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + FusedTokenPrunePluginDynamicCreator() {} + const char* getPluginName() const TRT_NOEXCEPT override { + return "fused_token_prune_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) + TRT_NOEXCEPT override { + auto plugin = new FusedTokenPrunePluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const TRT_NOEXCEPT override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(FusedTokenPrunePluginDynamicCreator); + +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 69c317781ef57..21eb89d135efa 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -184,11 +184,6 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( platform::errors::InvalidArgument( "The Split plugin should be only one input.")); - PADDLE_ENFORCE_EQ( - inputs[0].d[1]->isConstant(), - true, - platform::errors::InvalidArgument("The channel dimension should be " - "static, but we found it's dynamic.")); nvinfer1::DimsExprs output(inputs[0]); if (is_global_ && !adaptive_) { output.d[2] = expr_builder.constant(1); diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index 4ef160d2e04b8..e77f12769c0f3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -152,8 +152,8 @@ __global__ void StackKernel(const T* const* input, T* output, int num_stack, int base_unit) { - int stack_id = blockIdx.x; - int lead_id = blockIdx.y; + int stack_id = blockIdx.y; + int lead_id = blockIdx.x; for (int i = threadIdx.x; i < base_unit; i += blockDim.x) { 
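    // One block handles a single (lead_id, stack_id) pair; its threads stride
    // across the base_unit contiguous elements of that slice of the output.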
output[lead_id * num_stack * base_unit + stack_id * base_unit + i] = @@ -201,7 +201,8 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, stream); const int num_stacks = out_dims.d[axis_]; - dim3 num_blocks(num_stacks, lead_unit); + // lead_unit may be very large, so make it be blockIdx.x + dim3 num_blocks(lead_unit, num_stacks); const int num_threads = 256; auto infer_type = input_desc[0].type; diff --git a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc new file mode 100644 index 0000000000000..131ce46d89a66 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +TEST(fused_token_prune_op_plugin, test_plugin) { + FusedTokenPrunePluginDynamic plugin( + true, /*keep_first_token*/ false, /*keep_order*/ true); + plugin.configurePlugin(nullptr, 4, nullptr, 2); + plugin.initialize(); + plugin.getPluginType(); + plugin.getNbOutputs(); + auto clone_plugin = plugin.clone(); + clone_plugin->destroy(); + size_t buf_size = plugin.getSerializationSize(); + std::vector buf(buf_size); + plugin.serialize(buf.data()); +} + +TEST(fused_token_prune_op_plugin, test_plugin_creater) { + FusedTokenPrunePluginDynamicCreator creator; + creator.getFieldNames(); + creator.createPlugin("test", nullptr); + creator.setPluginNamespace("test"); +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 8f20ffb5e6b8c..8d95bbea5b89f 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -18,9 +18,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/phi/common/data_type.h" #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) #include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" #endif +#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/float16.h" @@ -66,6 +68,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test { max_input_shape, optim_input_shape, false, + phi::DataType::FLOAT32, NaiveLogger::Global()); engine_->InitNetwork(); } @@ -193,6 +196,197 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { return; } +class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { + protected: + void SetUp() override { + ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CUDAPlace(0)) + .get()); + ctx_->SetPinnedAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPinnedPlace()) + .get()); + ctx_->PartialInitWithAllocator(); + + std::map> min_input_shape = { + {"attn", {4, 1, 4, 4}}, + {"x", {4, 4, 1}}, + {"mask", {4, 1, 4, 4}}, + {"new_mask", {4, 1, 2, 2}}}; + std::map> max_input_shape = { + {"attn", {4, 1, 4, 4}}, + {"x", {4, 4, 1}}, + {"mask", {4, 1, 4, 4}}, + {"new_mask", {4, 1, 2, 2}}}; + std::map> optim_input_shape = { + {"attn", {4, 1, 4, 4}}, + {"x", {4, 4, 1}}, + {"mask", {4, 1, 4, 4}}, + {"new_mask", {4, 1, 2, 2}}}; + + engine_ = new TensorRTEngine(16, + 1 << 10, + AnalysisConfig::Precision::kHalf, + nullptr, + 0, + min_input_shape, + max_input_shape, + optim_input_shape, + false, + phi::DataType::FLOAT32, + NaiveLogger::Global()); + engine_->InitNetwork(); + } + + void TearDown() override { + if (engine_) { + delete engine_; + engine_ = nullptr; + } + } + + void PrepareInputOutput(const std::vector> inputs, + std::vector> output_shapes) { + LOG(INFO) << "PrepareInputOutput"; + int num_inputs = inputs.size(); + int num_outputs = output_shapes.size(); + inputs_.resize(num_inputs); + outputs_.resize(num_outputs); + for (int i = 0; i < num_inputs; ++i) { + paddle::framework::TensorFromVector(inputs[i], *ctx_, &inputs_[i]); + } + for (int i = 0; i < num_outputs; ++i) { + outputs_[i].Resize(phi::make_ddim(output_shapes[i])); + } + } + + void GetOutput(std::vector &slimmed_x, // NOLINT + std::vector &cls_inds) { // NOLINT + paddle::framework::TensorToVector(outputs_[0], *ctx_, &slimmed_x); + paddle::framework::TensorToVector(outputs_[1], *ctx_, &cls_inds); + } + + protected: + std::vector inputs_; + std::vector outputs_; + TensorRTEngine *engine_; + platform::CUDADeviceContext *ctx_; +}; + +TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { +#if IS_TRT_VERSION_GE(8000) + auto *attn = engine_->DeclareInput( + "attn", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 1, 4, 4}); + auto *x = engine_->DeclareInput( + "x", nvinfer1::DataType::kHALF, nvinfer1::Dims3{-1, 4, 1}); + auto *mask = engine_->DeclareInput( + "mask", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 1, 
4, 4}); + auto *new_mask = engine_->DeclareInput( + "new_mask", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 1, 2, 2}); + plugin::FusedTokenPrunePluginDynamic *plugin = + new plugin::FusedTokenPrunePluginDynamic( + true, /*keep_first_token*/ false, /*keep_order*/ true); + std::vector itensors = {attn, x, mask, new_mask}; + auto *layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin); + PADDLE_ENFORCE_NOT_NULL(layer, + platform::errors::InvalidArgument( + "TRT fused_token_prune layer building failed.")); + std::vector output_tensor_names{"out_slimmed_x", "out_cls_inds"}; + for (size_t i = 0; i < 2; i++) { + layer->getOutput(i)->setName(output_tensor_names[i].c_str()); + engine_->DeclareOutput(layer, i, output_tensor_names[i]); + } + engine_->FreezeNetwork(); + + ASSERT_EQ(engine_->engine()->getNbBindings(), 6); + LOG(INFO) << "create input"; + std::vector attn_v(64); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + for (int k = 0; k < 4; ++k) { + attn_v[i * 16 + j * 4 + k] = k; + } + } + } + std::vector x_v(16); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + x_v[i * 4 + j] = 1; + } + } + std::vector mask_v(64); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + for (int k = 0; k < 4; ++k) { + mask_v[i * 16 + j * 4 + k] = 1; + } + } + } + std::vector new_mask_v(16); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 2; ++k) { + new_mask_v[i * 4 + j * 2 + k] = 1; + } + } + } + + LOG(INFO) << "create output"; + std::vector out_slimmed_x_shape{4, 2, 1}; + std::vector out_cls_ins_shape{4, 2}; + + PrepareInputOutput({attn_v, x_v, mask_v, new_mask_v}, + {out_slimmed_x_shape, out_cls_ins_shape}); + + auto *attn_gpu_data = inputs_[0].mutable_data(ctx_->GetPlace()); + auto *x_gpu_data = inputs_[1].mutable_data(ctx_->GetPlace()); + auto *mask_gpu_data = inputs_[2].mutable_data(ctx_->GetPlace()); + auto *new_mask_gpu_data = inputs_[3].mutable_data(ctx_->GetPlace()); + + auto *slimmed_x_gpu_data = outputs_[0].mutable_data(ctx_->GetPlace()); + auto *cls_inds_gpu_data = outputs_[1].mutable_data(ctx_->GetPlace()); + + LOG(INFO) << "create buffers"; + + std::vector buffers(6); + buffers[0] = reinterpret_cast(attn_gpu_data); + buffers[1] = reinterpret_cast(x_gpu_data); + buffers[2] = reinterpret_cast(mask_gpu_data); + buffers[3] = reinterpret_cast(new_mask_gpu_data); + buffers[4] = reinterpret_cast(slimmed_x_gpu_data); + buffers[5] = reinterpret_cast(cls_inds_gpu_data); + + LOG(INFO) << "Execute"; + + engine_->Execute(4, &buffers, ctx_->stream()); + + std::vector slimmed_x_v; + std::vector cls_inds_v; + + LOG(INFO) << "GetOutput"; + GetOutput(slimmed_x_v, cls_inds_v); + + ASSERT_EQ(cls_inds_v[0], 2); + ASSERT_EQ(cls_inds_v[1], 3); + ASSERT_EQ(cls_inds_v[2], 2); + ASSERT_EQ(cls_inds_v[3], 3); + ASSERT_EQ(cls_inds_v[4], 2); + ASSERT_EQ(cls_inds_v[5], 3); + ASSERT_EQ(cls_inds_v[6], 2); + ASSERT_EQ(cls_inds_v[7], 3); + LOG(INFO) << "finish"; +#endif +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8261ce288cb97..4463a949948d8 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,12 +1,16 @@ -if(NOT APPLE AND NOT WIN32) - set(INFERENCE_EXTRA_DEPS paddle_inference_shared) -else() - set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_inference_io - ir_pass_manager analysis_predictor benchmark) +# If 
CI_SKIP_CPP_TEST=ON, there is no need to build and run these test. +if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + return() endif() -if(WITH_GPU AND TENSORRT_FOUND) - set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps}) +# In Windows, c_api test link must link both 2 shared to avoid symbols redefinition, +# in Linux, c_api test cant do like this or graph_to_program register more than once. +# Both Windows and Linux can only use paddle_inference_c, but this will increase size +# of build folder by 30G. +if(WIN32) + set(INFERENCE_C_EXTRA_DEPS paddle_inference_shared paddle_inference_c_shared) +else() + set(INFERENCE_C_EXTRA_DEPS paddle_inference_shared paddle_inference_c) endif() function(download_data install_dir data_file check_sum) @@ -107,7 +111,7 @@ function(inference_analysis_api_test target install_dir filename) SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt @@ -120,7 +124,7 @@ function(inference_analysis_api_int8_test target install_dir filename) SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt @@ -137,7 +141,7 @@ function(inference_multiple_models_analysis_api_test target install_dir SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${install_dir}/mobilenet_v2_models/1 --infer_model2=${install_dir}/mobilenet_v2_models/xx @@ -146,7 +150,7 @@ endfunction() function(inference_analysis_api_test_build TARGET_NAME filename) inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS}) + paddle_inference_shared) endfunction() function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir @@ -205,7 +209,7 @@ endfunction() function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename) inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS}) + paddle_inference_shared) endfunction() function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary @@ -373,7 +377,7 @@ inference_analysis_test( SRCS analyzer_dam_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt) @@ -434,7 +438,7 @@ inference_analysis_test( SRCS analyzer_ernie_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt @@ -478,7 +482,7 @@ inference_analysis_test( SRCS analyzer_transformer_compare_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt @@ -489,7 +493,7 @@ inference_analysis_test( SRCS analyzer_transformer_fuse_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt @@ -500,7 +504,7 @@ inference_analysis_test( SRCS analyzer_transformer_profile_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt @@ -518,7 +522,7 @@ inference_analysis_test( SRCS analyzer_vit_ocr_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS 
--infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr/model --infer_data=${VIT_OCR_INSTALL_DIR}/vit_ocr/datavit.txt) @@ -541,7 +545,7 @@ inference_analysis_test( SRCS analyzer_detect_functional_mkldnn_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt @@ -899,7 +903,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_mobilenet_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -907,7 +911,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_resnet50_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -915,7 +919,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_resnext_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -923,7 +927,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_fc_prelu_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -931,7 +935,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_cascade_rcnn_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -939,7 +943,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_split_converter_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/) inference_analysis_test( @@ -947,19 +951,27 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} - paddle_inference_c + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c) + endif() inference_analysis_test( test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} - paddle_inference_c + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c) + endif() set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") @@ -973,7 +985,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_quant_int8_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) @@ -989,7 +1001,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_quant_int8_yolov3_r50_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR}) @@ -1012,7 +1024,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}) @@ -1028,7 +1040,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_ernie_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) @@ -1045,7 +1057,7 @@ 
if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_transformer_prune_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) @@ -1060,7 +1072,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) @@ -1075,7 +1087,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized) @@ -1089,7 +1101,7 @@ inference_analysis_test( SRCS lite_mul_model_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${LITE_MODEL_INSTALL_DIR}) inference_analysis_test( @@ -1097,7 +1109,7 @@ inference_analysis_test( SRCS lite_resnet50_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1106,30 +1118,44 @@ inference_analysis_test( SRCS analyzer_capi_exp_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} - paddle_inference_c + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp paddle_inference_c) +endif() inference_analysis_test( test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} - paddle_inference_c + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_config + paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_pd_config paddle_inference_c) +endif() inference_analysis_test( test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} - paddle_inference_c + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_tensor + paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_pd_tensor paddle_inference_c) +endif() if(NOT APPLE AND NOT WIN32) inference_analysis_test( @@ -1137,17 +1163,23 @@ if(NOT APPLE AND NOT WIN32) SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} - paddle_inference_c + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_threads + paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_pd_threads paddle_inference_c) + endif() endif() + inference_analysis_test( test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -1159,7 +1191,7 @@ if(WITH_DISTRIBUTE SRCS analyzer_dist_model_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model) endif() @@ -1169,7 +1201,7 @@ inference_analysis_test( SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt @@ -1181,10 +1213,14 @@ if(WITH_MKLDNN) SRCS 
analyzer_capi_exp_int_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} - paddle_inference_c + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_int paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_int paddle_inference_c) + endif() endif() inference_analysis_test( @@ -1192,10 +1228,14 @@ inference_analysis_test( SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} - paddle_inference_c + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp_ner paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_ner paddle_inference_c) +endif() if(WITH_GPU) inference_analysis_test( @@ -1203,7 +1243,7 @@ if(WITH_GPU) SRCS paddle_infer_api_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1212,7 +1252,7 @@ if(WITH_GPU) SRCS paddle_infer_api_copy_tensor_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT @@ -1224,10 +1264,6 @@ cc_test( SRCS paddle_infer_api_errors_tester.cc DEPS paddle_inference_api) -if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - return() -endif() - if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300) set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300) @@ -1288,7 +1324,7 @@ if(WITH_IPU) SRCS ipu_word2vec_sample.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${WORD2VEC_INSTALL_DIR}) @@ -1307,7 +1343,7 @@ if(WITH_IPU) SRCS ipu_resnet50_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true @@ -1317,7 +1353,7 @@ if(WITH_IPU) SRCS ipu_resnet50_fp16_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc index 0df36592cc39e..dc8921ef7311e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc @@ -66,11 +66,6 @@ void profile(bool use_mkldnn = false) { FLAGS_num_threads); } -TEST(Analyzer_resnet50, profile) { profile(); } -#ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); } -#endif - // Check the fuse status TEST(Analyzer_resnet50, fuse_statis) { AnalysisConfig cfg; @@ -82,6 +77,11 @@ TEST(Analyzer_resnet50, fuse_statis) { LOG(INFO) << "num_ops: " << num_ops; } +TEST(Analyzer_resnet50, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + // Compare result of NativeConfig and AnalysisConfig void compare(bool use_mkldnn = false) { AnalysisConfig cfg; diff --git a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc index 93d4a88383c33..70c1eb8bab253 100644 --- a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc +++ b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc @@ -23,6 +23,11 @@ namespace inference { TEST(TensorRT_fc, compare) { std::string 
model_dir = FLAGS_infer_model + "/fc_uint8"; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.DisableGlogInfo(); + auto predictor = CreatePaddlePredictor(config); compare(model_dir, /* use_tensorrt */ true); // Open it when need. // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc index 3b25c32fc7514..45c14f4fc8b37 100644 --- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc +++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc @@ -23,6 +23,11 @@ namespace inference { TEST(TensorRT_mobilenet, compare) { std::string model_dir = FLAGS_infer_model + "/mobilenet"; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.DisableGlogInfo(); + auto predictor = CreatePaddlePredictor(config); compare(model_dir, /* use_tensorrt */ true); // Open it when need. // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); diff --git a/paddle/fluid/inference/tests/api/trt_resnext_test.cc b/paddle/fluid/inference/tests/api/trt_resnext_test.cc index 374074957c870..8d4e331fa9730 100644 --- a/paddle/fluid/inference/tests/api/trt_resnext_test.cc +++ b/paddle/fluid/inference/tests/api/trt_resnext_test.cc @@ -23,6 +23,11 @@ namespace inference { TEST(TensorRT_resnext50, compare) { std::string model_dir = FLAGS_infer_model + "/resnext50"; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.DisableGlogInfo(); + auto predictor = CreatePaddlePredictor(config); compare(model_dir, /* use_tensorrt */ true); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index c36968b7ed6f8..056371b0ae662 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -179,67 +179,69 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { } #ifdef PADDLE_WITH_GPU -// TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) { -// int thread_num = 4; - -// // init stream -// std::vector streams(thread_num); -// for (size_t i = 0; i < thread_num; ++i) { -// cudaStreamCreate(&streams[i]); -// } - -// // init input data -// std::map my_input_data_map; -// my_input_data_map["x"] = PrepareInput(2); -// // init output data -// std::map infer_output_data, -// truth_output_data; -// // prepare groudtruth config -// paddle_infer::Config config, config_no_ir; -// config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", -// FLAGS_modeldir + "/inference.pdiparams"); -// config_no_ir.SwitchIrOptim(false); -// // prepare inference config -// config.SetModel(FLAGS_modeldir + "/inference.pdmodel", -// FLAGS_modeldir + "/inference.pdiparams"); -// config.EnableUseGpu(100, 0); -// config.EnableTensorRtEngine( -// 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); -// // get groudtruth by disbale ir - -// paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); -// SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, -// &truth_output_data, 1); - -// // get infer results from multi threads -// std::vector threads; -// config.SetExecStream(streams[0]); -// config.pass_builder()->DeletePass("add_support_int8_pass"); -// auto main_predictor = CreatePredictor(config); -// std::vector predictors; -// for (size_t i = 0; i < thread_num - 1; ++i) { -// 
predictors.push_back(std::move(main_predictor->Clone(streams[i + 1]))); -// LOG(INFO) << "predictors[" << i << "] stream is " -// << predictors[i]->GetExecStream(); -// } -// predictors.push_back(std::move(main_predictor)); -// LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is " -// << predictors[thread_num - 1]->GetExecStream(); -// for (int i = 0; i < thread_num; ++i) { -// threads.emplace_back(paddle::test::SingleThreadPrediction, -// predictors[i].get(), &my_input_data_map, -// &infer_output_data, 10); -// } - -// // thread join & check outputs -// for (int i = 0; i < thread_num; ++i) { -// LOG(INFO) << "join tid : " << i; -// threads[i].join(); -// CompareRecord(&truth_output_data, &infer_output_data); -// } - -// std::cout << "finish multi-thread test" << std::endl; -// } +TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) { + int thread_num = 4; + + // init stream + std::vector streams(thread_num); + for (size_t i = 0; i < thread_num; ++i) { + cudaStreamCreate(&streams[i]); + } + + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); + // get groudtruth by disbale ir + + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction( + pred_pool_no_ir.Retrive(0), &my_input_data_map, &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + config.SetExecStream(streams[0]); + config.pass_builder()->DeletePass("add_support_int8_pass"); + auto main_predictor = CreatePredictor(config); + std::vector predictors; + for (size_t i = 0; i < thread_num - 1; ++i) { + predictors.push_back(std::move(main_predictor->Clone(streams[i + 1]))); + LOG(INFO) << "predictors[" << i << "] stream is " + << predictors[i]->GetExecStream(); + } + predictors.push_back(std::move(main_predictor)); + LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is " + << predictors[thread_num - 1]->GetExecStream(); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + predictors[i].get(), + &my_input_data_map, + &infer_output_data, + 10); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data); + } + + std::cout << "finish multi-thread test" << std::endl; +} #endif } // namespace paddle_infer diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 9ab07633e0fe0..f165002f353e4 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -18,6 +18,13 @@ cc_test( infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. 
+ copy_onnx(infer_io_utils_tester) +endif() + cc_library(table_printer SRCS table_printer.cc) cc_test( test_table_printer diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index 9c96d8e986a22..e15ef14e5dc59 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -26,16 +26,17 @@ cc_library( cc_library( jit_layer SRCS layer.cc - DEPS jit_compilation_unit) + DEPS jit_serializer jit_function_utils jit_serializer_utils + jit_compilation_unit jit_function_schema) if(WITH_TESTING AND NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") add_custom_target( jit_download_program - COMMAND wget -nc + COMMAND wget -nc -q https://paddle-ci.gz.bcebos.com/dy2st/multi_program_load.tar.gz - COMMAND tar zxvf multi_program_load.tar.gz) + COMMAND tar zxf multi_program_load.tar.gz) set(JIT_DEPS phi elementwise_add_op @@ -45,12 +46,7 @@ if(WITH_TESTING feed_op fetch_op scale_op - jit_serializer - jit_layer - jit_function_utils - jit_function_schema - jit_compilation_unit - jit_serializer_utils) + jit_layer) cc_test( layer_test SRCS layer_test.cc diff --git a/paddle/fluid/jit/base_function.h b/paddle/fluid/jit/base_function.h index ebe4314a5319e..df774d8fd84c7 100644 --- a/paddle/fluid/jit/base_function.h +++ b/paddle/fluid/jit/base_function.h @@ -14,23 +14,23 @@ #pragma once -#include -#include - -#include "paddle/phi/common/place.h" - -#include "paddle/fluid/framework/variable.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/core/dense_tensor.h" namespace paddle { namespace jit { -using Variable = paddle::framework::Variable; +using Tensor = paddle::experimental::Tensor; +using DenseTensor = phi::DenseTensor; + class BaseFunction { public: - virtual std::vector operator()( - const std::vector &inputs) = 0; + virtual std::vector operator()( + const std::vector &inputs) = 0; + + virtual std::vector operator()(const std::vector &inputs) = 0; + virtual ~BaseFunction() {} - // virtual void SetPalce(const phi::Place &place); }; } // namespace jit diff --git a/paddle/fluid/jit/compilation_unit.cc b/paddle/fluid/jit/compilation_unit.cc index 261839b479e5b..60d42d045b0e3 100644 --- a/paddle/fluid/jit/compilation_unit.cc +++ b/paddle/fluid/jit/compilation_unit.cc @@ -22,16 +22,28 @@ namespace jit { std::shared_ptr CompilationUnit::Function( const std::string &name) const { PADDLE_ENFORCE_EQ( - function_dict_.count(name), + function_map_.count(name), 1, - platform::errors::InvalidArgument( - "Funciton name %s is not exist in function_dict_.", name)); - return function_dict_.at(name); + phi::errors::InvalidArgument( + "Funciton name %s is not exist in function_map_.", name)); + return function_map_.at(name); } void CompilationUnit::SetFunction( const std::string &name, const std::shared_ptr &function) { - function_dict_[name] = function; + function_map_[name] = function; +} + +std::vector CompilationUnit::FunctionNames() const { + std::vector names; + for (auto it = function_map_.begin(); it != function_map_.end(); it++) { + names.emplace_back(it->first); + } + return names; +} + +const Name2FunctionMap &CompilationUnit::FunctionMap() const { + return function_map_; } } // namespace jit diff --git a/paddle/fluid/jit/compilation_unit.h b/paddle/fluid/jit/compilation_unit.h index 2944aa928f32f..45a771b649401 100644 --- a/paddle/fluid/jit/compilation_unit.h +++ b/paddle/fluid/jit/compilation_unit.h @@ -21,6 +21,8 @@ namespace paddle { namespace jit { +using Name2FunctionMap = + std::unordered_map>; class CompilationUnit { public: @@ -32,8 
+34,12 @@ class CompilationUnit { void SetFunction(const std::string &name, const std::shared_ptr &function); + std::vector FunctionNames() const; + + const Name2FunctionMap &FunctionMap() const; + private: - std::unordered_map> function_dict_; + Name2FunctionMap function_map_; }; } // namespace jit diff --git a/paddle/fluid/jit/executor_function.h b/paddle/fluid/jit/executor_function.h index 224798b7dbb2b..a9b9d59d21bf4 100644 --- a/paddle/fluid/jit/executor_function.h +++ b/paddle/fluid/jit/executor_function.h @@ -42,20 +42,26 @@ class ExecutorFunction : public BaseFunction { ~ExecutorFunction() noexcept {} - std::vector operator()(const std::vector &inputs) { - utils::ShareInputsIntoScope(info_->InputArgNames(), inputs, &scope_); + std::vector operator()(const std::vector &inputs) { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); + } + + std::vector operator()(const std::vector &inputs) { + utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); inner_exe_.Run(info_->ProgramDesc(), &scope_, /*blockID=*/0, false, true, info_->OutputArgNames()); - VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); - std::vector res; - utils::FetchVarsByNames(info_->OutputArgNames(), scope_, &res); + std::vector res; + utils::FetchOuts(info_->OutputArgNames(), scope_, &res); return res; } + const std::shared_ptr &Info() const { return info_; } + private: std::shared_ptr info_; framework::Scope scope_; diff --git a/paddle/fluid/jit/function_utils.cc b/paddle/fluid/jit/function_utils.cc index 4757e784dfe75..a6da061de99dc 100644 --- a/paddle/fluid/jit/function_utils.cc +++ b/paddle/fluid/jit/function_utils.cc @@ -21,36 +21,50 @@ namespace paddle { namespace jit { namespace utils { -void FetchVarsByNames(const std::vector &names, - const framework::Scope &scope, - std::vector *outs) { - for (auto &out_name : names) { + +std::vector ToDenseTensors(const std::vector &tensors) { + std::vector ret; + for (auto &t : tensors) { + ret.emplace_back(*std::dynamic_pointer_cast(t.impl())); + } + return ret; +} + +std::vector ToTensors(const std::vector &tensors) { + std::vector ret; + for (auto &t : tensors) { + ret.emplace_back(std::make_shared(t)); + } + return ret; +} + +void FetchOuts(const std::vector &names, + const framework::Scope &scope, + std::vector *outs) { + outs->reserve(names.size()); + for (size_t i = 0; i < names.size(); ++i) { + auto &out_name = names[i]; VLOG(3) << "fetch out: " << out_name; auto *var = scope.FindVar(out_name); auto &src_tensor = var->Get(); - Variable v; - auto *p = v.GetMutable(); - *p = src_tensor; - outs->emplace_back(v); + outs->emplace_back(src_tensor); } } -void ShareInputsIntoScope(const std::vector &ordered_input_names, - const std::vector &vars, - framework::Scope *scope) { - VLOG(3) << "vars size: " << vars.size(); +void ShareIntoScope(const std::vector &ordered_input_names, + const std::vector &tensors, + framework::Scope *scope) { + VLOG(3) << "tensors size: " << tensors.size(); PADDLE_ENFORCE_EQ( - vars.size(), + tensors.size(), ordered_input_names.size(), platform::errors::InvalidArgument( - "vars.size() should be equal to ordered_input_names.size().")); - - for (size_t i = 0; i < vars.size(); i++) { + "tensors.size() should be equal to ordered_input_names.size().")); + for (size_t i = 0; i < tensors.size(); ++i) { VLOG(3) << "share into scope: " << ordered_input_names[i]; - auto &dense_tensor = vars[i].Get(); auto *var = scope->Var(ordered_input_names[i]); auto *dst_tensor = var->GetMutable(); 
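    // The assignment below shares the caller's DenseTensor into the scope
    // variable named ordered_input_names[i], so downstream ops can look the
    // input up by name without copying the underlying buffer.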
- *dst_tensor = dense_tensor; + *dst_tensor = tensors[i]; } } diff --git a/paddle/fluid/jit/function_utils.h b/paddle/fluid/jit/function_utils.h index 49db3f71fbdbf..ba1eaf7308be9 100644 --- a/paddle/fluid/jit/function_utils.h +++ b/paddle/fluid/jit/function_utils.h @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -30,15 +31,20 @@ namespace jit { using Variable = paddle::framework::Variable; using Name2VariableMap = std::unordered_map; using DenseTensor = phi::DenseTensor; +using Tensor = paddle::experimental::Tensor; + namespace utils { -void FetchVarsByNames(const std::vector &names, - const framework::Scope &scope, - std::vector *outs); +std::vector ToDenseTensors(const std::vector &tensors); +std::vector ToTensors(const std::vector &tensors); -void ShareInputsIntoScope(const std::vector &ordered_input_names, - const std::vector &vars, - framework::Scope *scope); +void FetchOuts(const std::vector &names, + const framework::Scope &scope, + std::vector *outs); + +void ShareIntoScope(const std::vector &ordered_input_names, + const std::vector &vars, + framework::Scope *scope); void ShareParamsIntoScope(const std::vector ¶m_names, const Name2VariableMap ¶ms_dict, diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc index a11101d520493..f5985d71b0347 100644 --- a/paddle/fluid/jit/layer.cc +++ b/paddle/fluid/jit/layer.cc @@ -16,9 +16,6 @@ namespace paddle { namespace jit { -// TODO(dev): Make vector, num_slot as in argument -// Layer(const std::shared_ptr& type) : obj_(type, /*num_slot*/ 0U) -// {} Layer::Layer(const std::vector>& infos, const Name2VariableMap& params_dict, const phi::Place& place) @@ -30,7 +27,13 @@ std::shared_ptr Layer::Function(const std::string& name) const { return unit_.Function(name); } -std::vector Layer::forward(const std::vector& inputs) { +std::vector Layer::forward(const std::vector& inputs) { + auto func = Function("forward"); + return (*func)(inputs); +} + +std::vector Layer::forward( + const std::vector& inputs) { auto func = Function("forward"); return (*func)(inputs); } @@ -42,5 +45,13 @@ void Layer::SetFunction(const std::string& name, unit_.SetFunction(name, function); } +std::vector Layer::FunctionNames() const { + return unit_.FunctionNames(); +} + +const Name2FunctionMap& Layer::FunctionMap() const { + return unit_.FunctionMap(); +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h index 1407259d14444..ee75881fc3156 100644 --- a/paddle/fluid/jit/layer.h +++ b/paddle/fluid/jit/layer.h @@ -32,9 +32,6 @@ using Name2VariableMap = std::unordered_map; class Layer { public: - // TODO(dev): Make vector, num_slot as in argument - // Layer(const std::shared_ptr& type) : obj_(type, /*num_slot*/ 0U) - // {} Layer(const std::vector>& infos, const Name2VariableMap& params_dict, const phi::Place& place); @@ -43,15 +40,20 @@ class Layer { Variable Attribute(const std::string& name) const; - std::vector forward(const std::vector& inputs); + std::vector forward(const std::vector& inputs); + + std::vector forward(const std::vector& inputs); void to(const phi::Place& place); void SetFunction(const std::string& name, const std::shared_ptr& function); + std::vector FunctionNames() const; + + const Name2FunctionMap& FunctionMap() const; + private: - // internal::Object obj_; Name2VariableMap params_dict_; Name2VariableMap 
attrs_dict_; CompilationUnit unit_; diff --git a/paddle/fluid/jit/layer_test.cc b/paddle/fluid/jit/layer_test.cc index 6c9adff385aba..793afacb79dc7 100644 --- a/paddle/fluid/jit/layer_test.cc +++ b/paddle/fluid/jit/layer_test.cc @@ -52,17 +52,16 @@ namespace paddle { namespace jit { using DenseTensor = phi::DenseTensor; -std::vector PrepareInputs(const phi::Place& place) { +std::vector PrepareInputs(const phi::Place& place) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(place); - Variable v; - auto* dense_tensor = v.GetMutable(); - dense_tensor->Resize(phi::make_ddim({2, 4})); - dense_tensor->mutable_data(place); - phi::funcs::set_constant(dev_ctx, dense_tensor, 2.); + DenseTensor t; + t.Resize(phi::make_ddim({2, 4})); + t.mutable_data(place); + phi::funcs::set_constant(dev_ctx, &t, 2.); - return {v}; + return {t}; } TEST(CpuLayerTest, Construct) { @@ -72,16 +71,12 @@ TEST(CpuLayerTest, Construct) { auto inputs = PrepareInputs(place); auto outs = layer.forward(inputs); - auto out_vars = outs[0]; - auto out_dense_tensor = out_vars.Get(); - auto out_data = out_dense_tensor.data(); + auto out_data = outs[0].data(); EXPECT_NEAR(out_data[0], 0.02194316, 1e-6); auto func = layer.Function("infer"); outs = (*func)(inputs); - out_vars = outs[0]; - out_dense_tensor = out_vars.Get(); - out_data = out_dense_tensor.data(); + out_data = outs[0].data(); EXPECT_NEAR(out_data[0], 1.41562390, 1e-6); } @@ -98,8 +93,7 @@ TEST(GpuLayerTest, Construct) { auto inputs = PrepareInputs(place); auto outs = layer.forward(inputs); - auto out_vars = outs[0]; - auto out_dense_tensor = out_vars.Get(); + auto out_dense_tensor = outs[0]; phi::Copy( *dev_ctx_gpu, out_dense_tensor, phi::CPUPlace(), true, &cpu_dense_tensor); auto out_data = cpu_dense_tensor.data(); @@ -107,8 +101,7 @@ TEST(GpuLayerTest, Construct) { auto func = layer.Function("infer"); outs = (*func)(inputs); - out_vars = outs[0]; - out_dense_tensor = out_vars.Get(); + out_dense_tensor = outs[0]; phi::Copy( *dev_ctx_gpu, out_dense_tensor, phi::CPUPlace(), true, &cpu_dense_tensor); out_data = cpu_dense_tensor.data(); diff --git a/paddle/fluid/jit/pe_function.h b/paddle/fluid/jit/pe_function.h index a77fd59358660..f174a0e996467 100644 --- a/paddle/fluid/jit/pe_function.h +++ b/paddle/fluid/jit/pe_function.h @@ -43,24 +43,29 @@ class PEFunction : public BaseFunction { ~PEFunction() noexcept {} - std::vector operator()(const std::vector &inputs) { - // bool is_test = true; + std::vector operator()(const std::vector &inputs) { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); + } + + std::vector operator()(const std::vector &inputs) { std::string prog_string; std::hash string_hash; auto &program_desc = info_->ProgramDesc(); + // TODO(dev): Serialize is very slow. 
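The PEFunction hunk that continues below serializes the ProgramDesc and hashes the result into the integer key used to look up the cached executor. A self-contained sketch of just that keying step (ProgramIdFor is an illustrative name, not a Paddle symbol):

    #include <cstdint>
    #include <functional>
    #include <string>

    // Hash the serialized program text into an int64_t id; identical
    // serialized programs map to the same id, so the cache can be reused.
    int64_t ProgramIdFor(const std::string &serialized_program) {
      std::hash<std::string> string_hash;
      return static_cast<int64_t>(string_hash(serialized_program));
    }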
const_cast(&program_desc) ->Proto() ->SerializePartialToString(&prog_string); - // program_desc.Proto()->SerializePartialToString(&prog_string); + int64_t program_id = static_cast(string_hash(prog_string)); const framework::BlockDesc &global_block = program_desc.Block(0); int64_t start_op_index = 0; int64_t end_op_index = static_cast(global_block.OpSize()); - utils::ShareInputsIntoScope(info_->InputArgNames(), inputs, &scope_); + utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); std::vector input_var_names = info_->InputArgNames(); std::vector output_var_names = info_->OutputArgNames(); - std::vector dout_var_names; + if (end_op_index > start_op_index) { auto cache_info = framework::GetExecutorInfoFromCache(program_desc, place_, @@ -78,9 +83,7 @@ class PEFunction : public BaseFunction { skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), output_var_names.begin(), output_var_names.end()); - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - dout_var_names.begin(), - dout_var_names.end()); + framework::details::ParseSafeEagerDeletionSkipVars( program_desc, end_op_index, @@ -89,9 +92,8 @@ class PEFunction : public BaseFunction { } parallel_executor->RunWithoutFetch(skip_eager_delete_vars); } - VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); - std::vector res; - utils::FetchVarsByNames(info_->OutputArgNames(), scope_, &res); + std::vector res; + utils::FetchOuts(info_->OutputArgNames(), scope_, &res); return res; } diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 5d1f97c096bdd..eccba465051b9 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,4 +1,3 @@ -add_subdirectory(detail) add_subdirectory(allocation) if(WITH_MKLDNN) @@ -10,7 +9,7 @@ endif() cc_library( malloc SRCS malloc.cc - DEPS place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS}) + DEPS place enforce allocator profiler ${MKLDNN_CTX_DEPS}) cc_library( memcpy SRCS memcpy.cc diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 46a46b04b3e0c..ec8391469f94c 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,264 +1,179 @@ -cc_library( - allocator - SRCS allocator.cc - DEPS place stats profiler) -cc_library( - cpu_allocator - SRCS cpu_allocator.cc - DEPS allocator) -cc_library( - locked_allocator - SRCS locked_allocator.cc - DEPS allocator) -cc_library( - buffered_allocator - SRCS buffered_allocator.cc - DEPS allocator) -cc_library( - best_fit_allocator - SRCS best_fit_allocator.cc - DEPS allocator) -cc_library( - naive_best_fit_allocator - SRCS naive_best_fit_allocator.cc - DEPS allocator buddy_allocator) -cc_test( - naive_best_fit_allocator_test - SRCS naive_best_fit_allocator_test.cc - DEPS naive_best_fit_allocator) -cc_test( - buffered_allocator_test - SRCS buffered_allocator_test.cc - DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) +include(ExternalProject) + +set(ALLOCATOR_DEPS place stats profiler) +set(ALLOCATOR_SRCS + allocator.cc + cpu_allocator.cc + locked_allocator.cc + aligned_allocator.cc + buffered_allocator.cc + best_fit_allocator.cc + naive_best_fit_allocator.cc + allocator_strategy.cc + allocator_facade.cc + auto_growth_best_fit_allocator.cc + virtual_memory_auto_growth_best_fit_allocator.cc + retry_allocator.cc + memory_block.cc + memory_block_desc.cc + meta_cache.cc + buddy_allocator.cc + system_allocator.cc) -if(WITH_MKLDNN) - 
set(MKLDNN_CTX_DEPS mkldnn) -else() - set(MKLDNN_CTX_DEPS) +if(WITH_GPU OR WITH_ROCM) + list( + APPEND + ALLOCATOR_SRCS + cuda_allocator.cc + cuda_managed_allocator.cc + pinned_allocator.cc + stream_safe_cuda_allocator.cc + thread_local_allocator.cc) + list(APPEND ALLOCATOR_DEPS cuda_device_guard gpu_info dynload_cuda) endif() if(WITH_GPU) - nv_library( - cuda_allocator - SRCS cuda_allocator.cc - DEPS allocator cuda_device_guard stats) - nv_library( - cuda_managed_allocator - SRCS cuda_managed_allocator.cc - DEPS allocator cuda_device_guard gpu_info) - nv_library( - pinned_allocator - SRCS pinned_allocator.cc - DEPS allocator) - nv_library( - stream_safe_cuda_allocator - SRCS stream_safe_cuda_allocator.cc - DEPS allocator cuda_graph) - nv_library( - thread_local_allocator - SRCS thread_local_allocator.cc - DEPS allocator) + list(APPEND ALLOCATOR_DEPS cuda_graph) +endif() - cc_test( - thread_local_allocator_test - SRCS thread_local_allocator_test.cc - DEPS thread_local_allocator) - if(CUDA_VERSION GREATER_EQUAL 10.2) - nv_library( - cuda_virtual_mem_allocator - SRCS cuda_virtual_mem_allocator.cc - DEPS dynload_cuda) +if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2) + list(APPEND ALLOCATOR_SRCS cuda_virtual_mem_allocator.cc) +endif() + +if(NOT WIN32) + list(APPEND ALLOCATOR_SRCS mmap_allocator.cc) + if(WITH_GPU) + list(APPEND ALLOCATOR_SRCS cuda_ipc_allocator.cc) endif() endif() -if(WITH_ROCM) - hip_library( - cuda_allocator - SRCS cuda_allocator.cc - DEPS allocator cuda_device_guard stats) - hip_library( - cuda_managed_allocator - SRCS cuda_managed_allocator.cc - DEPS allocator cuda_device_guard gpu_info) - hip_library( - pinned_allocator - SRCS pinned_allocator.cc - DEPS allocator) - hip_library( - stream_safe_cuda_allocator - SRCS stream_safe_cuda_allocator.cc - DEPS allocator) - hip_library( - thread_local_allocator - SRCS thread_local_allocator.cc - DEPS allocator) +if(WITH_ASCEND_CL) + list(APPEND ALLOCATOR_SRCS npu_allocator.cc npu_pinned_allocator.cc) + list(APPEND ALLOCATOR_DEPS npu_info) +endif() - cc_test( - thread_local_allocator_test - SRCS thread_local_allocator_test.cc - DEPS thread_local_allocator) +if(WITH_CUSTOM_DEVICE) + list(APPEND ALLOCATOR_SRCS custom_allocator.cc) + list(APPEND ALLOCATOR_DEPS device_manager) endif() -if(WITH_ASCEND_CL) - cc_library( - npu_allocator - SRCS npu_allocator.cc - DEPS allocator npu_info) - cc_library( - npu_pinned_allocator - SRCS npu_pinned_allocator.cc - DEPS allocator npu_info) +if(WITH_XPU) + list(APPEND ALLOCATOR_DEPS xpu_info) endif() -cc_library( - retry_allocator - SRCS retry_allocator.cc +if(WITH_IPU) + list(APPEND ALLOCATOR_DEPS ipu_info) +endif() + +add_library(allocator "${ALLOCATOR_SRCS}") +target_link_libraries(allocator ${ALLOCATOR_DEPS}) +# note: why only add dependency for framework_proto. +# Because it is needed to generate framework.pb.h used in some header files. 
+add_dependencies(allocator framework_proto) +set_property(GLOBAL PROPERTY FLUID_MODULES allocator) + +cc_test( + naive_best_fit_allocator_test + SRCS naive_best_fit_allocator_test.cc + DEPS allocator) +cc_test( + buffered_allocator_test + SRCS buffered_allocator_test.cc DEPS allocator) -if(WITH_GPU OR WITH_ROCM) - set(AllocatorFacadeDeps - gpu_info - cuda_allocator - cuda_managed_allocator - pinned_allocator - cuda_device_guard - thread_local_allocator - stream_safe_cuda_allocator - device_context) - if(CUDA_VERSION GREATER_EQUAL 10.2) - list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) - endif() -elseif(WITH_XPU) - set(AllocatorFacadeDeps xpu_info) -elseif(WITH_IPU) - set(AllocatorFacadeDeps ipu_info) -elseif(WITH_ASCEND) - set(AllocatorFacadeDeps ascend_npu_info) -else() - set(AllocatorFacadeDeps) +if(WITH_GPU) + nv_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS allocator) endif() - -if(WITH_CUSTOM_DEVICE) - cc_library( - custom_allocator - SRCS custom_allocator.cc - DEPS allocator device_manager) - set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) +if(WITH_ROCM) + hip_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS allocator) endif() if(WITH_GPU) nv_test( best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu - DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator - device_context memcpy) + DEPS allocator memcpy) elseif(WITH_ROCM) hip_test( best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu - DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator - device_context memcpy) + DEPS allocator memcpy) else() cc_test( best_fit_allocator_test SRCS best_fit_allocator_test.cc - DEPS best_fit_allocator locked_allocator cpu_allocator) -endif() - -list( - APPEND - AllocatorFacadeDeps - cpu_allocator - locked_allocator - aligned_allocator - retry_allocator - buffered_allocator - naive_best_fit_allocator - auto_growth_best_fit_allocator - virtual_memory_auto_growth_best_fit_allocator - best_fit_allocator) - -if(WITH_ASCEND_CL) - list(APPEND AllocatorFacadeDeps npu_pinned_allocator) + DEPS allocator) endif() -cc_library( - aligned_allocator - SRCS aligned_allocator.cc - DEPS allocator) cc_test( test_aligned_allocator SRCS test_aligned_allocator.cc - DEPS aligned_allocator) -cc_library( - allocator_strategy - SRCS allocator_strategy.cc - DEPS gflags ${AllocatorFacadeDeps}) -cc_library( - allocator_facade - SRCS allocator_facade.cc - DEPS allocator_strategy stats) - -if(WITH_GPU) - target_link_libraries(allocator_facade cuda_graph) -endif() + DEPS allocator) cc_test( retry_allocator_test SRCS retry_allocator_test.cc - DEPS retry_allocator locked_allocator cpu_allocator) -if(WITH_TESTING) - if((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) - target_link_libraries(retry_allocator_test cuda_allocator) - endif() - - if(TEST retry_allocator_test) - set_tests_properties(retry_allocator_test PROPERTIES LABELS - "RUN_TYPE=EXCLUSIVE") - endif() + DEPS allocator) +if(TEST retry_allocator_test) + set_tests_properties(retry_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") endif() cc_test( allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc - DEPS allocator_facade) + DEPS allocator) cc_test( allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc - DEPS allocator_facade) + DEPS allocator) -cc_library( - auto_growth_best_fit_allocator - SRCS auto_growth_best_fit_allocator.cc - DEPS 
allocator aligned_allocator flags) cc_test( auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc - DEPS cpu_allocator auto_growth_best_fit_allocator) + DEPS allocator) cc_test( auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc - DEPS auto_growth_best_fit_allocator) - -cc_library( - virtual_memory_auto_growth_best_fit_allocator - SRCS virtual_memory_auto_growth_best_fit_allocator.cc - DEPS allocator aligned_allocator) + DEPS allocator) if(NOT WIN32) - cc_library( - mmap_allocator - SRCS mmap_allocator.cc - DEPS allocator) cc_test( mmap_allocator_test SRCS mmap_allocator_test.cc - DEPS mmap_allocator allocator) - if(WITH_GPU) - cc_library( - cuda_ipc_allocator - SRCS cuda_ipc_allocator.cc - DEPS allocator) + DEPS allocator) +endif() + +cc_test( + system_allocator_test + SRCS system_allocator_test.cc + DEPS allocator) + +cc_test( + buddy_allocator_test + SRCS buddy_allocator_test.cc + DEPS allocator) + +if(WITH_TESTING) + if(TEST buddy_allocator_test) + set_tests_properties(buddy_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") + endif() + + # TODO(zhiqiu): why not win32? because wget is not found on windows + if(NOT WIN32) + add_custom_target( + download_data + COMMAND wget -nc + https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar + COMMAND tar -xf buddy_allocator_test_data.tar) + add_dependencies(buddy_allocator_test download_data) endif() endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4364934a4027d..917cebc11f9a9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -1080,11 +1080,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } else { return m->GetAllocator(p, size)->Allocate(size); } -#elif defined PADDLE_WITH_XPU +#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) return GetAllocator(place)->Allocate(size); #else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with GPU or XPU.")); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Not compiled with GPU or XPU or NPU.")); #endif } diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc similarity index 99% rename from paddle/fluid/memory/detail/buddy_allocator.cc rename to paddle/fluid/memory/allocation/buddy_allocator.cc index 90cce14c5676c..907fd37e44205 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/allocation/buddy_allocator.h" #include diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/allocation/buddy_allocator.h similarity index 97% rename from paddle/fluid/memory/detail/buddy_allocator.h rename to paddle/fluid/memory/allocation/buddy_allocator.h index 463e3cfcf6d8d..5e39e21c9664f 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/allocation/buddy_allocator.h @@ -25,8 +25,8 @@ limitations under the License. 
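The allocator_facade.cc hunk above folds the Ascend NPU build into the XPU fallback branch and widens the error message accordingly. A standalone sketch of that guard shape, where AllocOnPlace and the malloc calls are stand-ins rather than Paddle code:

    #include <cstddef>
    #include <cstdlib>
    #include <stdexcept>

    void *AllocOnPlace(std::size_t size) {
    #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      return std::malloc(size);  // stands in for the GPU allocator path
    #elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
      return std::malloc(size);  // stands in for GetAllocator(place)->Allocate(size)
    #else
      (void)size;
      throw std::runtime_error("Not compiled with GPU or XPU or NPU.");
    #endif
    }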
*/ #include #include -#include "paddle/fluid/memory/detail/memory_block.h" -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/memory_block.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc similarity index 99% rename from paddle/fluid/memory/detail/buddy_allocator_test.cc rename to paddle/fluid/memory/allocation/buddy_allocator_test.cc index ab558e8bfce15..ad53a784502b4 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/allocation/buddy_allocator.h" #include @@ -330,7 +330,7 @@ TEST(BuddyAllocator, SpeedAna) { std::vector vec_free_flag; std::string line; - int size, id; + int size = 0, id = 0; while (in_file >> size >> id) { vec_size.push_back(size); vec_pos.push_back(id); diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/allocation/memory_block.cc similarity index 98% rename from paddle/fluid/memory/detail/memory_block.cc rename to paddle/fluid/memory/allocation/memory_block.cc index 52f7d33aae1d3..0f0a81cf9d118 100644 --- a/paddle/fluid/memory/detail/memory_block.cc +++ b/paddle/fluid/memory/allocation/memory_block.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/allocation/memory_block.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/detail/memory_block.h b/paddle/fluid/memory/allocation/memory_block.h similarity index 100% rename from paddle/fluid/memory/detail/memory_block.h rename to paddle/fluid/memory/allocation/memory_block.h diff --git a/paddle/fluid/memory/detail/memory_block_desc.cc b/paddle/fluid/memory/allocation/memory_block_desc.cc similarity index 97% rename from paddle/fluid/memory/detail/memory_block_desc.cc rename to paddle/fluid/memory/allocation/memory_block_desc.cc index 93d2559c37f77..d20d56a6d05e8 100644 --- a/paddle/fluid/memory/detail/memory_block_desc.cc +++ b/paddle/fluid/memory/allocation/memory_block_desc.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/allocation/memory_block.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/allocation/meta_cache.cc similarity index 97% rename from paddle/fluid/memory/detail/meta_cache.cc rename to paddle/fluid/memory/allocation/meta_cache.cc index 4831e005c84c0..945b0f7b89283 100644 --- a/paddle/fluid/memory/detail/meta_cache.cc +++ b/paddle/fluid/memory/allocation/meta_cache.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
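The buddy_allocator_test.cc hunk above zero-initializes `size` and `id` before the extraction loop, so no indeterminate value is used if the stream read fails. A standalone sketch of that read pattern (ReadSizeIdPairs and its output vectors are illustrative, not the PR's test code):

    #include <fstream>
    #include <string>
    #include <vector>

    void ReadSizeIdPairs(const std::string &path,
                         std::vector<int> *sizes,
                         std::vector<int> *ids) {
      std::ifstream in_file(path);
      int size = 0, id = 0;  // initialized, as in the updated test
      while (in_file >> size >> id) {
        sizes->push_back(size);
        ids->push_back(id);
      }
    }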
*/ #include "glog/logging.h" -#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/allocation/memory_block.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 9d5f048a1651d..d1a3b77e7720b 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -18,8 +18,8 @@ #include "gflags/gflags.h" #include "glog/logging.h" -#include "paddle/fluid/memory/detail/buddy_allocator.h" -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/buddy_allocator.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" @@ -59,7 +59,7 @@ uint64_t Release(const Place &place); template size_t Used(const Place &place); -struct Usage : public boost::static_visitor { +struct Usage { size_t operator()(const platform::CPUPlace &cpu) const; size_t operator()(const platform::CUDAPlace &gpu) const; size_t operator()(const platform::CUDAPinnedPlace &cuda_pinned) const; @@ -180,7 +180,7 @@ void Free(const platform::XPUPlace &place, void *p, size_t size) { #ifdef PADDLE_WITH_XPU - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); platform::XPUDeviceGuard gurad(place.device); @@ -764,7 +764,7 @@ class BuddyAllocatorList { private: explicit BuddyAllocatorList(const std::string &device_type) : device_type_(device_type) { - auto devices = phi::DeviceManager::GetDeviceList(device_type); + auto devices = phi::DeviceManager::GetSelectedDeviceList(device_type); for (auto dev_id : devices) { init_flags_[dev_id].reset(new std::once_flag()); } @@ -894,7 +894,7 @@ size_t Used(const platform::CustomPlace &place) { #endif } -struct AllocVisitor : public boost::static_visitor { +struct AllocVisitor : std::unary_function { inline explicit AllocVisitor(size_t size) : size_(size) {} template @@ -906,7 +906,7 @@ struct AllocVisitor : public boost::static_visitor { size_t size_; }; -struct FreeVisitor : public boost::static_visitor { +struct FreeVisitor : public std::unary_function { inline explicit FreeVisitor(void *ptr, size_t size) : ptr_(ptr), size_(size) {} @@ -920,7 +920,7 @@ struct FreeVisitor : public boost::static_visitor { size_t size_; }; -struct ReleaseVisitor : public boost::static_visitor { +struct ReleaseVisitor : std::unary_function { template inline uint64_t operator()(const Place &place) const { return Release(place); diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc similarity index 99% rename from paddle/fluid/memory/detail/system_allocator.cc rename to paddle/fluid/memory/allocation/system_allocator.cc index eb5c74e56d61f..fcfece978cb7f 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #define GLOG_NO_ABBREVIATED_SEVERITIES -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include "paddle/fluid/memory/stats.h" diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h similarity index 100% rename from paddle/fluid/memory/detail/system_allocator.h rename to paddle/fluid/memory/allocation/system_allocator.h diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc similarity index 97% rename from paddle/fluid/memory/detail/system_allocator_test.cc rename to paddle/fluid/memory/allocation/system_allocator_test.cc index dbf3fad6c3373..4749ff3f8adb7 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/allocation/system_allocator_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 3b71ec866b663..a2c9e813f7ac6 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -18,8 +18,8 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/memory/detail/buddy_allocator.h" -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/buddy_allocator.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt deleted file mode 100644 index afe5c0dba0f3b..0000000000000 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ /dev/null @@ -1,79 +0,0 @@ -include(ExternalProject) - -cc_library( - memory_block - SRCS memory_block.cc memory_block_desc.cc meta_cache.cc - DEPS place) - -if(WITH_GPU) - nv_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info gpu_info place) -elseif(WITH_ROCM) - hip_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info gpu_info place) -elseif(${WITH_ASCEND_CL}) - cc_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info npu_info place) -elseif(WITH_MLU) - cc_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info mlu_info place) -else() - cc_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info place) -endif() - -cc_test( - system_allocator_test - SRCS system_allocator_test.cc - DEPS system_allocator) - -cc_library( - buddy_allocator - SRCS buddy_allocator.cc - DEPS memory_block system_allocator glog) - -cc_test( - buddy_allocator_test - SRCS buddy_allocator_test.cc - DEPS buddy_allocator) - -function(file_download_and_uncompress URL NAME) - message(STATUS "Download dependence[${NAME}] from ${URL}") - set(${NAME}_INCLUDE_DIR - ${THIRD_PARTY_PATH}/${NAME} - PARENT_SCOPE) - ExternalProject_Add( - extern_download_${NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${THIRD_PARTY_PATH}/${NAME} - URL ${URL} - DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME} - SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME} - DOWNLOAD_NO_PROGRESS 1 
- CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "") - set(third_party_deps - ${third_party_deps} extern_download_${NAME} - PARENT_SCOPE) -endfunction() - -if(WITH_TESTING) - if(TEST buddy_allocator_test) - set_tests_properties(buddy_allocator_test PROPERTIES LABELS - "RUN_TYPE=EXCLUSIVE") - endif() - set(URL "https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar") - file_download_and_uncompress(URL "buddy_allocator") -endif() diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index f09cbfc3bef16..05f46dd396023 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -1442,6 +1442,28 @@ void Copy(phi::Place dst_place, return Copy(place_dst, dst, place_src, src, num); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT + dst_place.GetType() == phi::AllocationType::CUSTOM) { + platform::CustomPlace place_dst(dst_place.GetDeviceType(), + dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num, nullptr); + } else if (src_place.GetType() == phi::AllocationType::CUSTOM && + dst_place.GetType() == phi::AllocationType::CPU) { + platform::CustomPlace place_src(src_place.GetDeviceType(), + src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, nullptr); + } else if (src_place.GetType() == phi::AllocationType::CUSTOM && + dst_place.GetType() == phi::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place.GetDeviceType(), + src_place.GetDeviceId()); + platform::CustomPlace place_dst(dst_place.GetDeviceType(), + dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, nullptr); + } +#endif } // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index 5b982f62c86de..259222754e8f8 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include -#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/allocation/memory_block.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc index 12312a28f6c2a..0289859dff30e 100644 --- a/paddle/fluid/memory/stats.cc +++ b/paddle/fluid/memory/stats.cc @@ -15,7 +15,7 @@ limitations under the License. 
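The memcpy.cc hunk above adds CPU to CustomDevice branches that rebuild typed places from the generic phi::Place pair and forward to the typed Copy overload, passing nullptr to use the default stream. A minimal sketch of one direction, assuming a build with PADDLE_WITH_CUSTOM_DEVICE; CopyCpuToCustom is an illustrative wrapper, not part of the PR:

    #include <cstddef>

    #include "paddle/fluid/memory/memcpy.h"
    #include "paddle/fluid/platform/place.h"

    #ifdef PADDLE_WITH_CUSTOM_DEVICE
    void CopyCpuToCustom(const phi::Place &dst_place, void *dst,
                         const void *src, std::size_t num) {
      // Rebuild the typed destination place from the generic place.
      paddle::platform::CustomPlace place_dst(dst_place.GetDeviceType(),
                                              dst_place.GetDeviceId());
      paddle::platform::CPUPlace place_src;
      // nullptr stream: synchronous copy on the default stream.
      paddle::memory::Copy(place_dst, dst, place_src, src, num, nullptr);
    }
    #endif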
*/ #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/memory/allocation/spin_lock.h" -#include "paddle/fluid/platform/variant.h" +#include "paddle/phi/core/macros.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 17aabc25b3fa4..893f7d51140a7 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -101,7 +101,7 @@ else() cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel) +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) @@ -149,6 +149,10 @@ if (WITH_ASCEND_CL) op_library(sync_batch_norm_op) endif() +if (WITH_MLU) + op_library(sync_batch_norm_op) +endif() + op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute) op_library(eye_op DEPS ${OP_HEADER_DEPS}) op_library(recurrent_op DEPS ${OP_HEADER_DEPS}) @@ -168,7 +172,7 @@ sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function) if (WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 8f443f6f165e5..4a7f6cfbf0b31 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1469,20 +1469,16 @@ namespace plat = paddle::platform; ops::ActivationOpGrad, \ ops::ActivationGradOpInplaceInferer); -#define REGISTER_ACTIVATION_CPU_KERNEL( \ - act_type, op_name, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>); +#define REGISTER_ACTIVATION_CPU_KERNEL( \ + act_type, op_name, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL( \ + act_type, \ + ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 4d6fe0d2b3830..d1087965f044e 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -208,11 +208,272 @@ class LogMLUKernel : public framework::OpKernel { } }; +template +class ExpMLUKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION; + + MLUCnnl::Exp(ctx, + prefer, + input_desc.get(), + GetBasePtr(input), + output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class ExpGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlTensorDesc out_desc(*out); + + MLUCnnlOpTensorDesc op_tensor_desc( + CNNL_OP_TENSOR_MUL, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, + op_tensor_desc.get(), + dout_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(out), + dx_desc.get(), + GetBasePtr(dx), + ToCnnlDataType()); + } +}; + +template +class HardSwishMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->mutable_data(ctx.GetPlace()); + float threshold = ctx.Attr("threshold"); + float scale = ctx.Attr("scale"); + float offset = ctx.Attr("offset"); + PADDLE_ENFORCE_EQ(threshold, + 6.0f, + platform::errors::External( + "Not support threshold [%f] in MLU", threshold)); + PADDLE_ENFORCE_EQ( + scale, + 6.0f, + platform::errors::External("Not support scale [%f] in MLU", scale)); + PADDLE_ENFORCE_EQ( + offset, + 3.0f, + platform::errors::External("Not support offset [%f] in MLU", offset)); + + MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH, + 1.0f /*ceof useless*/); + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + + MLUCnnl::Active(ctx, + act_desc.get(), + input_desc.get(), + GetBasePtr(input), + output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class HardSwishGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + float threshold = ctx.Attr("threshold"); + float scale = ctx.Attr("scale"); + float offset = ctx.Attr("offset"); + PADDLE_ENFORCE_EQ(threshold, + 6.0f, + platform::errors::External( + "Not support threshold [%f] in MLU", threshold)); + PADDLE_ENFORCE_EQ( + scale, + 6.0f, + platform::errors::External("Not support scale [%f] in MLU", scale)); + PADDLE_ENFORCE_EQ( + offset, + 3.0f, + platform::errors::External("Not support offset [%f] in MLU", offset)); + auto* out = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH, + 1.0f /*ceof useless*/); + MLUCnnl::ActiveGrad(ctx, + act_desc.get(), + nullptr, + nullptr, + nullptr, + nullptr, + dout_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(out), + dx_desc.get(), + GetBasePtr(dx)); + } +}; + +template +class HardSigmoidMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* 
input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + float slope = ctx.Attr("slope"); + float offset = ctx.Attr("offset"); + output->mutable_data(ctx.GetPlace()); + + MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID, + 1.0f /*ceof useless*/, + 1.0f /*sliced_dim useless*/, + slope, + offset); + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + + MLUCnnl::Active(ctx, + act_desc.get(), + input_desc.get(), + GetBasePtr(input), + output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class HardSigmoidGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + auto* dx = ctx.Output(framework::GradVarName("X")); + float slope = ctx.Attr("slope"); + float offset = ctx.Attr("offset"); + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID, + 1.0f /*ceof useless*/, + 1.0f /*sliced_dim useless*/, + slope, + offset); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnl::ActiveGrad(ctx, + act_desc.get(), + nullptr, + nullptr, + nullptr, + nullptr, + dout_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(out), + dx_desc.get(), + GetBasePtr(dx)); + } +}; + +template +class ReciprocalMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + out->mutable_data(place); + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Reciprocal( + ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), GetBasePtr(out)); + } +}; + +template +class ReciprocalGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto place = ctx.GetPlace(); + dx->mutable_data(place); + Tensor square_out; + square_out.Resize(out->dims()); + square_out.mutable_data(place); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlTensorDesc square_out_desc(square_out); + MLUCnnl::Square(ctx, + out_desc.get(), + GetBasePtr(out), + square_out_desc.get(), + GetBasePtr(&square_out)); + cnnlOpTensorDesc_t op_tensor_op = CNNL_OP_TENSOR_MUL; + cnnlDataType_t op_tensor_comp_type = CNNL_DTYPE_FLOAT; + cnnlNanPropagation_t op_tensor_nan_opt = CNNL_NOT_PROPAGATE_NAN; + MLUCnnlOpTensorDesc op_tensor_desc( + op_tensor_op, op_tensor_comp_type, op_tensor_nan_opt); + float alpha1_float = -1; + float alpha2_float = 1; + float beta_float = 0; + MLUCnnl::OpTensor(ctx, + op_tensor_desc.get(), + dout_desc.get(), + GetBasePtr(dout), + square_out_desc.get(), + GetBasePtr(&square_out), + dx_desc.get(), + GetBasePtr(dx), + op_tensor_comp_type, + alpha1_float, + alpha2_float, + beta_float); + } +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; +// reciprocal +REGISTER_OP_MLU_KERNEL( + reciprocal, + ops::ReciprocalMLUKernel, + ops::ReciprocalMLUKernel); + +REGISTER_OP_MLU_KERNEL( + reciprocal_grad, + ops::ReciprocalGradMLUKernel, + ops::ReciprocalGradMLUKernel); // relu REGISTER_OP_MLU_KERNEL( relu, @@ -303,3 +564,28 @@ REGISTER_OP_MLU_KERNEL( log10, 
ops::LogMLUKernel, ops::LogMLUKernel); + +REGISTER_OP_MLU_KERNEL(exp, + ops::ExpMLUKernel, + ops::ExpMLUKernel); + +REGISTER_OP_MLU_KERNEL(exp_grad, + ops::ExpGradMLUKernel, + ops::ExpGradMLUKernel); + +REGISTER_OP_MLU_KERNEL(hard_swish, + ops::HardSwishMLUKernel, + ops::HardSwishMLUKernel); + +REGISTER_OP_MLU_KERNEL(hard_swish_grad, + ops::HardSwishGradMLUKernel, + ops::HardSwishGradMLUKernel); + +REGISTER_OP_MLU_KERNEL(hard_sigmoid, + ops::HardSigmoidMLUKernel, + ops::HardSigmoidMLUKernel); + +REGISTER_OP_MLU_KERNEL( + hard_sigmoid_grad, + ops::HardSigmoidGradMLUKernel, + ops::HardSigmoidGradMLUKernel); diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index 67a1d70ebad44..0e7136b9f6ce8 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -158,20 +158,29 @@ struct XPUReciprocalGradFunctor : public BaseActivationFunctor { }; template -struct XPUReluFunctor : public BaseActivationFunctor { +struct XPUReluGradFunctor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_backward( + ctx, xpu::relu_grad); + } +}; + +template +struct XPURelu6Functor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { xpu_activation_forward( - ctx, xpu::relu); + ctx, xpu::relu6); } }; template -struct XPUReluGradFunctor : public BaseActivationFunctor { +struct XPURelu6GradFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; void operator()(const framework::ExecutionContext &ctx) const { xpu_activation_backward( - ctx, xpu::relu_grad); + ctx, xpu::relu6_grad); } }; @@ -416,6 +425,24 @@ struct XPUPowGradFunctor : public BaseActivationFunctor { } }; +template +struct XPUReluFunctor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + void operator()(const framework::ExecutionContext &ctx) const { + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Out"); + const XPUType *x_data = reinterpret_cast(x->data()); + XPUType *y_data = + reinterpret_cast(y->mutable_data(ctx.GetPlace())); + + auto xpu_context = + ctx.device_context().x_context(); + int r = + xpu::relu(xpu_context, x_data, y_data, x->numel(), nullptr, nullptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); + } +}; + template struct XPUSoftPlusFunctor : public BaseActivationFunctor { void operator()(const framework::ExecutionContext &ctx) const { @@ -539,6 +566,10 @@ REGISTER_OP_XPU_KERNEL( ops::XPUActivationGradKernel>, ops::XPUActivationGradKernel< ops::XPUReluGradFunctor>); +REGISTER_OP_XPU_KERNEL(relu6, + ops::XPUActivationKernel>); +REGISTER_OP_XPU_KERNEL( + relu6_grad, ops::XPUActivationKernel>); REGISTER_OP_XPU_KERNEL( tanh, ops::XPUActivationKernel>, diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index f58a838460ec3..f4e7481bdd456 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -122,12 +122,11 @@ REGISTER_OPERATOR( ops::AddPositionEncodingGradOpMaker); REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad); -REGISTER_OP_CPU_KERNEL( - add_position_encoding, - ops::AddPositionEncodingKernel, - ops::AddPositionEncodingKernel); +REGISTER_OP_CPU_KERNEL(add_position_encoding, + ops::AddPositionEncodingKernel, + 
ops::AddPositionEncodingKernel); REGISTER_OP_CPU_KERNEL( add_position_encoding_grad, - ops::AddPositionEncodingGradKernel, - ops::AddPositionEncodingGradKernel); + ops::AddPositionEncodingGradKernel, + ops::AddPositionEncodingGradKernel); diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index a72fd850d89bc..8c6360bfd89cf 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -342,7 +342,7 @@ DECLARE_INPLACE_OP_INFERER(AffineChannelGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp, diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 9efa8d0f86385..1977a33fc197e 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -28,7 +28,7 @@ namespace operators { using Tensor = framework::Tensor; template -struct Linspace { +struct Linspace { void operator()(T start, T end, int count, @@ -282,14 +282,12 @@ REGISTER_OPERATOR(affine_grid, ops::AffineGridGradMaker); REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad); -REGISTER_OP_CPU_KERNEL( - affine_grid, - ops::AffineGridOpKernel, - ops::AffineGridOpKernel); -REGISTER_OP_CPU_KERNEL( - affine_grid_grad, - ops::AffineGridGradOpKernel, - ops::AffineGridGradOpKernel); +REGISTER_OP_CPU_KERNEL(affine_grid, + ops::AffineGridOpKernel, + ops::AffineGridOpKernel); +REGISTER_OP_CPU_KERNEL(affine_grid_grad, + ops::AffineGridGradOpKernel, + ops::AffineGridGradOpKernel); REGISTER_OP_VERSION(affine_grid) .AddCheckpoint( diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 16c712b4a2751..aa3cd5d4149c4 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -84,7 +84,7 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor, diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc index c27f0f159f51e..fc96dd52e54a2 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -65,7 +65,7 @@ class AllocFloatStatusKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( alloc_float_status, diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 20f986596063e..8fc582c19845c 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -95,7 +95,7 @@ template class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const auto xs = ctx.MultiInput("X"); const auto* scale = ctx.Input("Scale"); auto outs = ctx.MultiOutput("Out"); @@ -106,11 +106,10 @@ class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { 
*found_inf_data = false; framework::Tensor is_finite = - ctx.AllocateTmpTensor({1}, dev_ctx); + ctx.AllocateTmpTensor({1}, dev_ctx); bool* is_finite_data = is_finite.template data(); - auto& dev = *ctx.template device_context() - .eigen_device(); + auto& dev = *ctx.template device_context().eigen_device(); T inverse_scale = Inverse(*scale_data); for (size_t i = 0; i < xs.size(); ++i) { diff --git a/paddle/fluid/operators/amp/clear_float_status_op.cc b/paddle/fluid/operators/amp/clear_float_status_op.cc index beef807620592..7bfc2d34d296e 100644 --- a/paddle/fluid/operators/amp/clear_float_status_op.cc +++ b/paddle/fluid/operators/amp/clear_float_status_op.cc @@ -68,7 +68,7 @@ class ClearFloatStatusKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( clear_float_status, diff --git a/paddle/fluid/operators/amp/get_float_status_op.cc b/paddle/fluid/operators/amp/get_float_status_op.cc index add662d8258eb..88a2affbcaaba 100644 --- a/paddle/fluid/operators/amp/get_float_status_op.cc +++ b/paddle/fluid/operators/amp/get_float_status_op.cc @@ -67,7 +67,7 @@ class GetFloatStatusKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( get_float_status, diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index 346e981b3a99b..3bae775d30817 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -169,9 +169,9 @@ decr_every_n_nan_or_inf steps and each step some gradients are infinite. 
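The AMP hunks above, and the UpdateLossScalingFunctor and LazyZeros specializations that follow, all apply the same migration: functors and context lookups that used to target paddle::platform::CPUDeviceContext now target phi::CPUContext. A compilable sketch of that re-specialization pattern, where FillFunctor is an illustrative name rather than a Paddle symbol:

    #include <cstddef>
    #include <vector>

    #include "paddle/phi/backends/cpu/cpu_context.h"

    // Primary template is only declared; each backend defines its own
    // specialization, and the CPU one now keys on phi::CPUContext.
    template <typename DeviceContext, typename T>
    struct FillFunctor;

    template <typename T>
    struct FillFunctor<phi::CPUContext, T> {
      void operator()(const phi::CPUContext & /*ctx*/,
                      T value,
                      std::size_t count,
                      std::vector<T> *out) const {
        out->assign(count, value);
      }
    };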
}; template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: - void operator()(const platform::CPUDeviceContext& ctx, + void operator()(const phi::CPUContext& ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, @@ -203,9 +203,9 @@ class UpdateLossScalingFunctor { }; template -class LazyZeros { +class LazyZeros { public: - void operator()(const platform::CPUDeviceContext& dev_ctx, + void operator()(const phi::CPUContext& dev_ctx, const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { @@ -225,7 +225,7 @@ class LazyZeros { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( update_loss_scaling, diff --git a/paddle/fluid/operators/angle_op.cc b/paddle/fluid/operators/angle_op.cc index ae483f39e7f94..f925c7fa74759 100644 --- a/paddle/fluid/operators/angle_op.cc +++ b/paddle/fluid/operators/angle_op.cc @@ -116,20 +116,16 @@ REGISTER_OPERATOR(angle, REGISTER_OP_CPU_KERNEL( angle, - ops::AngleKernel, - ops::AngleKernel, - ops::AngleKernel>, - ops::AngleKernel>); + ops::AngleKernel, + ops::AngleKernel, + ops::AngleKernel>, + ops::AngleKernel>); REGISTER_OPERATOR(angle_grad, ops::AngleGradOp); REGISTER_OP_CPU_KERNEL( angle_grad, - ops::AngleGradKernel, - ops::AngleGradKernel, - ops::AngleGradKernel>, - ops::AngleGradKernel>); + ops::AngleGradKernel, + ops::AngleGradKernel, + ops::AngleGradKernel>, + ops::AngleGradKernel>); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index ef8ab38d2f35e..a2af64e227680 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -43,7 +43,7 @@ struct ArrayToLoDFunctorImpl { void apply(); }; -struct ArrayToLoDFunctor : public boost::static_visitor { +struct ArrayToLoDFunctor : public std::unary_function { std::vector in; mutable framework::Tensor *out; @@ -51,7 +51,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor { void operator()(Place place) const { auto &pool = platform::DeviceContextPool::Instance(); if (std::is_same::value) { - Apply(static_cast(pool.Get(place))); + Apply(static_cast(pool.Get(place))); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) Apply(static_cast(pool.Get(place))); diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index e90de55bbbd90..e6374eb3a66f7 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -71,7 +71,7 @@ class AssignFunctor { private: void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { - if (lod_tensor.numel() == 0) return; + if (!lod_tensor.IsInitialized()) return; auto &out_tensor = *out; paddle::framework::TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); diff --git a/paddle/fluid/operators/assign_op_test.cc b/paddle/fluid/operators/assign_op_test.cc index fd6a793ec4732..0b6245f17d38d 100644 --- a/paddle/fluid/operators/assign_op_test.cc +++ b/paddle/fluid/operators/assign_op_test.cc @@ -22,7 +22,7 @@ limitations under the License. 
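The assign_op.h hunk above now skips the copy only when the input LoDTensor is uninitialized, so an initialized tensor with zero elements is still assigned and its dims and LoD reach the output. A minimal sketch of that guard, not the operator's actual code:

    #include "paddle/fluid/framework/lod_tensor.h"
    #include "paddle/fluid/framework/tensor_util.h"

    void AssignIfInitialized(const paddle::framework::LoDTensor &src,
                             paddle::framework::LoDTensor *dst) {
      if (!src.IsInitialized()) return;  // no allocation yet: nothing to assign
      paddle::framework::TensorCopy(src, src.place(), dst);
      dst->set_lod(src.lod());
    }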
*/ TEST(AssignOp, AssignLoDTensor) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); + phi::CPUContext ctx(cpu_place); paddle::framework::Variable output; paddle::operators::AssignFunctor assign_functor(&output, ctx); @@ -47,7 +47,7 @@ TEST(AssignOp, AssignLoDTensor) { TEST(AssignOp, AssignLoDTensorArray) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); + phi::CPUContext ctx(cpu_place); paddle::framework::Variable output; paddle::operators::AssignFunctor assign_functor(&output, ctx); @@ -78,7 +78,7 @@ TEST(AssignOp, AssignLoDTensorArray) { TEST(AssignOp, AssignSelectedRows) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); + phi::CPUContext ctx(cpu_place); paddle::framework::Variable output; paddle::operators::AssignFunctor assign_functor(&output, ctx); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index e0d6e38e73fec..60e5912c4418d 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -337,7 +337,7 @@ template class AttentionLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; auto* x = ctx.Input("X"); auto* h0 = ctx.Input("H0"); @@ -416,10 +416,10 @@ class AttentionLSTMKernel : public framework::OpKernel { T* lstm_x_data = lstm_x->mutable_data(ctx.GetPlace()); T* lstm_out_data = lstm_out->mutable_data(ctx.GetPlace()); - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); phi::funcs::FCFunctor fc; fc(dev_ctx, total_T, diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index bb1973f96aaea..856a703fd2b06 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -18,11 +18,10 @@ namespace paddle { namespace operators { template <> -void GetAccumulators( - const framework::ExecutionContext& ctx, - int64_t* num_updates, - int64_t* num_accumulates, - int64_t* old_num_accumulates) { +void GetAccumulators(const framework::ExecutionContext& ctx, + int64_t* num_updates, + int64_t* num_accumulates, + int64_t* old_num_accumulates) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); @@ -33,11 +32,10 @@ void GetAccumulators( } template <> -void SetAccumulators( - const framework::ExecutionContext& ctx, - int64_t num_updates, - int64_t num_accumulates, - int64_t old_num_accumulates) { +void SetAccumulators(const framework::ExecutionContext& ctx, + int64_t num_updates, + int64_t num_accumulates, + int64_t old_num_accumulates) { auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); auto* out_num_accumulates = ctx.Output("out_num_accumulates"); auto* out_num_updates = ctx.Output("out_num_updates"); @@ -217,7 +215,6 @@ REGISTER_OPERATOR( ops::AverageAccumulatesOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - average_accumulates, - ops::AverageAccumulatesKernel, - ops::AverageAccumulatesKernel); 
+REGISTER_OP_CPU_KERNEL(average_accumulates,
+                       ops::AverageAccumulatesKernel,
+                       ops::AverageAccumulatesKernel);
diff --git a/paddle/fluid/operators/batch_fc_op.cc b/paddle/fluid/operators/batch_fc_op.cc
index d8c11c04287c2..38504e3ecdf18 100644
--- a/paddle/fluid/operators/batch_fc_op.cc
+++ b/paddle/fluid/operators/batch_fc_op.cc
@@ -166,7 +166,6 @@ REGISTER_OPERATOR(batch_fc_grad,
                  ops::BatchFCGradOp,
                  ops::BatchFCGradOpNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    batch_fc,
-    ops::BatchFCKernel,
-    ops::BatchFCKernel);
+REGISTER_OP_CPU_KERNEL(batch_fc,
+                       ops::BatchFCKernel,
+                       ops::BatchFCKernel);
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
index f30b4e2379ef7..2800ef3907407 100644
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -141,7 +141,7 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor(
  auto cpu_place = std::unique_ptr(
      new paddle::platform::CPUPlace());
-  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
+  phi::CPUContext cpu_ctx(*cpu_place);
  framework::LoD lod;
  lod.push_back(source_level_lod);
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index 3a2526fd52063..49ad3d166d908 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -143,9 +143,8 @@ REGISTER_OPERATOR(beam_search,
                  ops::BeamSearchOp,
                  ops::BeamSearchOpMaker,
                  ops::BeamSearchInferVarType);
-REGISTER_OP_CPU_KERNEL(
-    beam_search,
-    ops::BeamSearchOpKernel,
-    ops::BeamSearchOpKernel,
-    ops::BeamSearchOpKernel,
-    ops::BeamSearchOpKernel);
+REGISTER_OP_CPU_KERNEL(beam_search,
+                       ops::BeamSearchOpKernel,
+                       ops::BeamSearchOpKernel,
+                       ops::BeamSearchOpKernel,
+                       ops::BeamSearchOpKernel);
diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt
index e05011eaf6b3a..b0a1c488f047c 100644
--- a/paddle/fluid/operators/benchmark/CMakeLists.txt
+++ b/paddle/fluid/operators/benchmark/CMakeLists.txt
@@ -12,3 +12,9 @@ cc_test(
  ${GLOB_OP_LIB}
  ${GLOB_OPERATOR_DEPS}
  eigen_function)
+
+if(WITH_ONNXRUNTIME AND WIN32)
+  # Copy onnxruntime for some C++ tests on Windows, since these tests are
+  # built only in CI, where the Windows generator is assumed to be Ninja.
+  copy_onnx(op_tester)
+endif()
diff --git a/paddle/fluid/operators/bmm_op.cc b/paddle/fluid/operators/bmm_op.cc
index a24d727e82727..8cacc3c4f2277 100644
--- a/paddle/fluid/operators/bmm_op.cc
+++ b/paddle/fluid/operators/bmm_op.cc
@@ -172,11 +172,9 @@ REGISTER_OPERATOR(bmm,
                  ops::BmmOpGradMaker,
                  ops::BmmOpGradMaker);
REGISTER_OPERATOR(bmm_grad, ops::BmmOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    bmm,
-    ops::BmmKernel,
-    ops::BmmKernel);
-REGISTER_OP_CPU_KERNEL(
-    bmm_grad,
-    ops::BmmGradKernel,
-    ops::BmmGradKernel);
+REGISTER_OP_CPU_KERNEL(bmm,
+                       ops::BmmKernel,
+                       ops::BmmKernel);
+REGISTER_OP_CPU_KERNEL(bmm_grad,
+                       ops::BmmGradKernel,
+                       ops::BmmGradKernel);
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
index 7362a2e6530c3..20ea0b187f64e 100644
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -176,7 +176,7 @@ class BprLossGradMaker : public framework::SingleGradOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
-using CPUCtx = paddle::platform::CPUDeviceContext;
+using CPUCtx = phi::CPUContext;
REGISTER_OPERATOR(bpr_loss,
                  ops::BprLossOp,
diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
index 4245c336e5dcc..b3903da7c3f2a 100644
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -141,7 +141,7 @@ class CastOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
+using CPU = phi::CPUContext;
// cast use phi kernel, so no need to REGISTER_OP_CPU_KERNEL here.
REGISTER_OPERATOR(cast,
diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc
index a3a67177e2b47..15cc71565091c 100644
--- a/paddle/fluid/operators/center_loss_op.cc
+++ b/paddle/fluid/operators/center_loss_op.cc
@@ -146,7 +146,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(CenterLossGradNoNeedBufVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
-using CPUCtx = paddle::platform::CPUDeviceContext;
+using CPUCtx = phi::CPUContext;
REGISTER_OPERATOR(center_loss,
                  ops::CenterLossOp,
diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
index 644de7e191faa..b7600cbb4af41 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc
@@ -113,13 +113,11 @@ It accomplishes the execution of the instruction according to the following step
} // namespace paddle::operators
namespace ops = paddle::operators;
-using CPUDeviceContext = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
    cinn_instruction_run,
    ops::CinnInstructionRunOp,
    ops::CinnInstructionRunOpMaker,
    paddle::framework::EmptyGradOpMaker,
    paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    cinn_instruction_run,
-    ops::CinnInstructionRunOpKernel);
+REGISTER_OP_CPU_KERNEL(cinn_instruction_run,
+                       ops::CinnInstructionRunOpKernel);
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc
index 4e0ed2cfb199c..cd0a31dc0cddd 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc
@@ -189,6 +189,5 @@ REGISTER_OPERATOR(
    paddle::framework::EmptyGradOpMaker,
    paddle::framework::EmptyGradOpMaker);
/* see [Why use single type kernel] */
-REGISTER_OP_CPU_KERNEL(
-    cinn_launch,
-    ops::CinnLaunchOpKernel);
+REGISTER_OP_CPU_KERNEL(cinn_launch,
+ ops::CinnLaunchOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 674f55efb5feb..5b965573deefa 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -89,31 +89,37 @@ class TestCinnLaunchOp : public ::testing::Test { void TearDown() override { CinnCompiler::GetInstance()->Clear(); } }; -TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) { - // CPU +TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByPE) { RunAndCheck(platform::CPUPlace()); // the second run on the same place is to check the cache logic RunAndCheck(platform::CPUPlace()); +} + #ifdef PADDLE_WITH_CUDA - // GPU +TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByPE) { RunAndCheck(platform::CUDAPlace()); RunAndCheck(platform::CUDAPlace()); -#endif } +#endif -TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) { +TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByCinnProgram) { // set FLAGS_enable_pe_launch_cinn=false to switch to use // default scheduler of CINN to execute the compiled program FLAGS_enable_pe_launch_cinn = false; RunAndCheck(platform::CPUPlace()); RunAndCheck(platform::CPUPlace()); +} + #ifdef PADDLE_WITH_CUDA - // GPU +TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByCinnProgram) { + // set FLAGS_enable_pe_launch_cinn=false to switch to use + // default scheduler of CINN to execute the compiled program + FLAGS_enable_pe_launch_cinn = false; RunAndCheck(platform::CUDAPlace()); RunAndCheck(platform::CUDAPlace()); -#endif } +#endif TEST_F(TestCinnLaunchOp, TestRunWithAutoTuneEnabled) { FLAGS_enable_cinn_auto_tune = true; diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index 236f783017dcc..a0642694843e8 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -220,6 +220,12 @@ class NotEqualToPreviousAdjacentIterator { return ret; } + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const { + self_type ret(arr_, offset_ - n); + return ret; + } + template __host__ __device__ __forceinline__ reference operator[](Distance n) const { return *(*this + n); diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc index 9ce6f7bebc837..cfb56a4b2a6b1 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cc +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -19,6 +19,5 @@ REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); -REGISTER_OP_CPU_KERNEL( - clip_by_norm, - ops::ClipByNormKernel); +REGISTER_OP_CPU_KERNEL(clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 5c0f5c39a34e5..561d2696fef85 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -511,11 +511,10 @@ REGISTER_OPERATOR(coalesce_tensor, paddle::operators::CoalesceTensorOpMaker); namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CPU_KERNEL( - coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); +REGISTER_OP_CPU_KERNEL(coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( @@ -550,20 +549,18 @@ 
REGISTER_OP_XPU_KERNEL( #if defined(PADDLE_WITH_ASCEND_CL) REGISTER_OP_NPU_KERNEL( coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #endif #if defined(PADDLE_WITH_MLU) REGISTER_OP_MLU_KERNEL( coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #endif REGISTER_OP_VERSION(coalesce_tensor) diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 6c365292f54fd..b3351dc82b7e7 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -73,10 +73,9 @@ REGISTER_OP_WITHOUT_GRADIENT(allreduce, ops::AllReduceOp, ops::AllReduceOpMaker); -REGISTER_OP_CPU_KERNEL( - allreduce, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel); +REGISTER_OP_CPU_KERNEL(allreduce, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel); diff --git a/paddle/fluid/operators/complex_op.cc b/paddle/fluid/operators/complex_op.cc index ebd0b2334529d..778f5831c0fbb 100644 --- a/paddle/fluid/operators/complex_op.cc +++ b/paddle/fluid/operators/complex_op.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/complex_op.h" - -#include - +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -59,36 +59,6 @@ class ComplexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "complex"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "complex"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "complex"); - - if (ctx->GetInputDim("X") == ctx->GetInputDim("Y")) { - ctx->ShareDim("X", /*->*/ "Out"); - // NOTE(chenfeiyu): lod & broadcasting is intrinsically contradictory - // so tensors with lod are not supported here - } else { - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - int max_dim = std::max(x_dims.size(), y_dims.size()); - - // start align axis - int axis = std::abs(x_dims.size() - y_dims.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array)); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -101,25 +71,6 @@ class ComplexGradOp : public framework::OperatorWithKernel { public: using 
framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "complex_grad"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron_complex_gradgrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "complex_grad"); - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->ShareDim("X", /*->*/ x_grad_name); - } - - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(y_grad_name)) { - ctx->ShareDim("Y", /*->*/ y_grad_name); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -135,20 +86,21 @@ class ComplexGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(complex, + ComplexInferShapeFunctor, + PD_INFER_META(phi::ComplexInferMeta)); + REGISTER_OPERATOR(complex, ops::ComplexOp, ops::ComplexOpMaker, ops::ComplexGradOpMaker, - ops::ComplexGradOpMaker); - -REGISTER_OPERATOR(complex_grad, ops::ComplexGradOp); + ops::ComplexGradOpMaker, + ComplexInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - complex, - ops::ComplexKernel, - ops::ComplexKernel); +DECLARE_INFER_SHAPE_FUNCTOR(complex_grad, + ComplexGradInferShapeFunctor, + PD_INFER_META(phi::ComplexGradInferMeta)); -REGISTER_OP_CPU_KERNEL( - complex_grad, - ops::ComplexGradKernel, - ops::ComplexGradKernel); +REGISTER_OPERATOR(complex_grad, + ops::ComplexGradOp, + ComplexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/complex_op.cu b/paddle/fluid/operators/complex_op.cu deleted file mode 100644 index c9bc2d459e73b..0000000000000 --- a/paddle/fluid/operators/complex_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/complex_op.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - complex, - ops::ComplexKernel, - ops::ComplexKernel); - -REGISTER_OP_CUDA_KERNEL( - complex_grad, - ops::ComplexGradKernel, - ops::ComplexGradKernel); diff --git a/paddle/fluid/operators/complex_op.h b/paddle/fluid/operators/complex_op.h deleted file mode 100644 index 5fb19b46ec6a0..0000000000000 --- a/paddle/fluid/operators/complex_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -// functors to use with ElementwiseComputeEx -template -struct RealAndImagToComplexFunctor { - inline HOSTDEVICE platform::complex operator()(const T x, const T y) { - return platform::complex(x, y); - } -}; - -template -struct ImagAndRealToComplexFunctor { - inline HOSTDEVICE platform::complex operator()(const T y, const T x) { - return platform::complex(x, y); - } -}; - -template -struct ComplexGradForRealFunctor { - inline HOSTDEVICE T operator()(const T x, - const T y, - const platform::complex out, - const platform::complex dout) { - return dout.real; - } -}; - -template -struct ComplexGradForImagFunctor { - inline HOSTDEVICE T operator()(const T x, - const T y, - const platform::complex out, - const platform::complex dout) { - return dout.imag; - } -}; - -template -class ComplexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - using C = platform::complex; - z->mutable_data(ctx.GetPlace()); - -// NOTE(chenfeiyu): be careful of the caveats of calling elementwise-related -// facility functions -#if defined(__NVCC__) || defined(__HIPCC__) - ElementwiseComputeEx, DeviceContext, T, C>( - ctx, x, y, /*axis*/ -1, RealAndImagToComplexFunctor(), z); -#else - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T, C>( - ctx, x, y, /*axis*/ -1, RealAndImagToComplexFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T, C>( - ctx, x, y, /*axis*/ -1, ImagAndRealToComplexFunctor(), z); - } -#endif - } -}; - -template -class ComplexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - using C = platform::complex; - - // skip out in a hacky way - auto* out = dout; - ElemwiseGradCompute, - ComplexGradForImagFunctor, - C>(ctx, - *x, - *y, - *out, - *dout, - /*axis*/ -1, - dx, - dy, - ComplexGradForRealFunctor(), - ComplexGradForImagFunctor()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/complex_view_op.cc b/paddle/fluid/operators/complex_view_op.cc index 344b2a1c48ad4..ce46a0f0121e6 100644 --- a/paddle/fluid/operators/complex_view_op.cc +++ b/paddle/fluid/operators/complex_view_op.cc @@ -20,7 +20,9 @@ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -94,17 +96,6 @@ class AsRealOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", 
"as_real"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_real"); - - auto out_dims_v = phi::vectorize(ctx->GetInputDim("X")); - out_dims_v.push_back(2); - const framework::DDim out_dims = phi::make_ddim(out_dims_v); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -148,6 +139,9 @@ class AsRealGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(as_real, + AsRealInferShapeFunctor, + PD_INFER_META(phi::AsRealInferMeta)); REGISTER_OPERATOR(as_complex, ops::AsComplexOp, @@ -158,15 +152,10 @@ REGISTER_OPERATOR(as_complex, REGISTER_OPERATOR(as_real, ops::AsRealOp, ops::AsRealOpMaker, + AsRealInferShapeFunctor, ops::AsRealGradMaker, ops::AsRealGradMaker); -REGISTER_OP_CPU_KERNEL( - as_complex, - ops::AsComplexKernel, - ops::AsComplexKernel); - -REGISTER_OP_CPU_KERNEL( - as_real, - ops::AsRealKernel, - ops::AsRealKernel); +REGISTER_OP_CPU_KERNEL(as_complex, + ops::AsComplexKernel, + ops::AsComplexKernel); diff --git a/paddle/fluid/operators/complex_view_op.cu b/paddle/fluid/operators/complex_view_op.cu index 18d448fb75d3c..eb10781491346 100644 --- a/paddle/fluid/operators/complex_view_op.cu +++ b/paddle/fluid/operators/complex_view_op.cu @@ -22,8 +22,3 @@ REGISTER_OP_CUDA_KERNEL( as_complex, ops::AsComplexKernel, ops::AsComplexKernel); - -REGISTER_OP_CUDA_KERNEL( - as_real, - ops::AsRealKernel, - ops::AsRealKernel); diff --git a/paddle/fluid/operators/complex_view_op.h b/paddle/fluid/operators/complex_view_op.h index 51abaa88f856e..169b8b05a554e 100644 --- a/paddle/fluid/operators/complex_view_op.h +++ b/paddle/fluid/operators/complex_view_op.h @@ -41,20 +41,5 @@ class AsComplexKernel : public framework::OpKernel { } }; -template -class AsRealKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - out->mutable_data(context.GetPlace()); - const framework::DDim out_dims_original = out->dims(); - framework::TensorCopy(*x, context.GetPlace(), out); - out->Resize(out_dims_original); // restored the shape - out->mutable_data(context.GetPlace()); // restore the dtype - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 00806d18c066f..4cef104496510 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -29,7 +29,7 @@ namespace operators { // FeedVariableVisitor is to feed the variable data // according to data type (LoDTensor or Strings). 
-class FeedVariableVisitor : public boost::static_visitor { +class FeedVariableVisitor { public: explicit FeedVariableVisitor(framework::Variable *out_var, const platform::Place &place) diff --git a/paddle/fluid/operators/controlflow/op_variant.cc b/paddle/fluid/operators/controlflow/op_variant.cc index 60f58955adbed..48b7a43410672 100644 --- a/paddle/fluid/operators/controlflow/op_variant.cc +++ b/paddle/fluid/operators/controlflow/op_variant.cc @@ -17,24 +17,21 @@ namespace paddle { namespace operators { -struct InputsVisitor - : public boost::static_visitor { +struct InputsVisitor { template const framework::VariableNameMap *operator()(const OpType *op) const { return &(op->Inputs()); } }; -struct OutputsVisitor - : public boost::static_visitor { +struct OutputsVisitor { template const framework::VariableNameMap *operator()(const OpType *op) const { return &(op->Outputs()); } }; -struct AttributeMapVisitor - : public boost::static_visitor { +struct AttributeMapVisitor { const framework::AttributeMap *operator()(const framework::OpDesc *op) const { return &(op->GetAttrMap()); } @@ -45,7 +42,7 @@ struct AttributeMapVisitor } }; -struct RawPointerVisitor : public boost::static_visitor { +struct RawPointerVisitor { template const void *operator()(const OpType *op) const { return op; diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h index c75294ce9ab7a..04afe548e92e3 100644 --- a/paddle/fluid/operators/controlflow/op_variant.h +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.h b/paddle/fluid/operators/controlflow/recurrent_op_helper.h index 78fabec56f51a..752a0a1f764eb 100644 --- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/op_variant.h" #include "paddle/fluid/operators/recurrent_op.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index 46c3b056bfdf1..8f7db23769a7e 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/op_variant.h" -#include "paddle/fluid/platform/variant.h" namespace phi { class DenseTensor; diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc index f65921dbc1776..638983ea26be9 100644 --- a/paddle/fluid/operators/conv_op_xpu.cc +++ b/paddle/fluid/operators/conv_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { @@ -71,9 +72,26 @@ class GemmConvXPUKernel : public framework::OpKernel { XPUT *output_data = reinterpret_cast(output->data()); auto &dev_ctx = context.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + + XPUT *filter_data_tmp; + const XPUT *filter_data_ptr = filter_data; + if (data_format == "NHWC") { + filter_data_tmp = RAII_GUARD.alloc(filter.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(filter_data_tmp); + std::vector filter_shape = phi::vectorize(filter.dims()); + int r = xpu::transpose(dev_ctx.x_context(), + filter_data, + filter_data_tmp, + filter_shape, + {0, 2, 3, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + filter_data_ptr = reinterpret_cast(filter_data_tmp); + } + int r = xpu::conv2d(dev_ctx.x_context(), input_data, - filter_data, + filter_data_ptr, output_data, batch_size, img_c, @@ -89,11 +107,7 @@ class GemmConvXPUKernel : public framework::OpKernel { nullptr, nullptr, is_nchw); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU conv kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d"); } }; @@ -134,6 +148,7 @@ class GemmConvGradXPUKernel : public framework::OpKernel { framework::DDim filter_data_dims = phi::slice_ddim(filter.dims(), 2, filter.dims().size()); std::vector ksize = phi::vectorize(filter_data_dims); + std::vector filter_shape = phi::vectorize(filter.dims()); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); @@ -165,12 +180,35 @@ class GemmConvGradXPUKernel : public framework::OpKernel { filter_grad_data = reinterpret_cast(filter_grad->data()); } auto &dev_ctx = context.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + + XPUT *filter_data_tmp; + XPUT *filter_grad_data_tmp; + const XPUT *filter_data_ptr = filter_data; + XPUT *filter_grad_data_ptr = filter_grad_data; + if (data_format == "NHWC") { + filter_data_tmp = RAII_GUARD.alloc(filter.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(filter_data_tmp); + int r = xpu::transpose(dev_ctx.x_context(), + filter_data, + filter_data_tmp, + filter_shape, + {0, 2, 3, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + filter_data_ptr = reinterpret_cast(filter_data_tmp); + + if (filter_grad_data != nullptr) { + filter_grad_data_tmp = RAII_GUARD.alloc(filter.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(filter_grad_data_tmp); + filter_grad_data_ptr = filter_grad_data_tmp; + } + } int r = xpu::conv2d_grad(dev_ctx.x_context(), input_data, - filter_data, + filter_data_ptr, output_grad_data, input_grad_data, - filter_grad_data, + filter_grad_data_ptr, batch_size, img_c, img_h, @@ -187,11 +225,18 @@ class GemmConvGradXPUKernel : public framework::OpKernel { nullptr, nullptr, is_nchw); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU conv kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_grad"); + + if ((filter_grad_data_ptr != nullptr) && (data_format == "NHWC")) { + std::vector filter_shape_fhwc = { + filter_shape[0], filter_shape[2], filter_shape[3], filter_shape[1]}; + int r = xpu::transpose(dev_ctx.x_context(), + filter_grad_data_ptr, + filter_grad_data, + filter_shape_fhwc, + {0, 3, 1, 2}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + } } }; } // 
namespace operators diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index e6ecf6675cba9..e3228104de38b 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -249,8 +249,6 @@ REGISTER_OPERATOR(cos_sim, ops::CosSimGradOpMaker, ops::CosSimGradOpMaker); REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad); -REGISTER_OP_CPU_KERNEL( - cos_sim, ops::CosSimKernel); -REGISTER_OP_CPU_KERNEL( - cos_sim_grad, - ops::CosSimGradKernel); +REGISTER_OP_CPU_KERNEL(cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL(cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index 3bead0127b823..ee3ff671ede2d 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -215,7 +215,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, ops::CRFDecodingOpMaker); -REGISTER_OP_CPU_KERNEL( - crf_decoding, - ops::CRFDecodingOpKernel, - ops::CRFDecodingOpKernel); +REGISTER_OP_CPU_KERNEL(crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index d1358ca2f44e8..bdc1f61fbe0eb 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -223,14 +223,12 @@ REGISTER_OPERATOR(crop, ops::CropGradOpMaker, ops::GropNoNeedBufferVarInferer); REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); -REGISTER_OP_CPU_KERNEL( - crop, - ops::CropKernel, - ops::CropKernel); -REGISTER_OP_CPU_KERNEL( - crop_grad, - ops::CropGradKernel, - ops::CropGradKernel); +REGISTER_OP_CPU_KERNEL(crop, + ops::CropKernel, + ops::CropKernel); +REGISTER_OP_CPU_KERNEL(crop_grad, + ops::CropGradKernel, + ops::CropGradKernel); REGISTER_OP_CUDA_KERNEL( crop, diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc index 6c4c6eb25d820..bd50dea15f80e 100644 --- a/paddle/fluid/operators/crop_op_npu.cc +++ b/paddle/fluid/operators/crop_op_npu.cc @@ -70,8 +70,12 @@ class CropNPUKernel : public framework::OpKernel { shape->dims().size(), x->dims().size())); + // shape memory maybe have gc. 
+ Tensor tmp_shape(*shape); + tmp_shape.mutable_data(ctx.GetPlace()); + const auto& runner = - NpuOpRunner("Crop", {*x, *shape}, {*out}, attr_input); + NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 9422de6093441..f72175d4d5338 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -320,18 +320,16 @@ REGISTER_OPERATOR(crop_tensor, ops::CropTensorGradOpMaker, ops::CropTensorGradOpMaker); REGISTER_OPERATOR(crop_tensor_grad, ops::CropTensorOpGrad); -REGISTER_OP_CPU_KERNEL( - crop_tensor, - ops::CropTensorKernel, - ops::CropTensorKernel, - ops::CropTensorKernel, - ops::CropTensorKernel); -REGISTER_OP_CPU_KERNEL( - crop_tensor_grad, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel); +REGISTER_OP_CPU_KERNEL(crop_tensor, + ops::CropTensorKernel, + ops::CropTensorKernel, + ops::CropTensorKernel, + ops::CropTensorKernel); +REGISTER_OP_CPU_KERNEL(crop_tensor_grad, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel); REGISTER_OP_CUDA_KERNEL( crop_tensor, diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 5c0c2794bd652..0d98f5b75e4fb 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -421,7 +421,7 @@ class CrossEntropyGradOpMaker2 : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPUCtx = paddle::platform::CPUDeviceContext; +using CPUCtx = phi::CPUContext; REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase, diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index 6d3da99820517..dbab71e1619ec 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -129,7 +129,6 @@ REGISTER_OPERATOR( ops::CTCAlignOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - ctc_align, - ops::CTCAlignKernel, - ops::CTCAlignKernel); +REGISTER_OP_CPU_KERNEL(ctc_align, + ops::CTCAlignKernel, + ops::CTCAlignKernel); diff --git a/paddle/fluid/operators/cum_op.cc b/paddle/fluid/operators/cum_op.cc index 169f1919a7539..b42f26342ab97 100644 --- a/paddle/fluid/operators/cum_op.cc +++ b/paddle/fluid/operators/cum_op.cc @@ -145,7 +145,7 @@ class LogcumsumexpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, PD_INFER_META(phi::CumInferMeta)); diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index e0997635cb42c..6685e54e43b60 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -287,8 +287,7 @@ The required data format for this layer is one of the following: }; template -class DataNormKernel - : public framework::OpKernel { +class DataNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { // const bool is_test = ctx.Attr("is_test"); @@ -533,8 +532,7 @@ class DataNormGradOp : public framework::OperatorWithKernel { }; template -class 
DataNormGradKernel - : public framework::OpKernel { +class DataNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -788,14 +786,12 @@ REGISTER_OPERATOR(data_norm, ops::DataNormGradMaker); REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp); -REGISTER_OP_CPU_KERNEL( - data_norm, - ops::DataNormKernel, - ops::DataNormKernel); -REGISTER_OP_CPU_KERNEL( - data_norm_grad, - ops::DataNormGradKernel, - ops::DataNormGradKernel); +REGISTER_OP_CPU_KERNEL(data_norm, + ops::DataNormKernel, + ops::DataNormKernel); +REGISTER_OP_CPU_KERNEL(data_norm_grad, + ops::DataNormGradKernel, + ops::DataNormGradKernel); REGISTER_OP_VERSION(data_norm).AddCheckpoint( R"ROC( upgrad data_norm op by adding scale_w to support scale and shift.)ROC", diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index 45ed3642f1066..f83a4c04a8162 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -349,7 +349,7 @@ class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( deformable_psroi_pooling, ops::DeformablePSROIPoolOp, diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index 1c784d9891b44..64807329a4043 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -33,8 +33,8 @@ namespace paddle { namespace operators { template -struct DequantizeFunctor { - void operator()(const platform::CPUDeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, float max_range, @@ -49,8 +49,8 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; class DequantizeMaxAbsOp : public framework::OperatorWithKernel { public: @@ -102,7 +102,7 @@ This calculation is an opposite operation of QuantizeMaxAbsOp: } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( dequantize_abs_max, diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index 28d218ed3a85a..c80c050b14afd 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -32,8 +32,8 @@ namespace paddle { namespace operators { template -struct DequantizeFunctor { - void operator()(const platform::CPUDeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* dict, framework::Tensor* out) { @@ -51,7 +51,7 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; +template struct DequantizeFunctor; class DequantizeLogOp : public framework::OperatorWithKernel { public: @@ -108,7 +108,7 @@ This calculation is an opposite operation of QuantizeLogOp: } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( dequantize_log, diff --git 
a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index 86b18f8920b87..ef824d2d8cdcd 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -200,7 +200,7 @@ class BipartiteMatchKernel : public framework::OpKernel { auto* match_indices = context.Output("ColToRowMatchIndices"); auto* match_dist = context.Output("ColToRowMatchDist"); - auto& dev_ctx = context.device_context(); + auto& dev_ctx = context.device_context(); auto col = dist_mat->dims()[1]; @@ -216,9 +216,9 @@ class BipartiteMatchKernel : public framework::OpKernel { match_indices->mutable_data({n, col}, context.GetPlace()); match_dist->mutable_data({n, col}, context.GetPlace()); - phi::funcs::SetConstant iset; + phi::funcs::SetConstant iset; iset(dev_ctx, match_indices, static_cast(-1)); - phi::funcs::SetConstant tset; + phi::funcs::SetConstant tset; tset(dev_ctx, match_dist, static_cast(0)); int* indices = match_indices->data(); diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc index ade3ed5f4de26..cd17a8c9883df 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cc +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -104,7 +104,6 @@ REGISTER_OPERATOR( ops::BoxClipOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - box_clip, - ops::BoxClipKernel, - ops::BoxClipKernel); +REGISTER_OP_CPU_KERNEL(box_clip, + ops::BoxClipKernel, + ops::BoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h index e27dd30896852..5c816ee3eb5e2 100644 --- a/paddle/fluid/operators/detection/box_clip_op.h +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -29,8 +29,7 @@ class BoxClipKernel : public framework::OpKernel { auto* input_box = context.Input("Input"); auto* im_info = context.Input("ImInfo"); auto* output_box = context.Output("Output"); - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); output_box->mutable_data(context.GetPlace()); if (input_box->lod().size()) { PADDLE_ENFORCE_EQ(input_box->lod().size(), diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 87edd80143a55..64aa86315622f 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -251,7 +251,6 @@ REGISTER_OPERATOR( ops::BoxCoderOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - box_coder, - ops::BoxCoderKernel, - ops::BoxCoderKernel); +REGISTER_OP_CPU_KERNEL(box_coder, + ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc index 05a44dda32a54..d641a6fd41ef7 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc @@ -227,7 +227,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - box_decoder_and_assign, - ops::BoxDecoderAndAssignKernel, - ops::BoxDecoderAndAssignKernel); +REGISTER_OP_CPU_KERNEL(box_decoder_and_assign, + ops::BoxDecoderAndAssignKernel, + ops::BoxDecoderAndAssignKernel); diff --git 
a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index 93e9111f1ac61..5473a57902b87 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -122,7 +122,7 @@ class GenerateMaskLabelsOp : public framework::OperatorWithKernel { * to encode class specific mask targets. */ template -static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, +static inline void ExpandMaskTarget(const phi::CPUContext& ctx, const Tensor& masks, const Tensor& mask_class_labels, const int resolution, @@ -150,7 +150,7 @@ static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, } template -std::vector SampleMaskForOneImage(const platform::CPUDeviceContext& ctx, +std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, const Tensor& im_info, const Tensor& gt_classes, const Tensor& is_crowd, @@ -391,7 +391,7 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { std::vector lod0(1, 0); int64_t num_mask = 0; - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); auto gt_classes_lod = gt_classes->lod().back(); auto is_crowd_lod = is_crowd->lod().back(); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 749e88c0a9975..7376e0993a506 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -168,7 +168,7 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { }; template -void Concat(const platform::CPUDeviceContext& context, +void Concat(const phi::CPUContext& context, const Tensor& in_tensor_a, const Tensor& in_tensor_b, Tensor* out_tensor) { @@ -176,24 +176,23 @@ void Concat(const platform::CPUDeviceContext& context, std::vector inputs; inputs.emplace_back(in_tensor_a); inputs.emplace_back(in_tensor_b); - math::ConcatFunctor concat_functor; + math::ConcatFunctor concat_functor; concat_functor(context, inputs, axis, out_tensor); } template -std::vector> SampleFgBgGt( - const platform::CPUDeviceContext& context, - Tensor* iou, - const Tensor& is_crowd, - const int batch_size_per_im, - const float fg_fraction, - const float fg_thresh, - const float bg_thresh_hi, - const float bg_thresh_lo, - std::minstd_rand engine, - const bool use_random, - const bool is_cascade_rcnn, - const Tensor& rpn_rois) { +std::vector> SampleFgBgGt(const phi::CPUContext& context, + Tensor* iou, + const Tensor& is_crowd, + const int batch_size_per_im, + const float fg_fraction, + const float fg_thresh, + const float bg_thresh_hi, + const float bg_thresh_lo, + std::minstd_rand engine, + const bool use_random, + const bool is_cascade_rcnn, + const Tensor& rpn_rois) { std::vector fg_inds; std::vector bg_inds; std::vector mapped_gt_inds; @@ -286,7 +285,7 @@ std::vector> SampleFgBgGt( } template -void GatherBoxesLabels(const platform::CPUDeviceContext& context, +void GatherBoxesLabels(const phi::CPUContext& context, const Tensor& boxes, const Tensor& max_overlap, const Tensor& gt_boxes, @@ -335,7 +334,7 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( - const platform::CPUDeviceContext& context, + const phi::CPUContext& context, const Tensor& rpn_rois_in, const Tensor& gt_classes, const Tensor& is_crowd, @@ -372,7 +371,7 @@ std::vector 
SampleRoisForOneImage( Tensor roi_filter; // Tensor box_filter; if (keep.numel() == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); set_zero(context, &roi_filter, static_cast(0)); } else { @@ -597,7 +596,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::vector lod0(1, 0); int64_t num_rois = 0; - auto& dev_ctx = context.device_context(); + auto& dev_ctx = context.device_context(); auto rpn_rois_lod = rpn_rois->lod().back(); auto gt_classes_lod = gt_classes->lod().back(); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index ba213f10852e7..29d7347f1ba75 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -98,8 +98,7 @@ class GenerateProposalsKernel : public framework::OpKernel { float min_size = context.Attr("min_size"); float eta = context.Attr("eta"); - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); auto &scores_dim = scores->dims(); int64_t num = scores_dim[0]; @@ -122,7 +121,7 @@ class GenerateProposalsKernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - phi::funcs::Transpose trans; + phi::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); @@ -181,7 +180,7 @@ class GenerateProposalsKernel : public framework::OpKernel { } std::pair ProposalForOneImage( - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, @@ -234,7 +233,7 @@ class GenerateProposalsKernel : public framework::OpKernel { FilterBoxes(ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); Tensor scores_filter; diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 257716b635724..450154bec4e17 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -99,8 +99,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { float eta = context.Attr("eta"); bool pixel_offset = context.Attr("pixel_offset"); - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); auto &scores_dim = scores->dims(); int64_t num = scores_dim[0]; @@ -123,7 +122,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - phi::funcs::Transpose trans; + phi::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); @@ -183,7 +182,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { } std::pair ProposalForOneImage( - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, const Tensor &im_shape_slice, const Tensor &anchors, const Tensor &variances, @@ -240,7 +239,7 @@ class 
GenerateProposalsV2Kernel : public framework::OpKernel { ctx, &proposals, min_size, im_shape_slice, false, &keep, pixel_offset); // Handle the case when there is no keep index left if (keep.numel() == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); Tensor scores_filter; diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc index 504090cfaf7fe..c31c630cd6ccd 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -113,7 +113,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - iou_similarity, - ops::IOUSimilarityKernel, - ops::IOUSimilarityKernel); +REGISTER_OP_CPU_KERNEL(iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 6b0608d386f2e..6fb48229517d3 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -356,7 +356,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { auto* outs = ctx.Output("Out"); auto& score_dims = scores_input->dims(); auto score_size = score_dims.size(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); LoDTensor scores; LoDTensor boxes; diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index e2157b02f92d2..163da3cdd9727 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -403,7 +403,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - mine_hard_examples, - ops::MineHardExamplesKernel, - ops::MineHardExamplesKernel); +REGISTER_OP_CPU_KERNEL(mine_hard_examples, + ops::MineHardExamplesKernel, + ops::MineHardExamplesKernel); diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 0fb02832be066..68b4ab20150bb 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -219,7 +219,7 @@ class MultiClassNMSKernel : public framework::OpKernel { T nms_threshold = static_cast(ctx.Attr("nms_threshold")); T nms_eta = static_cast(ctx.Attr("nms_eta")); T score_threshold = static_cast(ctx.Attr("score_threshold")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); int num_det = 0; @@ -361,7 +361,7 @@ class MultiClassNMSKernel : public framework::OpKernel { auto rois_num = ctx.Input("RoisNum"); auto score_dims = scores->dims(); auto score_size = score_dims.size(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); std::vector>> all_indices; std::vector batch_starts = {0}; diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index a2a9358ca0d85..915b174f174c5 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ 
b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -507,7 +507,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { int64_t box_dim = box_dims[2]; int64_t out_dim = box_dim + 2; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); std::vector>> all_nmsed_out; std::vector batch_starts = {0}; diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 4d7d9fec77dbe..8fbfe2ad8548c 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -112,12 +112,11 @@ void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { } template -std::vector FilterStraddleAnchor( - const platform::CPUDeviceContext& context, - const Tensor* anchor, - const float rpn_straddle_thresh, - T im_height, - T im_width) { +std::vector FilterStraddleAnchor(const phi::CPUContext& context, + const Tensor* anchor, + const float rpn_straddle_thresh, + T im_height, + T im_width) { std::vector inds_inside; int anchor_num = anchor->dims()[0]; auto* anchor_data = anchor->data(); @@ -154,7 +153,7 @@ std::vector FilterStraddleAnchor( } template -Tensor FilterCrowdGt(const platform::CPUDeviceContext& context, +Tensor FilterCrowdGt(const phi::CPUContext& context, Tensor* gt_boxes, Tensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; @@ -300,7 +299,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, } template -std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, +std::vector SampleRpnFgBgGt(const phi::CPUContext& ctx, const Tensor& anchor_by_gt_overlap, const int rpn_batch_size_per_im, const float rpn_positive_overlap, @@ -437,7 +436,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { tgt_bbox->mutable_data({max_num, 4}, place); tgt_lbl->mutable_data({max_num, 1}, place); bbox_inside_weight->mutable_data({max_num, 4}, place); - auto& dev_ctx = context.device_context(); + auto& dev_ctx = context.device_context(); std::random_device rnd; std::minstd_rand engine; @@ -857,11 +856,10 @@ class RetinanetTargetAssignOp : public framework::OperatorWithKernel { }; template -std::vector FilterCrowdGtBoxLabel( - const platform::CPUDeviceContext& context, - Tensor* gt_boxes, - Tensor* gt_labels, - Tensor* is_crowd) { +std::vector FilterCrowdGtBoxLabel(const phi::CPUContext& context, + Tensor* gt_boxes, + Tensor* gt_labels, + Tensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; std::vector not_crowd_inds; auto* is_crowd_data = is_crowd->data(); @@ -893,7 +891,7 @@ std::vector FilterCrowdGtBoxLabel( } template -std::vector GetAllFgBgGt(const platform::CPUDeviceContext& ctx, +std::vector GetAllFgBgGt(const phi::CPUContext& ctx, const Tensor& anchor_by_gt_overlap, const Tensor& ncrowd_gt_labels, const float positive_overlap, @@ -1044,7 +1042,7 @@ class RetinanetTargetAssignKernel : public framework::OpKernel { tgt_lbl->mutable_data({max_num, 1}, place); bbox_inside_weight->mutable_data({max_num, 4}, place); fg_num->mutable_data({batch_num, 1}, place); - auto& dev_ctx = context.device_context(); + auto& dev_ctx = context.device_context(); std::random_device rnd; std::minstd_rand engine; diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc index 9bdc1b645bfe6..bc23c5105db94 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc +++ 
b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc @@ -266,12 +266,10 @@ REGISTER_OPERATOR(sigmoid_focal_loss, ops::SigmoidFocalLossGradOpMaker, ops::SigmoidFocalLossGradOpMaker); REGISTER_OPERATOR(sigmoid_focal_loss_grad, ops::SigmoidFocalLossGradOp); -REGISTER_OP_CPU_KERNEL( - sigmoid_focal_loss, - ops::SigmoidFocalLossKernel, - ops::SigmoidFocalLossKernel); +REGISTER_OP_CPU_KERNEL(sigmoid_focal_loss, + ops::SigmoidFocalLossKernel, + ops::SigmoidFocalLossKernel); REGISTER_OP_CPU_KERNEL( sigmoid_focal_loss_grad, - ops::SigmoidFocalLossGradKernel, - ops::SigmoidFocalLossGradKernel); + ops::SigmoidFocalLossGradKernel, + ops::SigmoidFocalLossGradKernel); diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc index a6c1db5f78d0e..99deee3f72aea 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -149,8 +149,8 @@ for i-th instance and each `id` of NegIndices in this instance: }; template -struct NegTargetAssignFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct NegTargetAssignFunctor { + void operator()(const phi::CPUContext& ctx, const int* neg_indices, const size_t* lod, const int N, @@ -172,10 +172,8 @@ struct NegTargetAssignFunctor { } }; -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; } // namespace operators } // namespace paddle @@ -187,7 +185,6 @@ REGISTER_OPERATOR( ops::TargetAssignOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - target_assign, - ops::TargetAssignKernel, - ops::TargetAssignKernel); +REGISTER_OP_CPU_KERNEL(target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index b3af0853095da..b4724eb3c83a3 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -179,12 +179,10 @@ REGISTER_OPERATOR(slogdeterminant, REGISTER_OPERATOR(slogdeterminant_grad, ops::SlogDeterminantGradOp) // reuse det grad op -REGISTER_OP_CPU_KERNEL( - slogdeterminant, - ops::SlogDeterminantKernel, - ops::SlogDeterminantKernel); - -REGISTER_OP_CPU_KERNEL( - slogdeterminant_grad, - ops::SlogDeterminantGradKernel, - ops::SlogDeterminantGradKernel); +REGISTER_OP_CPU_KERNEL(slogdeterminant, + ops::SlogDeterminantKernel, + ops::SlogDeterminantKernel); + +REGISTER_OP_CPU_KERNEL(slogdeterminant_grad, + ops::SlogDeterminantGradKernel, + ops::SlogDeterminantGradKernel); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc index 5d0cd4bbc3578..9949fefb1b18b 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.cc +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc @@ -66,6 +66,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm, ops::DGCClipByNormOp, ops::DGCClipByNormOpMaker); -REGISTER_OP_CPU_KERNEL( - dgc_clip_by_norm, - ops::DGCClipByNormKernel); +REGISTER_OP_CPU_KERNEL(dgc_clip_by_norm, + ops::DGCClipByNormKernel); diff --git a/paddle/fluid/operators/diag_embed_op.cc b/paddle/fluid/operators/diag_embed_op.cc index 0377e40e0a221..531d6f92d8830 100644 --- a/paddle/fluid/operators/diag_embed_op.cc +++ b/paddle/fluid/operators/diag_embed_op.cc @@ -138,9 +138,8 @@ REGISTER_OPERATOR( ops::DiagEmbedOpMaker, paddle::framework::EmptyGradOpMaker, 
paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - diag_embed, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel); +REGISTER_OP_CPU_KERNEL(diag_embed, + ops::DiagEmbedKernel, + ops::DiagEmbedKernel, + ops::DiagEmbedKernel, + ops::DiagEmbedKernel); diff --git a/paddle/fluid/operators/diag_op.cc b/paddle/fluid/operators/diag_op.cc index 9f12e4af47fdd..8ccc5ff3891b9 100644 --- a/paddle/fluid/operators/diag_op.cc +++ b/paddle/fluid/operators/diag_op.cc @@ -59,9 +59,8 @@ REGISTER_OPERATOR( ops::DiagOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); +REGISTER_OP_CPU_KERNEL(diag, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc deleted file mode 100644 index 61a3409c418ba..0000000000000 --- a/paddle/fluid/operators/diag_v2_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -class DiagV2Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input tensor. Its shape is either 1-D or 2-D."); - AddOutput("Out", "The output tensor. A square matrix or a vector."); - AddAttr("offset", - "The diagonal offset. A positive value represents " - "superdiagonal, 0 represents the main diagonal, and a " - "negative value represents subdiagonal.") - .SetDefault(0); - AddAttr("padding_value", - "Use this value to fill the area outside the specified " - "diagonal band. Only takes effect when the input is a 1-D " - "Tensor. The default value is 0.") - .SetDefault(0.0f); - AddComment(R"DOC( - If ``x`` is a vector (1-D tensor), a 2-D square tensor with the elements of ``x`` as the diagonal is returned. - - If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned. - - The argument ``offset`` controls the diagonal offset: - - If ``offset`` = 0, it is the main diagonal. - - If ``offset`` > 0, it is superdiagonal. - - If ``offset`` < 0, it is subdiagonal. 
-)DOC"); - } -}; - -class DiagV2GradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "DiagV2Grad"); - - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class DiagV2GradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("diag_v2_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, - DiagInferShapeFunctor, - PD_INFER_META(phi::DiagInferMeta)); - -REGISTER_OPERATOR(diag_v2, - ops::DiagV2Op, - ops::DiagV2OpMaker, - ops::DiagV2GradOpMaker, - ops::DiagV2GradOpMaker, - DiagInferShapeFunctor); - -REGISTER_OPERATOR(diag_v2_grad, - ops::DiagV2GradOp, - ops::DiagGradV2NoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/digamma_op.cc b/paddle/fluid/operators/digamma_op.cc deleted file mode 100644 index 5f17c3b3da658..0000000000000 --- a/paddle/fluid/operators/digamma_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/digamma_op.h" - -namespace paddle { -namespace operators { - -class DigammaOp : public framework::OperatorWithKernel { - public: - DigammaOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Digamma"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Digamma"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", "Out"); - } -}; - -class DigammaOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of digamma operator."); - AddOutput("Out", "(Tensor), The output tensor of digamma operator."); - AddComment(R"DOC( -Digamma Operator. - -This operator is used to perform elementwise digamma for input $X$. -$$out = \Psi(x) = \frac{ \Gamma^{'}(x) }{ \Gamma(x) }$$ - -)DOC"); - } -}; - -class DigammaGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@Grad", - "DigammaGrad"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DigammaGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - "X@Grad", - "DigammaGrad"); - - auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); - ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); - ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); - } -}; - -template -class DigammaGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr retv) const override { - retv->SetType("digamma_grad"); - retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - retv->SetInput("X", this->Input("X")); - retv->SetAttrMap(this->Attrs()); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(digamma, - ops::DigammaOp, - ops::DigammaOpMaker, - ops::DigammaGradOpMaker, - ops::DigammaGradOpMaker); -REGISTER_OPERATOR(digamma_grad, ops::DigammaGradOp); diff --git a/paddle/fluid/operators/dirichlet_op.cc b/paddle/fluid/operators/dirichlet_op.cc index 81a3e63192eb5..ccbe3b62b73dd 100644 --- a/paddle/fluid/operators/dirichlet_op.cc +++ b/paddle/fluid/operators/dirichlet_op.cc @@ -42,11 +42,11 @@ struct GammaCPUFunctor { }; template -struct DirichletSampler { +struct DirichletSampler { void operator()(const framework::ExecutionContext& ctx, const Tensor* alpha, Tensor* out) { - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); auto p_gen = framework::DefaultCPUGenerator(); auto generator = p_gen->GetCPUEngine(); @@ -71,8 +71,7 @@ struct DirichletSampler { gamma_samples.data(), standard_uniform, standard_normal); - platform::ForRange for_range(dev_ctx, - alpha->numel()); + platform::ForRange for_range(dev_ctx, alpha->numel()); for_range(gamma_functor); // normalize them into a simplex, along the last axis @@ -81,10 +80,10 @@ struct DirichletSampler { new_shape[new_shape.size() - 1] = 1; 
gamma_sum.mutable_data(new_shape, dev_ctx.GetPlace()); - ReduceKernelFunctor( + ReduceKernelFunctor( &gamma_samples, &gamma_sum, {new_shape.size() - 1}, true, false, ctx) .template apply(); - ElementwiseComputeEx, platform::CPUDeviceContext, T, T>( + ElementwiseComputeEx, phi::CPUContext, T, T>( ctx, &gamma_samples, &gamma_sum, -1, DivFunctor(), out); } }; @@ -125,7 +124,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dirichlet, paddle::operators::DirichletOpMaker); REGISTER_OP_CPU_KERNEL( dirichlet, - paddle::operators::DirichletKernel, - paddle::operators::DirichletKernel); + paddle::operators::DirichletKernel, + paddle::operators::DirichletKernel); diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc deleted file mode 100644 index 49f8fa75aa6ce..0000000000000 --- a/paddle/fluid/operators/dist_op.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class DistOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dist"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Dist"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Dist"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_NE(phi::product(x_dims), - 0, - platform::errors::InvalidArgument( - "The Input(X) has not been initialized properly. The " - "shape of Input(X) = [%s].", - x_dims)); - PADDLE_ENFORCE_NE(phi::product(y_dims), - 0, - platform::errors::InvalidArgument( - "The Input(Y) has not been initialized properly. The " - "shape of Input(Y) = [%s].", - y_dims)); - ctx->SetOutputDim("Out", {1}); - } -}; - -class DistOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input Tensor of Dist Op."); - AddInput("Y", "The Right-hand-side input Tensor of Dist Op."); - AddOutput("Out", - "The output of Dist Op, " - "which is the p-norm of (X - Y)"); - AddAttr("p", "the norm to be computed.").SetDefault(2.0f); - AddComment(R"DOC( -Dist Operator. -Given two tensors X and Y, compute Lp-norm of (X-Y). It is not a norm in a strict sense, -only as a measure of distance. The shapes of X and Y must be broadcastable. Where, Z = X - Y, - -When p = 0, defining $0^0 = 0$, the zero-norm of Z is simply the number of non-zero elements of z. -$$ -||Z||_{0} = \lim_{p \rightarrow 0} \sum_{i=1}^{m} |z_i|^p -$$ - -When p = inf, the inf-norm of Z is the maximum element of Z. -$$ -||Z||_\infty=\max_i |z_i| -$$ - -When p = -inf, the negative-inf-norm of Z is the minimum element of Z. 
-$$ -||Z||_{-\infty}=\min_i |z_i| -$$ - -Otherwise, the p-norm of Z follows the formula, -$$ -||Z||_{p} = (\sum_{i=i}^{m} |z_i|^p)^{1/p} -$$ - )DOC"); - } -}; - -class DistOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - } - if (ctx->HasOutput(framework::GradVarName("Y"))) { - ctx->SetOutputDim(framework::GradVarName("Y"), y_dims); - } - } -}; - -template -class DistGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType(this->ForwardOpType() + "_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput("Out", this->Output("Out")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(dist, - DistInferShapeFunctor, - PD_INFER_META(phi::DistInferMeta)); - -REGISTER_OPERATOR(dist, - ops::DistOp, - ops::DistOpMaker, - ops::DistGradOpMaker, - ops::DistGradOpMaker, - DistInferShapeFunctor); -REGISTER_OPERATOR(dist_grad, ops::DistOpGrad); diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt index 11347f0f94e5c..a2aa80f2875b8 100644 --- a/paddle/fluid/operators/dlnne/CMakeLists.txt +++ b/paddle/fluid/operators/dlnne/CMakeLists.txt @@ -39,7 +39,6 @@ op_library( DEPS ${GLOB_OPERATOR_DEPS} framework_proto - boost device_context op_registry scope) diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc deleted file mode 100644 index 880186b84c3a1..0000000000000 --- a/paddle/fluid/operators/dot_op.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/dot_op.h" - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class DotOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); - } -}; - -class DotOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", "(Tensor) The first input tensor. "); - AddInput("Y", "(Tensor) The second input tensor. "); - AddOutput("Out", "(Tensor) The result tensor."); - AddComment(""); - } -}; - -class DotGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - true, - ctx->HasInput("X"), - platform::errors::PreconditionNotMet("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - true, - ctx->HasInput("Y"), - platform::errors::PreconditionNotMet("Input(Y) should not be null.")); - PADDLE_ENFORCE_EQ(true, - ctx->HasInput(framework::GradVarName("Out")), - platform::errors::PreconditionNotMet( - "Input(Out@GRAD) should not be null.")); - - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(x_grad_name)) { - ctx->ShareDim("X", /*->*/ x_grad_name); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->ShareDim("Y", /*->*/ y_grad_name); - ctx->ShareLoD("Y", /*->*/ y_grad_name); - } - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class DotOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("dot_grad"); - - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(dot, - DotInferShapeFunctor, - PD_INFER_META(phi::DotInferMeta)); - -REGISTER_OPERATOR(dot, - ops::DotOp, - ops::DotOpMaker, - ops::DotOpGradMaker, - ops::DotOpGradMaker, - DotInferShapeFunctor); - -REGISTER_OPERATOR(dot_grad, ops::DotGradOp); - -REGISTER_OP_CPU_KERNEL( - dot, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel>, - ops::DotKernel>); -REGISTER_OP_CPU_KERNEL( - dot_grad, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel>, - ops::DotGradKernel>); diff --git a/paddle/fluid/operators/dot_op.cu b/paddle/fluid/operators/dot_op.cu deleted file mode 100644 index 362a6a80f96fe..0000000000000 --- a/paddle/fluid/operators/dot_op.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 
(c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/dot_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - dot, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel>, - ops::DotKernel>); -REGISTER_OP_CUDA_KERNEL(dot_grad, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel>, - ops::DotGradKernel>); diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h deleted file mode 100644 index 0f4c80c4c9e07..0000000000000 --- a/paddle/fluid/operators/dot_op.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -// only can include the headers in paddle/phi/api dirs -#include "paddle/phi/api/lib/utils/tensor_utils.h" -#include "paddle/phi/kernels/dot_grad_kernel.h" -#include "paddle/phi/kernels/dot_kernel.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// See Note [ Why still keep the original kernel implementation? 
] -template -class DotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - auto& dev_ctx = ctx.device_context(); - out->mutable_data(x->place()); - - // call new kernel - phi::DotKernel< - T, - typename paddle::framework::ConvertToPhiContext::TYPE>( - static_cast::TYPE&>(dev_ctx), - *x, - *y, - out); - } -}; - -template -class DotGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* tensor_x = ctx.Input("X"); - auto* tensor_y = ctx.Input("Y"); - auto* tensor_dout = ctx.Input(framework::GradVarName("Out")); - auto* tensor_dx = ctx.Output(framework::GradVarName("X")); - auto* tensor_dy = ctx.Output(framework::GradVarName("Y")); - - if (tensor_dx) tensor_dx->mutable_data(ctx.GetPlace()); - if (tensor_dy) tensor_dy->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - - // call new kernel - phi::DotGradKernel( - static_cast::TYPE&>(dev_ctx), - *tensor_x, - *tensor_y, - *tensor_dout, - tensor_dx, - tensor_dy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index bdf08646f1d8b..7733d202e5781 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -91,7 +91,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(Dropout, CPUDense) { f::Scope scope; p::CPUPlace place; - p::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); Compare(scope, ctx); } diff --git a/paddle/fluid/operators/eig_op.cc b/paddle/fluid/operators/eig_op.cc index 2a7c738f97913..b53bba9fac0c4 100644 --- a/paddle/fluid/operators/eig_op.cc +++ b/paddle/fluid/operators/eig_op.cc @@ -164,19 +164,15 @@ REGISTER_OPERATOR(eig, REGISTER_OPERATOR(eig_grad, ops::EigGradOp); -REGISTER_OP_CPU_KERNEL( - eig, - ops::EigKernel, - ops::EigKernel, - ops::EigKernel, - ops::EigKernel); +REGISTER_OP_CPU_KERNEL(eig, + ops::EigKernel, + ops::EigKernel, + ops::EigKernel, + ops::EigKernel); REGISTER_OP_CPU_KERNEL( eig_grad, - ops::EigGradKernel, - ops::EigGradKernel, - ops:: - EigGradKernel, - ops::EigGradKernel); + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index b677acbe96663..82c7fe6881969 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -19,7 +19,6 @@ #include #include -#include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" @@ -30,6 +29,7 @@ #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" #include "paddle/phi/kernels/matmul_kernel.h" @@ -70,7 +70,7 @@ void TransposeTwoAxis(const Tensor& input, permute[axis2] = axis1; transposed_input->mutable_data(input.dims(), context.GetPlace()); - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = context.template device_context(); TransCompute( input.dims().size(), dev_ctx, input, transposed_input, permute); @@ 
-366,7 +366,7 @@ void ComputeBackwardForComplexInput( int k = rhs.dims()[rhs.dims().size() - 1]; auto* matrix_data = Vh.data(); auto* rhs_data = rhs.data(); - math::SolveLinearSystem( + phi::funcs::SolveLinearSystem( matrix_data, rhs_data, x_grad_data, m, k, batch_count); } diff --git a/paddle/fluid/operators/eigvals_op.cc b/paddle/fluid/operators/eigvals_op.cc index a01316787f247..78bd2b37f6959 100644 --- a/paddle/fluid/operators/eigvals_op.cc +++ b/paddle/fluid/operators/eigvals_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigvals_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -36,60 +37,17 @@ class EigvalsOpMaker : public framework::OpProtoAndCheckerMaker { class EigvalsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigvals"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Eigvals"); - - DDim x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "The dimensions of Input(X) for Eigvals operator " - "should be at least 2, " - "but received X's dimension = %d, X's shape = [%s].", - x_dims.size(), - x_dims)); - - if (ctx->IsRuntime() || !phi::contain_unknown_dim(x_dims)) { - int last_dim = x_dims.size() - 1; - PADDLE_ENFORCE_EQ(x_dims[last_dim], - x_dims[last_dim - 1], - platform::errors::InvalidArgument( - "The last two dimensions of Input(X) for Eigvals " - "operator should be equal, " - "but received X's shape = [%s].", - x_dims)); - } - - auto output_dims = vectorize(x_dims); - output_dims.resize(x_dims.size() - 1); - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); - } }; -class EigvalsOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const { - auto input_dtype = ctx->GetInputDataType("X"); - auto output_dtype = framework::IsComplexType(input_dtype) - ? input_dtype - : framework::ToComplexType(input_dtype); - ctx->SetOutputDataType("Out", output_dtype); - } -}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(eigvals, + EigvalsInferShapeFunctor, + PD_INFER_META(phi::EigvalsInferMeta)); REGISTER_OPERATOR(eigvals, ops::EigvalsOp, ops::EigvalsOpMaker, - ops::EigvalsOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(eigvals, - ops::EigvalsKernel, - ops::EigvalsKernel, - ops::EigvalsKernel>, - ops::EigvalsKernel>); + EigvalsInferShapeFunctor); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h deleted file mode 100644 index 38560bf7c35bd..0000000000000 --- a/paddle/fluid/operators/eigvals_op.h +++ /dev/null @@ -1,273 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -struct PaddleComplex; - -template -struct PaddleComplex< - T, - typename std::enable_if::value>::type> { - using type = paddle::platform::complex; -}; -template -struct PaddleComplex< - T, - typename std::enable_if< - std::is_same>::value || - std::is_same>::value>::type> { - using type = T; -}; - -template -using PaddleCType = typename PaddleComplex::type; -template -using Real = typename phi::dtype::Real; - -static void SpiltBatchSquareMatrix(const Tensor& input, - std::vector* output) { - DDim input_dims = input.dims(); - int last_dim = input_dims.size() - 1; - int n_dim = input_dims[last_dim]; - - DDim flattened_input_dims, flattened_output_dims; - if (input_dims.size() > 2) { - flattened_input_dims = - phi::flatten_to_3d(input_dims, last_dim - 1, last_dim); - } else { - flattened_input_dims = phi::make_ddim({1, n_dim, n_dim}); - } - - Tensor flattened_input; - flattened_input.ShareDataWith(input); - flattened_input.Resize(flattened_input_dims); - (*output) = flattened_input.Split(1, 0); -} - -static void CheckLapackEigResult(const int info, const std::string& name) { - PADDLE_ENFORCE_LE(info, - 0, - platform::errors::PreconditionNotMet( - "The QR algorithm failed to compute all the " - "eigenvalues in function %s.", - name.c_str())); - PADDLE_ENFORCE_GE( - info, - 0, - platform::errors::InvalidArgument( - "The %d-th argument has an illegal value in function %s.", - -info, - name.c_str())); -} - -template -static typename std::enable_if::value>::type -LapackEigvals(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output, - Tensor* work, - Tensor* rwork /*unused*/) { - Tensor a; // will be overwritten when lapackEig exit - framework::TensorCopy(input, input.place(), &a); - - Tensor w; - int64_t n_dim = input.dims()[1]; - auto* w_data = - w.mutable_data(phi::make_ddim({n_dim << 1}), ctx.GetPlace()); - - int64_t work_mem = work->memory_size(); - int64_t required_work_mem = 3 * n_dim * sizeof(T); - PADDLE_ENFORCE_GE( - work_mem, - 3 * n_dim * sizeof(T), - platform::errors::InvalidArgument( - "The memory size of the work tensor in LapackEigvals function " - "should be at least %" PRId64 " bytes, " - "but received work\'s memory size = %" PRId64 " bytes.", - required_work_mem, - work_mem)); - - int info = 0; - phi::funcs::lapackEig('N', - 'N', - static_cast(n_dim), - a.template data(), - static_cast(n_dim), - w_data, - NULL, - 1, - NULL, - 1, - work->template data(), - static_cast(work_mem / sizeof(T)), - static_cast(NULL), - &info); - - std::string name = "framework::platform::dynload::dgeev_"; - if (framework::TransToProtoVarType(input.dtype()) 
== - framework::proto::VarType::FP64) { - name = "framework::platform::dynload::sgeev_"; - } - CheckLapackEigResult(info, name); - - platform::ForRange for_range( - ctx.template device_context(), n_dim); - phi::funcs::RealImagToComplexFunctor> functor( - w_data, w_data + n_dim, output->template data>(), n_dim); - for_range(functor); -} - -template -typename std::enable_if>::value || - std::is_same>::value>::type -LapackEigvals(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output, - Tensor* work, - Tensor* rwork) { - Tensor a; // will be overwritten when lapackEig exit - framework::TensorCopy(input, input.place(), &a); - - int64_t work_mem = work->memory_size(); - int64_t n_dim = input.dims()[1]; - int64_t required_work_mem = 3 * n_dim * sizeof(T); - PADDLE_ENFORCE_GE( - work_mem, - 3 * n_dim * sizeof(T), - platform::errors::InvalidArgument( - "The memory size of the work tensor in LapackEigvals function " - "should be at least %" PRId64 " bytes, " - "but received work\'s memory size = %" PRId64 " bytes.", - required_work_mem, - work_mem)); - - int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::dtype::Real); - PADDLE_ENFORCE_GE( - rwork_mem, - required_rwork_mem, - platform::errors::InvalidArgument( - "The memory size of the rwork tensor in LapackEigvals function " - "should be at least %" PRId64 " bytes, " - "but received rwork\'s memory size = %" PRId64 " bytes.", - required_rwork_mem, - rwork_mem)); - - int info = 0; - phi::funcs::lapackEig>( - 'N', - 'N', - static_cast(n_dim), - a.template data(), - static_cast(n_dim), - output->template data(), - NULL, - 1, - NULL, - 1, - work->template data(), - static_cast(work_mem / sizeof(T)), - rwork->template data>(), - &info); - - std::string name = "framework::platform::dynload::cgeev_"; - if (framework::TransToProtoVarType(input.dtype()) == - framework::proto::VarType::COMPLEX64) { - name = "framework::platform::dynload::zgeev_"; - } - CheckLapackEigResult(info, name); -} - -template -class EigvalsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("X"); - Tensor* output = ctx.Output("Out"); - output->mutable_data>(ctx.GetPlace()); - - std::vector input_matrices; - SpiltBatchSquareMatrix(*input, /*->*/ &input_matrices); - - int64_t n_dim = input_matrices[0].dims()[1]; - int64_t n_batch = input_matrices.size(); - DDim output_dims = output->dims(); - output->Resize(phi::make_ddim({n_batch, n_dim})); - std::vector output_vectors = output->Split(1, 0); - - // query workspace size - T qwork; - int info; - phi::funcs::lapackEig>( - 'N', - 'N', - static_cast(n_dim), - input_matrices[0].template data(), - static_cast(n_dim), - NULL, - NULL, - 1, - NULL, - 1, - &qwork, - -1, - static_cast*>(NULL), - &info); - int64_t lwork = static_cast(qwork); - - Tensor work, rwork; - try { - work.mutable_data(phi::make_ddim({lwork}), ctx.GetPlace()); - } catch (memory::allocation::BadAlloc&) { - LOG(WARNING) << "Failed to allocate Lapack workspace with the optimal " - << "memory size = " << lwork * sizeof(T) << " bytes, " - << "try reallocating a smaller workspace with the minimum " - << "required size = " << 3 * n_dim * sizeof(T) << " bytes, " - << "this may lead to bad performance."; - lwork = 3 * n_dim; - work.mutable_data(phi::make_ddim({lwork}), ctx.GetPlace()); - } - if (framework::IsComplexType( - framework::TransToProtoVarType(input->dtype()))) { - 
rwork.mutable_data>(phi::make_ddim({n_dim << 1}), - ctx.GetPlace()); - } - - for (int64_t i = 0; i < n_batch; ++i) { - LapackEigvals( - ctx, input_matrices[i], &output_vectors[i], &work, &rwork); - } - output->Resize(output_dims); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eigvalsh_op.cc b/paddle/fluid/operators/eigvalsh_op.cc index 3684b926a1ac5..f7abdbee84f1d 100644 --- a/paddle/fluid/operators/eigvalsh_op.cc +++ b/paddle/fluid/operators/eigvalsh_op.cc @@ -151,24 +151,23 @@ REGISTER_OPERATOR(eigvalsh, ops::EigvalshGradOpMaker); REGISTER_OPERATOR(eigvalsh_grad, ops::EigvalshGradOp); -REGISTER_OP_CPU_KERNEL( - eigvalsh, - ops::EigvalshKernel, - ops::EigvalshKernel, - ops::EigvalshKernel>, - ops::EigvalshKernel>); +REGISTER_OP_CPU_KERNEL(eigvalsh, + ops::EigvalshKernel, + ops::EigvalshKernel, + ops::EigvalshKernel>, + ops::EigvalshKernel>); REGISTER_OP_CPU_KERNEL( eigvalsh_grad, - ops::EigvalshGradKernel, - ops::EigvalshGradKernel, - ops::EigvalshGradKernel, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel>, - ops::EigvalshGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index c71f6b7c3cd19..0123df0006f15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -20,12 +20,6 @@ namespace paddle { namespace framework { class OpDesc; } // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index 422cbd881d28a..6a8c986a53c24 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -25,9 +25,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 58e9c6d7b4cb8..1911b5c2de6d7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -23,9 +23,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 8b967cb1fe15e..9fd70754888bd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -23,9 +23,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index d5c85e9f71cc1..50085f531a99d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -122,6 +122,7 @@ enum BINARY_FUNCTOR { 
DIVNONAN, MAXIMUM, MINIMUM, + POW, }; template @@ -171,6 +172,18 @@ inline void MLUBinary(const framework::ExecutionContext& ctx, MLUCnnl::Minimum(ctx, in1_desc, in1, in2_desc, in2, out_desc, out); } +template <> +inline void MLUBinary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t y_desc, + const void* y, + const cnnlTensorDescriptor_t out_desc, + void* out) { + MLUCnnl::Pow(ctx, prefer, x_desc, x, y_desc, y, out_desc, out); +} + template void MLUBinaryOp(const framework::ExecutionContext& ctx) { auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc index ee67f7e4020f1..55d6e214d6c12 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -25,9 +25,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc index c13fba99bdbab..fcfee9b4fca15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc @@ -20,9 +20,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc new file mode 100644 index 0000000000000..431122641ec3d --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc @@ -0,0 +1,214 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwisePowMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + MLUBinaryOp(ctx); + } +}; + +template +class ElementwisePowGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + auto place = ctx.GetPlace(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = + (axis < 0 ? 
std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis); + + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + cnnlDataType_t data_type = ToCnnlDataType(); + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type); + MLUCnnlTensorDesc out_desc(max_dim, out_dims_array.data(), data_type); + + auto dout_dims = dout->dims(); + if (dx) { + // dx = dout * y * pow(x, y - 1); + Tensor one_dx(y->type()); + one_dx.mutable_data(phi::make_ddim(y_dims_array), place); + FillMLUTensorWithHostValue(ctx, static_cast(1), &one_dx); + + Tensor sub_dx(y->type()); + sub_dx.mutable_data(phi::make_ddim(y_dims_array), place); + MLUCnnlOpTensorDesc op_tensor_desc( + CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, + op_tensor_desc.get(), + y_desc.get(), + GetBasePtr(y), + y_desc.get(), + GetBasePtr(&one_dx), + y_desc.get(), + GetBasePtr(&sub_dx), + data_type); + + Tensor tmp_dx(x->type()); + tmp_dx.mutable_data(phi::make_ddim(out_dims_array), place); + MLUCnnl::Pow(ctx, + CNNL_COMPUTATION_HIGH_PRECISION, + x_desc.get(), + GetBasePtr(x), + y_desc.get(), + GetBasePtr(&sub_dx), + out_desc.get(), + GetBasePtr(&tmp_dx)); + + MLUCnnl::MulAx(ctx, + y_desc.get(), + GetBasePtr(y), + out_desc.get(), + GetBasePtr(&tmp_dx)); + MLUCnnl::MulAx(ctx, + out_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(&tmp_dx)); + + if (x_dims != dout_dims) { + dx->mutable_data(place); + std::vector reduce_axes; + GetReduceAxes(axis, dout_dims, x_dims, &reduce_axes); + if (!reduce_axes.empty()) { + MLUCnnlReduceDesc reduction_desc(reduce_axes, + CNNL_REDUCE_ADD, + data_type, + CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, + CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnl::Reduce(ctx, + true /*need_workspace*/, + reduction_desc.get(), + nullptr, + out_desc.get(), + GetBasePtr(&tmp_dx), + 0, + nullptr, + nullptr, + dx_desc.get(), + GetBasePtr(dx)); + } + } else { + dx->ShareDataWith(tmp_dx); + } + } + if (dy) { + // dy = dout * log(x) * pow(x, y) + Tensor tmp_dy(y->type()); + tmp_dy.mutable_data(phi::make_ddim(out_dims_array), place); + MLUCnnl::Pow(ctx, + CNNL_COMPUTATION_HIGH_PRECISION, + x_desc.get(), + GetBasePtr(x), + y_desc.get(), + GetBasePtr(y), + out_desc.get(), + GetBasePtr(&tmp_dy)); + + Tensor log_x(x->type()); + log_x.mutable_data(x->dims(), place); + MLUCnnl::Log(ctx, + CNNL_COMPUTATION_HIGH_PRECISION, + CNNL_LOG_E, + x_desc.get(), + GetBasePtr(x), + x_desc.get(), + GetBasePtr(&log_x)); + MLUCnnl::MulAx(ctx, + x_desc.get(), + GetBasePtr(&log_x), + out_desc.get(), + GetBasePtr(&tmp_dy)); + MLUCnnl::MulAx(ctx, + out_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(&tmp_dy)); + + if (y_dims != dout_dims) { + dy->mutable_data(place); + std::vector reduce_axes; + GetReduceAxes(axis, dout_dims, y_dims, &reduce_axes); + if (!reduce_axes.empty()) { + MLUCnnlReduceDesc reduction_desc(reduce_axes, + CNNL_REDUCE_ADD, + data_type, + CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, + CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dy_desc(*dy); + MLUCnnl::Reduce(ctx, + true /*need_workspace*/, + reduction_desc.get(), + nullptr, + out_desc.get(), + GetBasePtr(&tmp_dy), + 0, + nullptr, + nullptr, + dy_desc.get(), + GetBasePtr(dy)); + } + } else 
{ + dy->ShareDataWith(tmp_dy); + } + } + if (!dx && !dy) { + PADDLE_THROW(platform::errors::Unavailable( + "Not support all outputs to be empty.")); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(elementwise_pow, + ops::ElementwisePowMLUKernel, + ops::ElementwisePowMLUKernel); + +REGISTER_OP_MLU_KERNEL(elementwise_pow_grad, + ops::ElementwisePowGradMLUKernel, + ops::ElementwisePowGradMLUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index a9968906fb90a..24f0228025f7f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -23,9 +23,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 61552e492dfa1..42d749b7b8e3e 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -50,22 +50,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { private: dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { dnnl::post_ops post_operations; - if (ctx.HasAttr("activation_type")) { - const float scale = ctx.HasAttr("activation_scale") - ? ctx.Attr("activation_scale") - : 1.0f; - const float alpha = ctx.HasAttr("activation_alpha") - ? ctx.Attr("activation_alpha") - : 0.0f; - const float beta = ctx.HasAttr("activation_beta") - ? ctx.Attr("activation_beta") - : 0.0f; - - const auto activation_algorithm = platform::AcquireActivationAlgorithm( - ctx.Attr("activation_type")); - - post_operations.append_eltwise(scale, activation_algorithm, alpha, beta); - } + platform::AppendActivation(ctx, post_operations); return post_operations; } @@ -75,8 +60,8 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); float scale_x = ctx.Attr("Scale_x"); @@ -96,6 +81,12 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { scale_o, get_post_ops(ctx)); + // oneDNN's binary is optimized for broadcasting y into x, so in other case + // we have to swap tensors to achieve optimal performance + if (x->numel() < y->numel()) { + std::swap(x, y); + } + const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); // (jczaja) For Inplace src and dst should be the same memory object. 
@@ -159,6 +150,13 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { auto* dy = ctx.Output(framework::GradVarName("Y")); auto* dout = ctx.Input(framework::GradVarName("Out")); + // oneDNN's binary is optimized for broadcasting y into x, so in other case + // we have to swap tensors to achieve optimal performance + if (x->numel() < y->numel()) { + std::swap(x, y); + std::swap(dx, dy); + } + int axis = ctx.Attr("axis"); auto tz = phi::vectorize(dout->dims()); diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index af6510ae3b931..6f1e04ebfa6cf 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -146,19 +146,17 @@ REGISTER_OPERATOR(expand_as, REGISTER_OPERATOR(expand_as_grad, ops::ExpandAsGradOp, ops::ExpandAsGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CPU_KERNEL( - expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); +REGISTER_OP_CPU_KERNEL(expand_as, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel); +REGISTER_OP_CPU_KERNEL(expand_as_grad, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand_as, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 100158ce9c21e..d8c66f95a1395 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -280,19 +280,17 @@ REGISTER_OPERATOR(expand_grad, ops::ExpandDoubleGradOpMaker, ops::ExpandDoubleGradOpMaker, ops::ExpandGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel); -REGISTER_OP_CPU_KERNEL( - expand_grad, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel); +REGISTER_OP_CPU_KERNEL(expand, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL(expand_grad, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand, diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index 03fbdfcd5ae77..5a75063fba7c1 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -62,8 +62,7 @@ class ExponentialOpInferVarType }; template -class ExponentialKernel - : public framework::OpKernel { +class ExponentialKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto *out = ctx.Output("Out"); @@ -135,9 +134,8 @@ REGISTER_OPERATOR(exponential_grad, ExponentialGradInferer); REGISTER_OP_CPU_KERNEL(exponential, - ops::ExponentialKernel, - ops::ExponentialKernel); -REGISTER_OP_CPU_KERNEL( - exponential_grad, - ops::ExponentialGradKernel, - ops::ExponentialGradKernel); + ops::ExponentialKernel, + ops::ExponentialKernel); +REGISTER_OP_CPU_KERNEL(exponential_grad, + ops::ExponentialGradKernel, + ops::ExponentialGradKernel); diff --git 
a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 58022ee6400fd..4e1df4f98ab57 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -23,8 +23,8 @@ namespace paddle { namespace operators { template -struct DequantizeFunctor { - void operator()(const platform::CPUDeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, T max_range, @@ -39,8 +39,8 @@ struct DequantizeFunctor { }; template -struct ChannelDequantizeFunctor { - void operator()(const platform::CPUDeviceContext& dev_ctx, +struct ChannelDequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor** scales, const int scale_num, @@ -139,10 +139,10 @@ struct ChannelDequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; -template struct ChannelDequantizeFunctor; -template struct ChannelDequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { public: @@ -269,7 +269,7 @@ Notes: In general, the per-channel quantization is only applied to weights and t } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( fake_dequantize_max_abs, diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 61ee9d49ebeec..cb8263714a5e4 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -32,8 +32,8 @@ struct Compare { }; template -struct FindAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct FindAbsMaxFunctor { + void operator()(const phi::CPUContext &ctx, const T *in, const int num, T *out) { @@ -41,11 +41,11 @@ struct FindAbsMaxFunctor { } }; -template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; template -struct FindChannelAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct FindChannelAbsMaxFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in_tensor, const int quant_axis, T *out_abs_max) { @@ -86,11 +86,11 @@ struct FindChannelAbsMaxFunctor { } }; -template struct FindChannelAbsMaxFunctor; +template struct FindChannelAbsMaxFunctor; template -struct ClipAndFakeQuantFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct ClipAndFakeQuantFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -98,7 +98,7 @@ struct ClipAndFakeQuantFunctor { framework::Tensor *out) { T s = scale.data()[0]; T inv_s = inverse(s); - platform::Transform trans; + platform::Transform trans; if (round_type == 0) { trans(ctx, in.data(), @@ -117,11 +117,11 @@ struct ClipAndFakeQuantFunctor { } }; -template struct ClipAndFakeQuantFunctor; +template struct ClipAndFakeQuantFunctor; template -struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct ClipAndFakeQuantDequantFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -130,7 +130,7 @@ struct 
ClipAndFakeQuantDequantFunctor { T s = scale.data()[0]; T inv_s = inverse(s); - platform::Transform trans; + platform::Transform trans; if (round_type == 0) { trans(ctx, in.data(), @@ -151,12 +151,11 @@ struct ClipAndFakeQuantDequantFunctor { } } }; -template struct ClipAndFakeQuantDequantFunctor; +template struct ClipAndFakeQuantDequantFunctor; template -struct ChannelClipAndFakeQuantFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct ChannelClipAndFakeQuantFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -176,7 +175,7 @@ struct ChannelClipAndFakeQuantFunctor { auto *out_data = out->mutable_data(ctx.GetPlace()); auto in_dims = in.dims(); const int64_t channel = in_dims[quant_axis]; - platform::Transform trans; + platform::Transform trans; if (quant_axis == 0) { const int64_t channel_size = in.numel() / channel; for (int64_t i = 0; i < channel; i++) { @@ -235,11 +234,10 @@ struct ChannelClipAndFakeQuantFunctor { } }; -template struct ChannelClipAndFakeQuantFunctor; +template struct ChannelClipAndFakeQuantFunctor; template -struct ChannelClipFakeQuantDequantFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct ChannelClipFakeQuantDequantFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -258,7 +256,7 @@ struct ChannelClipFakeQuantDequantFunctor { auto *out_data = out->mutable_data(ctx.GetPlace()); auto in_dims = in.dims(); const int64_t channel = in_dims[quant_axis]; - platform::Transform trans; + platform::Transform trans; if (quant_axis == 0) { const int64_t channel_size = in.numel() / channel; for (int i = 0; i < channel; i++) { @@ -326,11 +324,10 @@ struct ChannelClipFakeQuantDequantFunctor { } }; -template struct ChannelClipFakeQuantDequantFunctor; +template struct ChannelClipFakeQuantDequantFunctor; template -struct FindRangeAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct FindRangeAbsMaxFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &cur_scale, const framework::Tensor &last_scale, const framework::Tensor &iter, @@ -349,18 +346,17 @@ struct FindRangeAbsMaxFunctor { max = cur; } else if (fabs(removed - max) < 1e-6) { int size = (it > window_size) ? 
window_size : it; - FindAbsMaxFunctor()( - ctx, scale_arr, size, &max); + FindAbsMaxFunctor()(ctx, scale_arr, size, &max); } out_scale->mutable_data(ctx.GetPlace())[0] = max; } }; -template struct FindRangeAbsMaxFunctor; +template struct FindRangeAbsMaxFunctor; template -struct FindMovingAverageAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct FindMovingAverageAbsMaxFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in_accum, const framework::Tensor &in_state, const T *cur_scale, @@ -382,8 +378,7 @@ struct FindMovingAverageAbsMaxFunctor { } }; -template struct FindMovingAverageAbsMaxFunctor; +template struct FindMovingAverageAbsMaxFunctor; class FakeQuantOrWithDequantAbsMaxOp : public framework::OperatorWithKernel { public: @@ -968,7 +963,7 @@ class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( fake_quantize_abs_max, diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index a851a6db5657f..43bb6089a87dd 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -223,7 +223,6 @@ REGISTER_OPERATOR( ops::FCOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fc, - ops::FCOpKernel, - ops::FCOpKernel); +REGISTER_OP_CPU_KERNEL(fc, + ops::FCOpKernel, + ops::FCOpKernel); diff --git a/paddle/fluid/operators/fill_any_op.cc b/paddle/fluid/operators/fill_any_op.cc index ddbfe226b647e..853ebbdd9e57c 100644 --- a/paddle/fluid/operators/fill_any_op.cc +++ b/paddle/fluid/operators/fill_any_op.cc @@ -95,20 +95,18 @@ REGISTER_OPERATOR(fill_any_grad, REGISTER_OP_CPU_KERNEL( fill_any, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel); + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel); REGISTER_OP_CPU_KERNEL( fill_any_grad, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel); + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc new file mode 100644 index 0000000000000..32a19750f420a --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
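Note: the angle-bracket template arguments in the fake_dequantize/fake_quantize hunks above appear to have been stripped during text extraction, so the exact specializations cannot be read off the diff. Below is a minimal standalone sketch of the migration pattern those hunks follow, assuming the usual two-parameter functor form; the types and names here are illustrative stand-ins, not the real Paddle declarations.

// Sketch only: stand-ins for paddle::platform::CPUDeviceContext and
// phi::CPUContext, showing the specialization/instantiation pattern.
#include <cstdio>

namespace platform { struct CPUDeviceContext {}; }
namespace phi { struct CPUContext {}; }

// Generic functor template, specialized per device context and element type.
template <typename DeviceContext, typename T>
struct DequantizeFunctor {
  void operator()(const DeviceContext& /*ctx*/, T max_range) const {
    std::printf("dequantize with max_range=%f\n",
                static_cast<double>(max_range));
  }
};

// Before the change the explicit instantiations named
// platform::CPUDeviceContext; after the change they name phi::CPUContext.
template struct DequantizeFunctor<phi::CPUContext, float>;
template struct DequantizeFunctor<phi::CPUContext, double>;

int main() {
  phi::CPUContext ctx;
  DequantizeFunctor<phi::CPUContext, float>()(ctx, 127.0f);
  return 0;
}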
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { +template +class FillConstantBatchSizeLikeOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto float_value = ctx.Attr("value"); + auto str_value = ctx.Attr("str_value"); + auto force_cpu = ctx.Attr("force_cpu"); + + auto *out = ctx.Output("Out"); + auto *in = ctx.Input("Input"); + if (in->lod().size() && ctx.Attr("input_dim_idx") == 0) { + // set the correct batch size for the LoDTensor. + auto odims = out->dims(); + int output_dim_idx = ctx.Attr("output_dim_idx"); + odims[output_dim_idx] = static_cast(in->lod().back().size()) - 1; + out->mutable_data(odims, ctx.GetPlace()); + } + + T value; + if (str_value.empty()) { + value = static_cast(float_value); + } else { + // handle NaN/Inf first, which cannot be read from stream. + if (str_value == "inf") { + value = static_cast(std::numeric_limits::infinity()); + } else if (str_value == "-inf") { + value = static_cast(-std::numeric_limits::infinity()); + } else if (str_value == "nan") { + value = static_cast(std::numeric_limits::quiet_NaN()); + } else { + std::stringstream convert_stream(str_value); + if (std::is_same::value) { + int64_t tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } else { + double tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } + } + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); + if (cpu_place) { + auto &dev_ctx = *pool.Get(platform::CPUPlace()); + phi::funcs::SetConstant functor; + out->mutable_data(platform::CPUPlace(), + framework::TransToPhiDataType(data_type)); + functor(reinterpret_cast(dev_ctx), + out, + static_cast(value)); + } else { + out->mutable_data(ctx.GetPlace(), + framework::TransToPhiDataType(data_type)); + const T *value_data = &value; + cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST; + MLUCnnlTensorDesc output_desc(*out); + MLUCnnl::Fill( + ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpMLUKernel, + ops::FillConstantBatchSizeLikeOpMLUKernel, + ops::FillConstantBatchSizeLikeOpMLUKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index ad4efbb3e0c63..02f89cfdd2691 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -70,10 +70,10 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); if (cpu_place) { auto &dev_ctx = *pool.Get(platform::CPUPlace()); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), framework::TransToPhiDataType(data_type)); - functor(reinterpret_cast(dev_ctx), + functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } else { diff --git 
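The new MLU fill_constant_batch_size_like kernel above special-cases "inf", "-inf" and "nan" because those values cannot be read back from a stringstream. A minimal standalone sketch of that handling follows; parse_value is a hypothetical helper written only for illustration and is not part of the patch.

#include <cstdio>
#include <limits>
#include <sstream>
#include <string>

float parse_value(const std::string& str_value, float float_value) {
  if (str_value.empty()) return float_value;  // fall back to the float attribute
  // NaN/Inf cannot be parsed from a stream, so handle them first.
  if (str_value == "inf") return std::numeric_limits<float>::infinity();
  if (str_value == "-inf") return -std::numeric_limits<float>::infinity();
  if (str_value == "nan") return std::numeric_limits<float>::quiet_NaN();
  std::stringstream convert_stream(str_value);
  double tmp_value = 0.0;
  convert_stream >> tmp_value;
  return static_cast<float>(tmp_value);
}

int main() {
  std::printf("%f %f %f\n",
              parse_value("", 1.5f),      // uses the float attribute
              parse_value("inf", 0.0f),   // special-cased
              parse_value("2.5", 0.0f));  // parsed from the stream
  return 0;
}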
a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index dc5079ddb605f..8e51c203d4122 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -124,9 +124,9 @@ class FillConstantKernel : public framework::OpKernel { : ""); tensor->mutable_data(platform::CPUPlace(), framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CPUPlace()); - functor(reinterpret_cast(dev_ctx), + functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); } else if (actual_place == 1) { diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc index b9e91baa1e707..8bd0e328c1f5b 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -94,24 +94,22 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL( fill_zeros_like, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); REGISTER_OP_CPU_KERNEL( fill_zeros_like2, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index c584dd114e0eb..e160fc6f09ad0 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -438,35 +438,31 @@ REGISTER_OPERATOR(flatten_contiguous_range_grad, ops::FlattenContiguousRangeGradOp, ops::FlattenGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_CPU_KERNEL( - flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); -REGISTER_OP_CPU_KERNEL( - flatten2, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel); -REGISTER_OP_CPU_KERNEL( - flatten2_grad, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel); +REGISTER_OP_CPU_KERNEL(flatten, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_CPU_KERNEL(flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_CPU_KERNEL(flatten2, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_CPU_KERNEL(flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + 
ops::Flatten2GradKernel, + ops::Flatten2GradKernel); diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 21e92e6d37511..5ec5a93ada46d 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -341,11 +341,9 @@ REGISTER_OPERATOR(fold_grad, ops::FoldGradOp, ops::FoldGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - fold, - ops::FoldOpKernel, - ops::FoldOpKernel); -REGISTER_OP_CPU_KERNEL( - fold_grad, - ops::FoldGradOpKernel, - ops::FoldGradOpKernel); +REGISTER_OP_CPU_KERNEL(fold, + ops::FoldOpKernel, + ops::FoldOpKernel); +REGISTER_OP_CPU_KERNEL(fold_grad, + ops::FoldGradOpKernel, + ops::FoldGradOpKernel); diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc index 25efd98d37afd..45a6bc9994db7 100644 --- a/paddle/fluid/operators/frame_op.cc +++ b/paddle/fluid/operators/frame_op.cc @@ -185,22 +185,18 @@ REGISTER_OPERATOR(frame_grad, ops::FrameOpGrad); REGISTER_OP_CPU_KERNEL( frame, - ops::FrameKernel, - ops::FrameKernel, - ops::FrameKernel, - ops::FrameKernel, - ops::FrameKernel>, - ops::FrameKernel>); + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel>, + ops::FrameKernel>); REGISTER_OP_CPU_KERNEL( frame_grad, - ops::FrameGradKernel, - ops::FrameGradKernel, - ops::FrameGradKernel, - ops::FrameGradKernel, - ops::FrameGradKernel>, - ops::FrameGradKernel>); + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel>, + ops::FrameGradKernel>); diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc index e1f82fb27ad0b..ff3a5a638daf0 100644 --- a/paddle/fluid/operators/fsp_op.cc +++ b/paddle/fluid/operators/fsp_op.cc @@ -169,11 +169,9 @@ REGISTER_OPERATOR(fsp, ops::FSPGradOpMaker, ops::FSPGradOpMaker); REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad); -REGISTER_OP_CPU_KERNEL( - fsp, - ops::FSPOpKernel, - ops::FSPOpKernel); -REGISTER_OP_CPU_KERNEL( - fsp_grad, - ops::FSPGradOpKernel, - ops::FSPGradOpKernel); +REGISTER_OP_CPU_KERNEL(fsp, + ops::FSPOpKernel, + ops::FSPOpKernel); +REGISTER_OP_CPU_KERNEL(fsp_grad, + ops::FSPGradOpKernel, + ops::FSPGradOpKernel); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 4ffb96d3c51bc..02a3f4d7a0eb6 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -26,12 +26,18 @@ register_operators( fused_bias_dropout_residual_layer_norm_op resnet_unit_op fused_gemm_epilogue_op - fused_gate_attention_op) + fused_gate_attention_op + resnet_basic_block_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) op_library(fusion_lstm_op) +if(WITH_XPU) + op_library(resnet_basic_block_op) + op_library(resnet_unit_op) +endif() + if(WITH_GPU OR WITH_ROCM) # fused_bn_activation_op needs cudnn 7.4.1 above # HIP not support bn act fuse in MIOPEN diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 5e96ca140274d..121cbc909b812 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -315,9 +315,14 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto dtype = platform::CudnnDataType::type; PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( 
cudnn_conv_desc, CUDNN_DEFAULT_MATH)); + if (dtype == CUDNN_DATA_HALF) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); + } #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 if (!platform::allow_tf32_cudnn) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( @@ -414,7 +419,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { algo = algo_cache.GetAlgorithm( x_dims[2] * x_dims[3], search_times, 0, search_func); } else { - auto dtype = platform::CudnnDataType::type; algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, @@ -544,9 +548,11 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { namespace ops = paddle::operators; #if CUDNN_VERSION >= 7100 -REGISTER_OP_CUDA_KERNEL(conv2d_fusion, - ops::CUDNNConvFusionOpKernel, - ops::CUDNNConvFusionOpKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_fusion, + ops::CUDNNConvFusionOpKernel, + ops::CUDNNConvFusionOpKernel, + ops::CUDNNConvFusionOpKernel); #endif #ifdef PADDLE_WITH_HIP REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 3ac5718917346..ef1befbb32033 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -97,10 +97,9 @@ class FMHARef { // input shape: [bs, seq_len, 3, num_head, head_dim] // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] - int ndims = 5; std::vector perm_1 = {2, 0, 3, 1, 4}; TransposeGPUKernelDriver( - dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); + dev_ctx_, qkv_input_tensor, perm_1, transpose_2_out_tensor); T* qkv_data = transpose_2_out_tensor->data(); T* qk_out_data = qk_out_tensor->data(); T* qktv_out_data = qktv_out_tensor->data(); @@ -255,9 +254,8 @@ class FMHARef { // transpose: [0, 2, 1, 3] // output shape: [batch_size, seq_len, num_heads, head_dim] std::vector perm_3 = {0, 2, 1, 3}; - ndims = 4; TransposeGPUKernelDriver( - dev_ctx_, ndims, *qktv_out_tensor, perm_3, fmha_out_tensor); + dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor); } void ComputeBackward(const Tensor& transpose_2_out_tensor, @@ -297,10 +295,9 @@ class FMHARef { T* qktv_out_grad_data = qktv_out_grad_tensor->data(); // transpose bw - int ndims = 4; std::vector perm_3 = {0, 2, 1, 3}; TransposeGPUKernelDriver( - dev_ctx_, ndims, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor); + dev_ctx_, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor); // recall batchedgemm(nn) fw: softmax_out_data(x) * v_ptr(y) = // qktv_out_data(out) @@ -476,13 +473,9 @@ class FMHARef { stride_b); // transpose bw - ndims = 5; std::vector perm_1 = {1, 3, 0, 2, 4}; - TransposeGPUKernelDriver(dev_ctx_, - ndims, - *transpose_2_out_grad_tensor, - perm_1, - qkv_input_grad_tensor); + TransposeGPUKernelDriver( + dev_ctx_, *transpose_2_out_grad_tensor, perm_1, qkv_input_grad_tensor); } private: diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 0c33f7c9d4f9b..2c3fd75d8e012 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -24,11 +24,13 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -44,16 +46,30 @@ static void AllReduce(framework::Tensor &tensor, // NOLINT const platform::CUDADeviceContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto dtype = - platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); - int64_t numel = tensor.numel(); - const void *sendbuff = tensor.data(); - auto place = ctx.GetPlace(); - void *recvbuff = tensor.mutable_data(place); - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + + if (map->has(ring_id)) { + paddle::distributed::ProcessGroup *pg = map->get(ring_id); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(tensor); + out_tensor.push_back(tensor); + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + auto dtype = platform::ToNCCLDataType( + framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + } #else PADDLE_THROW(platform::errors::Unimplemented( "PaddlePaddle should compile with NCCL or RCCL when used tensor model " diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index d9405aa021dc1..31bb78922a5a5 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -463,17 +463,13 @@ REGISTER_OPERATOR(fused_elemwise_activation_grad, REGISTER_OP_CPU_KERNEL( fused_elemwise_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel); + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CPU_KERNEL( fused_elemwise_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel); + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); // for memory optimization, we register the fused_elemwise_add_activation OP REGISTER_OPERATOR( @@ -488,14 +484,10 @@ REGISTER_OPERATOR(fused_elemwise_add_activation_grad, REGISTER_OP_CPU_KERNEL( fused_elemwise_add_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel); + ops::FusedElemwiseActivationKernel, + 
ops::FusedElemwiseActivationKernel); REGISTER_OP_CPU_KERNEL( fused_elemwise_add_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel); + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index aae3be9aca568..8f413f34242a8 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -391,7 +391,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { GET_Ht(ct, gates, ht) void SeqCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; INIT_BASE_INPUT_OUTPUT INIT_BASE_SIZES INIT_VEC_FUNC @@ -496,7 +496,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { } void BatchCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; INIT_BASE_INPUT_OUTPUT if (ids->lod()[0].size() == 2) { SeqCompute(ctx); diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 937203c92fbf4..c593c65618d78 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -197,7 +197,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { const int m = batch_size * idx_width; const int n = table_width; const int k = table_height; - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); blas.CSRMM(&transa, &m, &n, @@ -313,7 +313,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { padding_idx); auto *d_output_data = d_output->data(); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); int width = static_cast(table_dim[1]); int num_seq = batch_size * idx_width; LOG(INFO) << "num seq = " << num_seq << " width = " << width; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index fe388aa40566e..4126f5ad7263a 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -17,11 +17,13 @@ limitations under the License. 
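The AllReduce rework in the fused_attention (and, below, fused_feedforward and fused_multi_transformer) hunks prefers a registered ProcessGroup for the given ring_id and only falls back to the raw NCCL communicator otherwise. The following schematic mirrors just that branch structure; every type and function here is a stand-in, not the real Paddle distributed or NCCL API.

#include <cstdio>
#include <map>
#include <vector>

struct ProcessGroup {
  void AllReduceSum(std::vector<float>* t) const {
    std::printf("ProcessGroup all-reduce on %zu elements\n", t->size());
  }
};

struct ProcessGroupMap {
  std::map<int, ProcessGroup*> groups;
  bool has(int ring_id) const { return groups.count(ring_id) > 0; }
  ProcessGroup* get(int ring_id) const { return groups.at(ring_id); }
};

void AllReduce(std::vector<float>* tensor, int ring_id,
               const ProcessGroupMap& map) {
  if (ring_id == -1) return;  // tensor-model parallelism disabled
  if (map.has(ring_id)) {
    map.get(ring_id)->AllReduceSum(tensor);  // new collective path
  } else {
    std::printf("fallback: raw communicator all-reduce\n");  // legacy path
  }
}

int main() {
  ProcessGroup pg;
  ProcessGroupMap map;
  map.groups[0] = &pg;
  std::vector<float> t(8, 1.0f);
  AllReduce(&t, 0, map);  // takes the ProcessGroup branch
  AllReduce(&t, 1, map);  // takes the fallback branch
  return 0;
}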
*/ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -37,16 +39,30 @@ static void AllReduce(framework::Tensor& tensor, // NOLINT const platform::CUDADeviceContext& ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto dtype = - platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); - int64_t numel = tensor.numel(); - const void* sendbuff = tensor.data(); - auto place = ctx.GetPlace(); - void* recvbuff = tensor.mutable_data(place); - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + + if (map->has(ring_id)) { + paddle::distributed::ProcessGroup* pg = map->get(ring_id); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(tensor); + out_tensor.push_back(tensor); + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + auto dtype = platform::ToNCCLDataType( + framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + } #else PADDLE_THROW(platform::errors::Unimplemented( "PaddlePaddle should compile with NCCL or RCCL when used tensor model " diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 2dd923bd64d19..45d47908b99e0 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -622,11 +622,10 @@ class FMHAGateRef { Tensor* q_transpose_out, Tensor* k_transpose_out, Tensor* v_transpose_out) { - int ndims = 5; std::vector perm = {0, 1, 3, 2, 4}; - TransposeGPUKernelDriver(dev_ctx_, ndims, q_out, perm, q_transpose_out); - TransposeGPUKernelDriver(dev_ctx_, ndims, k_out, perm, k_transpose_out); - TransposeGPUKernelDriver(dev_ctx_, ndims, v_out, perm, v_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, q_out, perm, q_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, k_out, perm, k_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, v_out, perm, v_transpose_out); } void ComputeQKVTransposeBackward(const Tensor& q_transpose_out_grad, @@ -635,48 +634,41 @@ class FMHAGateRef { Tensor* q_out_grad, Tensor* k_out_grad, Tensor* v_out_grad) { - int ndims = 5; std::vector perm = {0, 1, 3, 2, 4}; TransposeGPUKernelDriver( - dev_ctx_, ndims, q_transpose_out_grad, perm, 
q_out_grad); + dev_ctx_, q_transpose_out_grad, perm, q_out_grad); TransposeGPUKernelDriver( - dev_ctx_, ndims, k_transpose_out_grad, perm, k_out_grad); + dev_ctx_, k_transpose_out_grad, perm, k_out_grad); TransposeGPUKernelDriver( - dev_ctx_, ndims, v_transpose_out_grad, perm, v_out_grad); + dev_ctx_, v_transpose_out_grad, perm, v_out_grad); } // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> // [3, batch_size, seq_len_m, num_heads, seq_len_r, head_dim] void ComputeQKVTransposeForward(const Tensor& qkv_out, Tensor* qkv_transpose_out) { - int ndims = 6; std::vector perm = {3, 0, 1, 4, 2, 5}; - TransposeGPUKernelDriver( - dev_ctx_, ndims, qkv_out, perm, qkv_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, qkv_out, perm, qkv_transpose_out); } void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad, Tensor* qkv_out_grad) { - int ndims = 6; std::vector perm = {1, 2, 4, 0, 3, 5}; TransposeGPUKernelDriver( - dev_ctx_, ndims, qkv_transpose_out_grad, perm, qkv_out_grad); + dev_ctx_, qkv_transpose_out_grad, perm, qkv_out_grad); } // [batch_size, seq_len_m, num_head, seq_len_r, c] -> // [batch_size, seq_len_m, seq_len_r, num_head, c] void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) { - int ndims = 5; std::vector perm = {0, 1, 3, 2, 4}; - TransposeGPUKernelDriver(dev_ctx_, ndims, qktv_out, perm, fmha_out); + TransposeGPUKernelDriver(dev_ctx_, qktv_out, perm, fmha_out); } void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad, Tensor* qktv_out_grad) { - int ndims = 5; std::vector perm = {0, 1, 3, 2, 4}; - TransposeGPUKernelDriver( - dev_ctx_, ndims, fmha_out_grad, perm, qktv_out_grad); + TransposeGPUKernelDriver(dev_ctx_, fmha_out_grad, perm, qktv_out_grad); } // qk_out = qk_out + nonbatched_bias + src_mask diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 0d219a4f76d16..7400246f40725 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -363,13 +363,14 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { dev_ctx, query, key, query_weight, qkv_weight, merge_qkv, has_gating); if (merge_qkv) { - PADDLE_ENFORCE_EQ(!key || query == key, - true, - platform::errors::InvalidArgument( - "key is expected to be nullptr or the same as " - "query, but recieved key=%p, query=%p.", - key, - query)); + PADDLE_ENFORCE_EQ( + !key || query == key || query->data() == key->data(), + true, + platform::errors::InvalidArgument( + "key is expected to be nullptr or the same as " + "query, but recieved key=%p, query=%p.", + key, + query)); // 1. 
Merged QKV Matmul: einsum(nbhqk,nbkhc -> nbqhc) Tensor *qkv_out = config.GetQKVOut(); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 4aedf4eb79bd1..301b62524a54d 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -573,7 +573,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = mu_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { mu_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { @@ -608,7 +608,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = var_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { var_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index aa05ebc43da78..86de140b9cde8 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -63,6 +64,7 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { // y: qkv's weight: [3, num_head, dim_head, dim_embed] auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputsDim("QKVW")[0]; + bool trans_qkvw = ctx->Attrs().Get("trans_qkvw"); PADDLE_ENFORCE_EQ( x_dim.size(), 3, @@ -79,24 +81,37 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { "but received dimensions of" "Input is [%d]", y_dim.size())); - PADDLE_ENFORCE_EQ(x_dim[2], - y_dim[3], - platform::errors::InvalidArgument( - "ShapeError: the dimension of x_dim[2] and y_dim[3]" - "must be equal. But received: the shape " - "of input x = [%s], and the shape of " - "input qkv_weight = [%s]", - x_dim, - y_dim)); + PADDLE_ENFORCE_EQ( + x_dim[2], + trans_qkvw ? y_dim[3] : y_dim[0], + platform::errors::InvalidArgument( + "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is " + "true) or y_dim[0](trans_qkvw is false)" + "must be equal. 
But received: the shape " + "of input x = [%s], and the shape of " + "input qkv_weight = [%s]", + x_dim, + y_dim)); if (ctx->Attrs().Get("ring_id") == -1) { - PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], - y_dim[3], - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "and must satisfy the limitations: " - "(num_head * dim_head == dim_embed)")); + if (trans_qkvw) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], + y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + + } else { + PADDLE_ENFORCE_EQ(y_dim[2] * y_dim[3], + y_dim[0], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(dim_embed, 3, num_head, dim_head)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } } if (ctx->HasInputs("CacheKV")) { @@ -122,11 +137,11 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { x_dim[0], c_dim[1])); // batch_size PADDLE_ENFORCE_EQ(c_dim[2], - y_dim[1], + trans_qkvw ? y_dim[1] : y_dim[2], paddle::platform::errors::InvalidArgument( "The third dim of CacheKV must be equal with num " "head %d, but got %d", - y_dim[1], + trans_qkvw ? y_dim[1] : y_dim[2], c_dim[2])); // num_head PADDLE_ENFORCE_GT( c_dim[3], @@ -135,11 +150,11 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { "The forth dim of CacheKV must be greater than 0, but got %d", c_dim[3])); // cache_seq_len PADDLE_ENFORCE_EQ(c_dim[4], - y_dim[2], + trans_qkvw ? y_dim[2] : y_dim[3], paddle::platform::errors::InvalidArgument( "The fifth dim of CacheKV must be equal with head " "size %d, but got %d", - y_dim[2], + trans_qkvw ? y_dim[2] : y_dim[3], c_dim[4])); // head_size } @@ -258,6 +273,13 @@ class FusedMultiTransformerOpOpMaker "upscale_in_train")); }); AddAttr("act_method", "act_method").SetDefault("gelu"); + AddAttr( + "trans_qkvw", + "Whether the weights of qkv should be transposed. If true," + "the shape eights of qkv should be [3, num_head, dim_head, dim_embed]." + "Otherwise the shape of weights of qkv should be" + "[dim_embed, 3, num_head, dim_head]") + .SetDefault(true); AddAttr( "ring_id", @@ -278,3 +300,12 @@ REGISTER_OPERATOR( ops::FusedMultiTransformerOpOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_VERSION(fused_multi_transformer) + .AddCheckpoint( + R"ROC( + Add a new attribute [trans_qkvw] )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "trans_qkvw", + "A flag to indicate whether to transpose for weights of qkv.", + true)); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index ca2b884bf79f6..a8bebd5012db5 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -29,9 +29,11 @@ limitations under the License. 
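The trans_qkvw hunks above accept the qkv weight either as [3, num_head, dim_head, dim_embed] (trans_qkvw = true) or as [dim_embed, 3, num_head, dim_head] (trans_qkvw = false); in both layouts the InferShape check reduces to num_head * dim_head == dim_embed when ring_id is -1. A small standalone check of that equivalence, with sizes chosen only for illustration:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Reads {num_head, dim_head} from the qkv weight dims under the two layouts.
std::pair<int64_t, int64_t> qkv_head_dims(const std::vector<int64_t>& y_dim,
                                          bool trans_qkvw) {
  return trans_qkvw ? std::make_pair(y_dim[1], y_dim[2])
                    : std::make_pair(y_dim[2], y_dim[3]);
}

int main() {
  const int64_t num_head = 16, dim_head = 64, dim_embed = num_head * dim_head;
  std::vector<int64_t> w_trans = {3, num_head, dim_head, dim_embed};   // true
  std::vector<int64_t> w_plain = {dim_embed, 3, num_head, dim_head};   // false

  auto a = qkv_head_dims(w_trans, true);
  auto b = qkv_head_dims(w_plain, false);
  assert(a == b);
  assert(a.first * a.second == dim_embed);  // the shared shape constraint
  std::printf("num_head=%lld dim_head=%lld\n",
              static_cast<long long>(a.first),
              static_cast<long long>(b.second));
  return 0;
}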
*/ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -50,16 +52,30 @@ static void AllReduce(framework::Tensor &tensor, // NOLINT const platform::CUDADeviceContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto dtype = - platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); - int64_t numel = tensor.numel(); - const void *sendbuff = tensor.data(); - auto place = ctx.GetPlace(); - void *recvbuff = tensor.mutable_data(place); - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + + if (map->has(ring_id)) { + paddle::distributed::ProcessGroup *pg = map->get(ring_id); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(tensor); + out_tensor.push_back(tensor); + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + auto dtype = platform::ToNCCLDataType( + framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + } #else PADDLE_THROW(platform::errors::Unimplemented( "PaddlePaddle should compile with NCCL or RCCL when used tensor model " @@ -294,6 +310,52 @@ inline __device__ uint4 mul(uint4 a, uint4 b) { return c; } +template <> +inline __device__ uint32_t mul(uint32_t a, float b) { + float2 tmp = half2_to_float2(a); + float2 tmp_res; + tmp_res.x = tmp.x * b; + tmp_res.y = tmp.y * b; + uint32_t res = float2_to_half2(tmp_res); + return res; +} + +template <> +inline __device__ uint2 mul(uint2 a, float b) { + uint2 res; + res.x = mul(a.x, b); + res.y = mul(a.y, b); + return res; +} + +template <> +inline __device__ uint4 mul(uint4 a, float b) { + uint4 res; + res.x = mul(a.x, b); + res.y = mul(a.y, b); + res.z = mul(a.z, b); + res.w = mul(a.w, b); + return res; +} + +template <> +inline __device__ float2 mul(float2 a, float b) { + float2 res; + res.x = a.x * b; + res.y = a.y * b; + return res; +} + +template <> +inline __device__ float4 mul(float4 a, float b) { + float4 res; + res.x = a.x * b; + res.y = a.y * b; + res.z = a.z * b; + res.w = a.w * b; + return res; +} + inline __device__ float sum(float v) { return v; } inline __device__ float sum(float2 v) { return v.x + v.y; } inline __device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } @@ -445,11 +507,15 @@ inline __device__ Float8_ cast_to_float(uint4 u) { } template -inline __device__ float qk_dot_(const K_vec (&q)[N], const K_vec 
(&k)[N]) { - K_vec qk_vec = mul(q[0], k[0]); +inline __device__ float qk_dot_(const K_vec (&q)[N], + const K_vec (&k)[N], + float inv_sqrt_dh) { + K_vec inv_q = mul(q[0], inv_sqrt_dh); + K_vec qk_vec = mul(inv_q, k[0]); #pragma unroll for (int ii = 1; ii < N; ++ii) { - qk_vec = fma(q[ii], k[ii], qk_vec); + inv_q = mul(q[ii], inv_sqrt_dh); + qk_vec = fma(inv_q, k[ii], qk_vec); } float qk = sum(qk_vec); @@ -463,8 +529,10 @@ inline __device__ float qk_dot_(const K_vec (&q)[N], const K_vec (&k)[N]) { template struct Qk_dot { template - static inline __device__ float dot(const K_vec (&q)[N], const K_vec (&k)[N]) { - return qk_dot_(q, k); + static inline __device__ float dot(const K_vec (&q)[N], + const K_vec (&k)[N], + float inv_sqrt_dh) { + return qk_dot_(q, k, inv_sqrt_dh); } }; @@ -706,7 +774,9 @@ __global__ void masked_multihead_attention_kernel( } } - float qk = Qk_dot::dot(q, k) * params.inv_sqrt_dh; + // NOTE(liyurui): We should multiple q with inv_sqrt_dh first, for dot(q, k) + // may overflow with FP16 in large model. + float qk = Qk_dot::dot(q, k, params.inv_sqrt_dh); // bool is_mask = false; if (ti < params.timestep && tid % THREADS_PER_KEY == 0) { @@ -1119,17 +1189,23 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // y: qkv's weight: [3, num_head, dim_head, dim_embed] auto qkv_weights = ctx.MultiInput("QKVW"); auto qkv_biases = ctx.MultiInput("QKVBias"); + const bool trans_qkvw = ctx.Attr("trans_qkvw"); const auto qkv_w_dims = qkv_weights[0]->dims(); - int num_head = qkv_w_dims[1]; - int dim_head = qkv_w_dims[2]; + int num_head = trans_qkvw ? qkv_w_dims[1] : qkv_w_dims[2]; + int dim_head = trans_qkvw ? qkv_w_dims[2] : qkv_w_dims[3]; int hidden_size = num_head * dim_head; int output_size = 3 * hidden_size; int input_size = dim_embed; bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr; - // (transA, transB, compute_bias) = (false, true, false) - auto qkv_compute = AttnMatMul( - dev_ctx, false, true, bsz_seq, output_size, input_size, compute_bias); + // (transA, transB, compute_bias) = (false, trans_qkvw, false) + auto qkv_compute = AttnMatMul(dev_ctx, + false, + trans_qkvw, + bsz_seq, + output_size, + input_size, + compute_bias); Tensor qkv_out; auto *qkv_out_data = qkv_out.mutable_data({bsz, seq_len, 3, num_head, dim_head}, place); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 9e31d6cfcfb6e..9556ed12880ae 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -310,7 +310,7 @@ class FusionGRUKernel : public framework::OpKernel { T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; INIT_BASE_DEFINES; INIT_OTHER_DEFINES; const int N = x_lod[0].size() - 1; @@ -400,7 +400,7 @@ class FusionGRUKernel : public framework::OpKernel { } void BatchCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; INIT_BASE_DEFINES; if (x_lod[0].size() == 2) { xx->Resize({total_T, D3}); diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 282b0a22a8cbe..5454c90b3c596 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -306,23 +306,23 @@ This operator fuse the X into LSTM, more 
details can refer to LSTM op. template class FuisonLSTMKernel : public framework::OpKernel { public: -#define INIT_BASE_DEFINES \ - using DeviceContext = paddle::platform::CPUDeviceContext; \ - auto* x = ctx.Input("X"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - auto* cell_out = ctx.Output("Cell"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - bool use_peepholes = ctx.Attr("use_peepholes"); \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ +#define INIT_BASE_DEFINES \ + using DeviceContext = phi::CPUContext; \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ const int D4 = wh_dims[1] #define INIT_OTHER_DEFINES \ diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index e88deeae21431..2ebac6d7f7124 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -149,7 +149,7 @@ template class FusionSeqConvEltAddReluKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; auto* x = ctx.Input("X"); auto* w = ctx.Input("Filter"); auto* b = ctx.Input("Bias"); diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index f022d4156f4fa..6655c6756a5c8 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -149,7 +149,7 @@ template class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; auto ins = ctx.MultiInput("X"); auto* w = ctx.Input("FCWeight"); auto* b = ctx.Input("FCBias"); @@ -239,7 +239,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { auto blas = phi::funcs::GetBlas(ctx); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); phi::funcs::FCFunctor fc; fc(dev_ctx, total_T, diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 7b72f84191e04..ff983684708aa 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -22,9 +22,9 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; -using paddle::platform::CPUDeviceContext; using 
paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; +using phi::CPUContext; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index 1258e6bfaf21c..748de5dae9520 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -22,9 +22,9 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; -using paddle::platform::CPUDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; +using phi::CPUContext; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h b/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h index b999dddf8cfb0..a357a59a09420 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h +++ b/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { @@ -19,10 +21,10 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; -using paddle::platform::CPUDeviceContext; using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; +using phi::CPUContext; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 44b39b7a80ab7..c59e7d661607c 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -28,10 +28,10 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; -using paddle::platform::CPUDeviceContext; using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; +using phi::CPUContext; using phi::vectorize; using platform::to_void_cast; using Direction = dnnl::rnn_direction; diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc new file mode 100644 index 0000000000000..5990db8147be4 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_basic_block_op.cc @@ -0,0 +1,577 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/api/all.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class ResNetBasicBlockOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + // Check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Filter1"), "Input", "Filter1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale1"), "Input", "Scale1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias1"), "Input", "Bias1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Mean1"), "Input", "Mean1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Var1"), "Input", "Var1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Filter2"), "Input", "Filter2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale2"), "Input", "Scale2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias2"), "Input", "Bias2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Mean2"), "Input", "Mean2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Var2"), "Input", "Var2", "ResNetBasicBlockOp"); + + bool has_shortcut = ctx->Attrs().Get("has_shortcut"); + if (has_shortcut) { + OP_INOUT_CHECK( + ctx->HasInput("Filter3"), "Input", "Filter3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale3"), "Input", "Scale3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias3"), "Input", "Bias3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Mean3"), "Input", "Mean3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Var3"), "Input", "Var3", "ResNetBasicBlockOp"); + } + + // Check output + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Conv1"), "Output", "Conv1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean1"), + "Output", + "SavedMean1", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd1"), + "Output", + "SavedInvstd1", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Mean1Out"), "Output", "Mean1Out", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Var1Out"), "Output", "Var1Out", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Conv2"), "Output", "Conv2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean2"), + "Output", + "SavedMean2", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd2"), + "Output", + "SavedInvstd2", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Mean2Out"), "Output", "Mean2Out", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Var2Out"), "Output", "Var2Out", "ResNetBasicBlockOp"); + if (has_shortcut) { + OP_INOUT_CHECK( + ctx->HasOutput("Conv3"), "Output", "Conv3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean3"), + "Output", + "SavedMean3", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd3"), + "Output", + "SavedInvstd3", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("Mean3Out"), + "Output", + "Mean3Out", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Var3Out"), "Output", "Var3Out", "ResNetBasicBlockOp"); + } + + // make sure Mean/RunningMean and Var/RunningVar share memory + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean1")[0], + ctx->Outputs("Mean1Out")[0], + 
platform::errors::InvalidArgument( + "Mean1 and Mean1Out should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("Var1")[0], + ctx->Outputs("Var1Out")[0], + platform::errors::InvalidArgument( + "Var1 and Var1Out should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean2")[0], + ctx->Outputs("Mean2Out")[0], + platform::errors::InvalidArgument( + "Mean2 and Mean2Out should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("Var2")[0], + ctx->Outputs("Var2Out")[0], + platform::errors::InvalidArgument( + "Var2 and Var2Out should share the same memory")); + + if (has_shortcut) { + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean3")[0], + ctx->Outputs("Mean3Out")[0], + platform::errors::InvalidArgument( + "Mean3 and Mean3Out should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("Var3")[0], + ctx->Outputs("Var3Out")[0], + platform::errors::InvalidArgument( + "Var3 and Var3Out should share the same memory")); + } + + // Check dims of inputs + auto data_format = ctx->Attrs().Get("data_format"); + PADDLE_ENFORCE_EQ( + data_format, + "NCHW", + platform::errors::InvalidArgument("The data format must equal to NCHW. " + "But received: the data format " + "= [%s]", + data_format)); + int stride1 = ctx->Attrs().Get("stride1"); + int stride2 = ctx->Attrs().Get("stride2"); + int padding1 = ctx->Attrs().Get("padding1"); + int padding2 = ctx->Attrs().Get("padding2"); + + const auto x1_dims = ctx->GetInputDim("X"); + const auto w1_dims = ctx->GetInputDim("Filter1"); + const auto bn1_param_dims = ctx->GetInputDim("Scale1"); + PADDLE_ENFORCE_EQ( + x1_dims.size(), + 4, + platform::errors::InvalidArgument("The dimensions of input " + "must equal to 4." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x1_dims, + x1_dims.size())); + + // Calculate the dims of output1 + int batch = x1_dims[0]; + int output1_channel = w1_dims[0]; + int filter1_size = w1_dims[2]; + int out1_h = (x1_dims[2] + padding1 * 2 - filter1_size) / stride1 + 1; + int out1_w = (x1_dims[3] + padding1 * 2 - filter1_size) / stride1 + 1; + std::vector out1_shape = {batch, output1_channel, out1_h, out1_w}; + + const auto w2_dims = ctx->GetInputDim("Filter2"); + const auto bn2_param_dims = ctx->GetInputDim("Scale2"); + int output2_channel = w2_dims[0]; + int filter2_size = w2_dims[2]; + int out2_h = (out1_h + padding2 * 2 - filter2_size) / stride2 + 1; + int out2_w = (out1_w + padding2 * 2 - filter2_size) / stride2 + 1; + std::vector out2_shape = {batch, output2_channel, out2_h, out2_w}; + + auto y_dims = phi::make_ddim(out2_shape); + auto conv1_dims = phi::make_ddim(out1_shape); + ctx->SetOutputDim("Y", y_dims); + ctx->SetOutputDim("Conv1", conv1_dims); + ctx->SetOutputDim("SavedMean1", bn1_param_dims); + ctx->SetOutputDim("SavedInvstd1", bn1_param_dims); + ctx->SetOutputDim("Mean1Out", bn1_param_dims); + ctx->SetOutputDim("Var1Out", bn1_param_dims); + ctx->SetOutputDim("Conv2", y_dims); + ctx->SetOutputDim("Conv2Input", conv1_dims); + ctx->SetOutputDim("SavedMean2", bn2_param_dims); + ctx->SetOutputDim("SavedInvstd2", bn2_param_dims); + ctx->SetOutputDim("Mean2Out", bn2_param_dims); + ctx->SetOutputDim("Var2Out", bn2_param_dims); + if (has_shortcut) { + ctx->SetOutputDim("Conv3", y_dims); + ctx->SetOutputDim("SavedMean3", bn2_param_dims); + ctx->SetOutputDim("SavedInvstd3", bn2_param_dims); + ctx->SetOutputDim("Mean3Out", bn2_param_dims); + ctx->SetOutputDim("Var3Out", bn2_param_dims); + } + + bool find_max = ctx->Attrs().Get("find_conv_input_max"); + if (find_max) { + auto max_dims = 
phi::make_ddim({6}); + ctx->SetOutputDim("MaxInput1", max_dims); + ctx->SetOutputDim("MaxFilter1", max_dims); + ctx->SetOutputDim("MaxInput2", max_dims); + ctx->SetOutputDim("MaxFilter2", max_dims); + if (has_shortcut) { + ctx->SetOutputDim("MaxInput3", max_dims); + ctx->SetOutputDim("MaxFilter3", max_dims); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + // By default, the type of the scale, bias, mean, + // and var tensors should be float when input tensor's dtype is float16. + auto bn_param_type = framework::proto::VarType::FP32; + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Scale1")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Bias1")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Scale2")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Bias2")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + input_data_type, ctx.GetPlace(), layout, library); + } +}; + +class ResNetBasicBlockOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + // has_shortcut = True: else: + // X X + // / / + // | | | | + // CONV1 | CONV1 | + // | | | | + // BN1 | BN1 | + // | | | | + // RELU1 | RELU1 | + // | | | | + // CONV2 CONV3 CONV2 | + // | | | | + // BN2 BN3 BN2 | + // \ / \ / + // ADD ADD + // | | + // RELU RELU + // | | + // Y Y + AddInput("X", "Input tensor of conv 1"); + AddInput("Filter1", "Filter tensor of conv 1"); + AddInput("Scale1", "Scale tensor of bn 1"); + AddInput("Bias1", "Bias tensor of bn 1"); + AddInput("Mean1", "Mean tensor of bn 1"); + AddInput("Var1", "Variance tensor of bn 1"); + AddInput("Filter2", "Filter tensor of conv 2"); + AddInput("Scale2", "Scale tensor of bn 2"); + AddInput("Bias2", "Bias tensor of bn 2"); + AddInput("Mean2", "Mean tensor of bn 2"); + AddInput("Var2", "Variance tensor of bn 2"); + AddInput("Filter3", "Filter tensor of conv 3").AsDispensable(); + AddInput("Scale3", "Scale tensor of bn 3").AsDispensable(); + AddInput("Bias3", "Bias tensor of bn 3").AsDispensable(); + AddInput("Mean3", "Mean tensor of bn 3").AsDispensable(); + AddInput("Var3", "Variance tensor of bn 3").AsDispensable(); + AddOutput("Y", "The result of ssd resnet unit"); + AddOutput("Conv1", "The result of conv 1"); + AddOutput("SavedMean1", "Mean of input 1 after conv 1"); + AddOutput("SavedInvstd1", "Invstd of input 1 after conv 1"); + AddOutput("Mean1Out", "Shared memory with Mean1"); + AddOutput("Var1Out", "Shared memory with Var1"); + AddOutput("Conv2", "The result of conv 2"); + AddOutput("Conv2Input", "Conv2 input data"); + AddOutput("SavedMean2", "Mean of input 2 after conv 2"); + AddOutput("SavedInvstd2", "Invstd of input 2 after conv 2"); + AddOutput("Mean2Out", "Shared memory with Mean2"); + AddOutput("Var2Out", "Shared memory with Var2"); + AddOutput("Conv3", "The result of conv 
3").AsDispensable(); + AddOutput("SavedMean3", "Mean of input 3 after conv 3").AsDispensable(); + AddOutput("SavedInvstd3", "Invstd of input 3 after conv 3").AsDispensable(); + AddOutput("Mean3Out", "Shared memory with Mean3").AsDispensable(); + AddOutput("Var3Out", "Shared memory with Var3").AsDispensable(); + AddOutput("MaxInput1", "The max value of conv1 input tensor") + .AsDispensable(); + AddOutput("MaxFilter1", "The max value of conv1 filter tensor") + .AsDispensable(); + AddOutput("MaxInput2", "The max value of conv2 input tensor") + .AsDispensable(); + AddOutput("MaxFilter2", "The max value of conv2 filter tensor") + .AsDispensable(); + AddOutput("MaxInput3", "The max value of conv3 input tensor") + .AsDispensable(); + AddOutput("MaxFilter3", "The max value of conv3 filter tensor") + .AsDispensable(); + AddAttr("stride1", "Stride of conv1").SetDefault(1); + AddAttr("stride2", "Stride of conv2").SetDefault(1); + AddAttr("stride3", "Stride of conv3").SetDefault(1); + AddAttr("padding1", "Padding of conv1").SetDefault(0); + AddAttr("padding2", "Padding of conv2").SetDefault(0); + AddAttr("padding3", "Padding of conv3").SetDefault(0); + AddAttr("dilation1", "Dilation of conv1").SetDefault(1); + AddAttr("dilation2", "Dilation of conv2").SetDefault(1); + AddAttr("dilation3", "Dilation of conv3").SetDefault(1); + AddAttr("group", "Group of all the 3 conv").SetDefault(1); + AddAttr("momentum", "Momentum of all the 3 bn").SetDefault(0.9); + AddAttr("epsilon", "Epsilon of all the 3 bn").SetDefault(1e-5); + AddAttr("data_format", "").SetDefault("NCHW"); + AddAttr("has_shortcut", "").SetDefault(false); + AddAttr("use_global_stats", "").SetDefault(false); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr( + "trainable_statistics", + "(bool, default false) Whether to calculate mean and variance " + "in test mode. If setting true in test mode, mean and variace " + "will be calculated by current batch statistics.") + .SetDefault(false); + AddAttr("act_type", "The activation type to be fused.") + .SetDefault("relu"); + AddAttr("find_conv_input_max", + "(bool, default true) Whether to calculate max value of conv " + "input tensor.") + .SetDefault(true); + AddComment(R"DOC( +Fusion op of the basic unit of ssd resnet block. 
+** This is only use for XPU, if has problems, concat zhangyikun02@baidu.com ** +)DOC"); + } +}; + +template +class ResNetBasicBlockGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("resnet_basic_block_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Filter1", this->Input("Filter1")); + op->SetInput("Conv1", this->Output("Conv1")); + op->SetInput("Scale1", this->Input("Scale1")); + op->SetInput("Bias1", this->Input("Bias1")); + op->SetInput("SavedMean1", this->Output("SavedMean1")); + op->SetInput("SavedInvstd1", this->Output("SavedInvstd1")); + op->SetInput("Filter2", this->Input("Filter2")); + op->SetInput("Conv2", this->Output("Conv2")); + op->SetInput("Conv2Input", this->Output("Conv2Input")); + op->SetInput("Scale2", this->Input("Scale2")); + op->SetInput("Bias2", this->Input("Bias2")); + op->SetInput("SavedMean2", this->Output("SavedMean2")); + op->SetInput("SavedInvstd2", this->Output("SavedInvstd2")); + op->SetInput("Filter3", this->Input("Filter3")); + op->SetInput("Conv3", this->Output("Conv3")); + op->SetInput("Scale3", this->Input("Scale3")); + op->SetInput("Bias3", this->Input("Bias3")); + op->SetInput("SavedMean3", this->Output("SavedMean3")); + op->SetInput("SavedInvstd3", this->Output("SavedInvstd3")); + op->SetInput("MaxInput1", this->Output("MaxInput1")); + op->SetInput("MaxFilter1", this->Output("MaxFilter1")); + op->SetInput("MaxInput2", this->Output("MaxInput2")); + op->SetInput("MaxFilter2", this->Output("MaxFilter2")); + op->SetInput("MaxInput3", this->Output("MaxInput3")); + op->SetInput("MaxFilter3", this->Output("MaxFilter3")); + op->SetInput("Y", this->Output("Y")); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter1"), + this->InputGrad("Filter1")); + op->SetOutput(framework::GradVarName("Scale1"), this->InputGrad("Scale1")); + op->SetOutput(framework::GradVarName("Bias1"), this->InputGrad("Bias1")); + op->SetOutput(framework::GradVarName("Filter2"), + this->InputGrad("Filter2")); + op->SetOutput(framework::GradVarName("Scale2"), this->InputGrad("Scale2")); + op->SetOutput(framework::GradVarName("Bias2"), this->InputGrad("Bias2")); + op->SetOutput(framework::GradVarName("Filter3"), + this->InputGrad("Filter3")); + op->SetOutput(framework::GradVarName("Scale3"), this->InputGrad("Scale3")); + op->SetOutput(framework::GradVarName("Bias3"), this->InputGrad("Bias3")); + } +}; + +class ResNetBasicBlockOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Y"}}; + return m; + } +}; + +class ResNetBasicBlockGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Filter1"), "Input", "Filter1", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Conv1"), "Input", "Conv1", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale1"), "Input", "Scale1", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias1"), "Input", "Bias1", 
"ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMean1"), + "Input", + "SavedMean1", + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedInvstd1"), + "Input", + "SavedInvstd1", + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Filter2"), "Input", "Filter2", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Conv2"), "Input", "Conv2", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale2"), "Input", "Scale2", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias2"), "Input", "Bias2", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMean2"), + "Input", + "SavedMean2", + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedInvstd2"), + "Input", + "SavedInvstd2", + "ResNetBasicBlockGradOp"); + bool has_shortcut = ctx->Attrs().Get("has_shortcut"); + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("Filter3"), + "Input", + "Filter3", + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale3"), "Input", "Scale3", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias3"), "Input", "Bias3", "ResNetBasicBlockGradOp"); + } + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), + "Input", + framework::GradVarName("Y"), + "ResNetBasicBlockGradOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter1")), + "Output", + framework::GradVarName("Filter1"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale1")), + "Output", + framework::GradVarName("Scale1"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias1")), + "Output", + framework::GradVarName("Bias1"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter2")), + "Output", + framework::GradVarName("Filter2"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale2")), + "Output", + framework::GradVarName("Scale2"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias2")), + "Output", + framework::GradVarName("Bias2"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), + "Output", + framework::GradVarName("X"), + "ResNetBasicBlockGradOp"); + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter3")), + "Output", + framework::GradVarName("Filter3"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale3")), + "Output", + framework::GradVarName("Scale3"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias3")), + "Output", + framework::GradVarName("Bias3"), + "ResNetBasicBlockGradOp"); + } + + const auto x1_dims = ctx->GetInputDim("X"); + const auto filter1_x_dims = ctx->GetInputDim("Filter1"); + const auto param1_dims = ctx->GetInputDim("Scale1"); + const auto filter2_x_dims = ctx->GetInputDim("Filter2"); + const auto param2_dims = ctx->GetInputDim("Scale2"); + ctx->SetOutputDim(framework::GradVarName("X"), x1_dims); + ctx->SetOutputDim(framework::GradVarName("Filter1"), filter1_x_dims); + ctx->SetOutputDim(framework::GradVarName("Scale1"), param1_dims); + ctx->SetOutputDim(framework::GradVarName("Bias1"), param1_dims); + ctx->SetOutputDim(framework::GradVarName("Filter2"), filter2_x_dims); + ctx->SetOutputDim(framework::GradVarName("Scale2"), param2_dims); + 
ctx->SetOutputDim(framework::GradVarName("Bias2"), param2_dims); + if (has_shortcut) { + const auto filter_z_dims = ctx->GetInputDim("Filter3"); + ctx->SetOutputDim(framework::GradVarName("Filter3"), filter_z_dims); + ctx->SetOutputDim(framework::GradVarName("Scale3"), param2_dims); + ctx->SetOutputDim(framework::GradVarName("Bias3"), param2_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + PADDLE_ENFORCE_NOT_NULL( + ctx.InputVar(framework::GradVarName("Y")), + platform::errors::NotFound( + "Can not find Y@GRAD in the execution context.")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.GetPlace(), + layout, + library); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(resnet_basic_block, + ops::ResNetBasicBlockOp, + ops::ResNetBasicBlockOpMaker, + ops::ResNetBasicBlockOpInferVarType, + ops::ResNetBasicBlockGradOpMaker, + ops::ResNetBasicBlockGradOpMaker); +REGISTER_OPERATOR(resnet_basic_block_grad, ops::ResNetBasicBlockGradOp); diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc new file mode 100644 index 0000000000000..52e6807f15c67 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -0,0 +1,966 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/phi/api/all.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class ResnetBasicBlockAttr { + public: + explicit ResnetBasicBlockAttr(const framework::ExecutionContext& ctx) { + padding1 = ctx.Attr("padding1"); + padding2 = ctx.Attr("padding2"); + padding3 = ctx.Attr("padding3"); + stride1 = ctx.Attr("stride1"); + stride2 = ctx.Attr("stride2"); + stride3 = ctx.Attr("stride3"); + dilation1 = ctx.Attr("dilation1"); + dilation2 = ctx.Attr("dilation2"); + dilation3 = ctx.Attr("dilation3"); + group = ctx.Attr("group"); + + eps = static_cast(ctx.Attr("epsilon")); + momentum = static_cast(ctx.Attr("momentum")); + has_shortcut = ctx.Attr("has_shortcut"); + find_max = ctx.Attr("find_conv_input_max"); + + const auto is_test = ctx.Attr("is_test"); + const auto use_global_stats = ctx.Attr("use_global_stats"); + const auto trainable_stats = ctx.Attr("trainable_statistics"); + bool test_mode = is_test && (!trainable_stats); + global_stats = test_mode || use_global_stats; + + // init shape + auto input1 = ctx.Input("X"); + auto filter1 = ctx.Input("Filter1"); + auto conv1_out = ctx.Output("Conv1"); + auto filter2 = ctx.Input("Filter2"); + auto conv2_out = ctx.Output("Conv2"); + conv1_input_shape = phi::vectorize(input1->dims()); + conv1_output_shape = phi::vectorize(conv1_out->dims()); + conv1_filter_shape = phi::vectorize(filter1->dims()); + conv1_filter_numel = filter1->numel(); + conv1_input_numel = input1->numel(); + conv1_output_numel = conv1_out->numel(); + + conv2_input_shape = phi::vectorize(conv1_out->dims()); + conv2_output_shape = phi::vectorize(conv2_out->dims()); + conv2_filter_shape = phi::vectorize(filter2->dims()); + conv2_filter_numel = filter2->numel(); + conv2_input_numel = conv1_out->numel(); + conv2_output_numel = conv2_out->numel(); + + if (has_shortcut) { + auto filter3 = ctx.Input("Filter3"); + auto conv3_out = ctx.Output("Conv3"); + conv3_input_shape = phi::vectorize(input1->dims()); + conv3_output_shape = phi::vectorize(conv3_out->dims()); + conv3_filter_shape = phi::vectorize(filter3->dims()); + conv3_filter_numel = filter3->numel(); + conv3_input_numel = input1->numel(); + conv3_output_numel = conv3_out->numel(); + } + } + + int padding1; + int padding2; + int padding3; + int stride1; + int stride2; + int stride3; + int dilation1; + int dilation2; + int dilation3; + int group; + + double eps; + double momentum; + + bool has_shortcut; + bool find_max; + bool global_stats; + + std::vector conv1_input_shape; + std::vector conv1_output_shape; + std::vector conv1_filter_shape; + std::vector conv2_input_shape; + std::vector conv2_output_shape; + std::vector conv2_filter_shape; + std::vector conv3_input_shape; + std::vector conv3_output_shape; + std::vector conv3_filter_shape; + + int conv1_filter_numel; + int conv2_filter_numel; + int conv3_filter_numel; + int conv1_input_numel; + int conv2_input_numel; + int conv3_input_numel; + int conv1_output_numel; + int conv2_output_numel; + int conv3_output_numel; +}; + +class ResnetBasicBlockGradAttr { + public: + explicit ResnetBasicBlockGradAttr(const framework::ExecutionContext& ctx) { + padding1 = ctx.Attr("padding1"); + padding2 = ctx.Attr("padding2"); + padding3 = ctx.Attr("padding3"); + stride1 = ctx.Attr("stride1"); + stride2 = ctx.Attr("stride2"); 
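+    // The conv3 attributes (padding3, stride3, dilation3) are only consumed by the
+    // shortcut branch, i.e. when has_shortcut is true.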
+ stride3 = ctx.Attr("stride3"); + dilation1 = ctx.Attr("dilation1"); + dilation2 = ctx.Attr("dilation2"); + dilation3 = ctx.Attr("dilation3"); + group = ctx.Attr("group"); + + has_shortcut = ctx.Attr("has_shortcut"); + find_max = ctx.Attr("find_conv_input_max"); + + // init shape + auto input1 = ctx.Input("X"); + auto filter1 = ctx.Input("Filter1"); + auto conv1_out = ctx.Input("Conv1"); + auto filter2 = ctx.Input("Filter2"); + auto conv2_out = ctx.Input("Conv2"); + conv1_input_shape = phi::vectorize(input1->dims()); + conv1_output_shape = phi::vectorize(conv1_out->dims()); + conv1_filter_shape = phi::vectorize(filter1->dims()); + conv1_filter_numel = filter1->numel(); + conv1_input_numel = input1->numel(); + conv1_output_numel = conv1_out->numel(); + + conv2_input_shape = phi::vectorize(conv1_out->dims()); + conv2_output_shape = phi::vectorize(conv2_out->dims()); + conv2_filter_shape = phi::vectorize(filter2->dims()); + conv2_filter_numel = filter2->numel(); + conv2_input_numel = conv1_out->numel(); + conv2_output_numel = conv2_out->numel(); + + if (has_shortcut) { + auto filter3 = ctx.Input("Filter3"); + auto conv3_out = ctx.Input("Conv3"); + conv3_input_shape = phi::vectorize(input1->dims()); + conv3_output_shape = phi::vectorize(conv3_out->dims()); + conv3_filter_shape = phi::vectorize(filter3->dims()); + conv3_filter_numel = filter3->numel(); + conv3_input_numel = input1->numel(); + conv3_output_numel = conv3_out->numel(); + } + } + + int padding1; + int padding2; + int padding3; + int stride1; + int stride2; + int stride3; + int dilation1; + int dilation2; + int dilation3; + int group; + + bool has_shortcut; + bool find_max; + + std::vector conv1_input_shape; + std::vector conv1_output_shape; + std::vector conv1_filter_shape; + std::vector conv2_input_shape; + std::vector conv2_output_shape; + std::vector conv2_filter_shape; + std::vector conv3_input_shape; + std::vector conv3_output_shape; + std::vector conv3_filter_shape; + + int conv1_filter_numel; + int conv2_filter_numel; + int conv3_filter_numel; + int conv1_input_numel; + int conv2_input_numel; + int conv3_input_numel; + int conv1_output_numel; + int conv2_output_numel; + int conv3_output_numel; +}; + +template +static inline void xpu_conv2d(xpu::Context* ctx, + const T* input_data, + const T* filter_data, + T* output_data, + float* input_max_data, + float* filter_max_data, + const std::vector& input_shape, + const std::vector& filter_shape, + int padding, + int stride, + int dilation, + int group) { + std::vector ksize{filter_shape[2], filter_shape[3]}; + std::vector stride_vec{stride, stride}; + std::vector dilation_vec{dilation, dilation}; + std::vector padding_vec{padding, padding}; + int N = input_shape[0]; + int C = input_shape[1]; + int H = input_shape[2]; + int W = input_shape[3]; + + int r = xpu::conv2d(ctx, + input_data, + filter_data, + output_data, + N, + C, + H, + W, + filter_shape[0], + ksize, + stride_vec, + padding_vec, + dilation_vec, + group, + input_max_data, + filter_max_data, + nullptr, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d"); +} + +template +static inline void xpu_conv2d_grad(xpu::Context* ctx, + const T* input_data, + const T* filter_data, + const T* output_grad_data, + T* input_grad_data, + T* filter_grad_data, + const float* input_max_data, + const float* filter_max_data, + const std::vector& input_shape, + const std::vector& filter_shape, + int padding, + int stride, + int dilation, + int group) { + std::vector ksize{filter_shape[2], filter_shape[3]}; + std::vector stride_vec{stride, 
stride}; + std::vector dilation_vec{dilation, dilation}; + std::vector padding_vec{padding, padding}; + int N = input_shape[0]; + int C = input_shape[1]; + int H = input_shape[2]; + int W = input_shape[3]; + + int r = xpu::conv2d_grad(ctx, + input_data, + filter_data, + output_grad_data, + input_grad_data, + filter_grad_data, + N, + C, + H, + W, + filter_shape[0], + ksize, + stride_vec, + padding_vec, + dilation_vec, + group, + input_max_data, + filter_max_data, + nullptr, + nullptr, + nullptr, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_grad"); +} + +template +class ResNetBasicBlockXPUKernel : public framework::OpKernel { + public: + using XPUT = typename XPUTypeTrait::Type; + + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(ctx.GetPlace()), + true, + platform::errors::PreconditionNotMet("It must use XPUPlace.")); + + // input + const Tensor* x = ctx.Input("X"); + const Tensor* filter1 = ctx.Input("Filter1"); + const Tensor* scale1 = ctx.Input("Scale1"); + const Tensor* bias1 = ctx.Input("Bias1"); + const Tensor* filter2 = ctx.Input("Filter2"); + const Tensor* scale2 = ctx.Input("Scale2"); + const Tensor* bias2 = ctx.Input("Bias2"); + + // output + Tensor* conv1_output = ctx.Output("Conv1"); + Tensor* conv2_output = ctx.Output("Conv2"); + Tensor* conv2_input = ctx.Output("Conv2Input"); + Tensor* output = ctx.Output("Y"); + + auto place = ctx.GetPlace(); + auto x_data = reinterpret_cast(x->data()); + auto conv1_filter_data = reinterpret_cast(filter1->data()); + auto conv2_filter_data = reinterpret_cast(filter2->data()); + auto conv1_output_data = + reinterpret_cast(conv1_output->mutable_data(place)); + auto conv2_input_data = + reinterpret_cast(conv2_input->mutable_data(place)); + auto conv2_output_data = + reinterpret_cast(conv2_output->mutable_data(place)); + auto scale1_data = scale1->data(); + auto scale2_data = scale2->data(); + auto bias1_data = bias1->data(); + auto bias2_data = bias2->data(); + auto output_data = reinterpret_cast(output->mutable_data(place)); + + float* conv1_input_max_data = nullptr; + float* conv1_filter_max_data = nullptr; + float* conv2_input_max_data = nullptr; + float* conv2_filter_max_data = nullptr; + float* conv3_input_max_data = nullptr; + float* conv3_filter_max_data = nullptr; + + ResnetBasicBlockAttr attr(ctx); + + // init find max + if (attr.find_max) { + Tensor* max_input1 = ctx.Output("MaxInput1"); + Tensor* max_filter1 = ctx.Output("MaxFilter1"); + conv1_input_max_data = max_input1->mutable_data(place); + conv1_filter_max_data = max_filter1->mutable_data(place); + + Tensor* max_input2 = ctx.Output("MaxInput2"); + Tensor* max_filter2 = ctx.Output("MaxFilter2"); + conv2_input_max_data = max_input2->mutable_data(place); + conv2_filter_max_data = max_filter2->mutable_data(place); + + if (attr.has_shortcut) { + Tensor* max_input3 = ctx.Output("MaxInput3"); + Tensor* max_filter3 = ctx.Output("MaxFilter3"); + conv3_input_max_data = max_input3->mutable_data(place); + conv3_filter_max_data = max_filter3->mutable_data(place); + } + } + + auto& dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int r = XPU_SUCCESS; + + // 1. 
short + const XPUT* z_out_data = nullptr; + if (attr.has_shortcut) { + Tensor* conv3_out = ctx.Output("Conv3"); + const Tensor* filter3 = ctx.Input("Filter3"); + auto conv3_filter_data = + reinterpret_cast(filter3->data()); + auto conv3_output_data = + reinterpret_cast(conv3_out->mutable_data(place)); + + XPUT* conv3_input_l3_data = nullptr; + XPUT* conv3_filter_l3_data = + RAII_GUARD.alloc_l3(attr.conv3_filter_numel); + + if (attr.find_max) { + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + x_data, + conv3_input_max_data, + conv3_input_l3_data, + attr.conv3_input_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + conv3_filter_data, + conv3_filter_max_data, + conv3_filter_l3_data, + attr.conv3_filter_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + } + + xpu_conv2d(dev_ctx.x_context(), + conv3_input_l3_data != nullptr ? conv3_input_l3_data : x_data, + conv3_filter_l3_data, + conv3_output_data, + conv3_input_max_data, + conv3_filter_max_data, + attr.conv3_input_shape, + attr.conv3_filter_shape, + attr.padding3, + attr.stride3, + attr.dilation3, + attr.group); + + // bn3 + const Tensor* scale3 = ctx.Input("Scale3"); + const Tensor* bias3 = ctx.Input("Bias3"); + auto bias3_data = bias3->data(); + auto scale3_data = scale3->data(); + + auto bn3_output_data = RAII_GUARD.alloc(attr.conv3_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(bn3_output_data); + + if (!attr.global_stats) { + Tensor* saved_mean3 = ctx.Output("SavedMean3"); + Tensor* saved_invstd3 = ctx.Output("SavedInvstd3"); + Tensor* running_mean3 = ctx.Output("Mean3Out"); + Tensor* running_var3 = ctx.Output("Var3Out"); + + auto saved_mean3_data = saved_mean3->mutable_data(place); + auto saved_invstd3_data = saved_invstd3->mutable_data(place); + auto running_mean3_data = running_mean3->mutable_data(place); + auto running_var3_data = running_var3->mutable_data(place); + + r = xpu::batch_norm_fusion(dev_ctx.x_context(), + conv3_output_data, + bn3_output_data, + attr.conv3_output_shape[0], + attr.conv3_output_shape[1], + attr.conv3_output_shape[3], + attr.conv3_output_shape[3], + attr.eps, + attr.momentum, + scale3_data, + bias3_data, + saved_mean3_data, + saved_invstd3_data, + running_mean3_data, + running_var3_data, + true, + nullptr, + xpu::Activation_t::LINEAR, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); + } else { + const auto* mean3 = ctx.Input("Mean3"); + const auto* var3 = ctx.Input("Var3"); + const auto* mean3_data = mean3->data(); + const auto* variance3_data = var3->data(); + r = xpu::batch_norm_infer(dev_ctx.x_context(), + conv3_output_data, + bn3_output_data, + attr.conv3_output_shape[0], + attr.conv3_output_shape[1], + attr.conv3_output_shape[2], + attr.conv3_output_shape[3], + attr.eps, + scale3_data, + bias3_data, + mean3_data, + variance3_data, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); + } + z_out_data = reinterpret_cast(bn3_output_data); + } else { + z_out_data = x_data; + } + + // 2. 
conv1 + XPUT* conv1_input_l3_data = nullptr; + XPUT* conv1_filter_l3_data = + RAII_GUARD.alloc_l3(attr.conv1_filter_numel); + if (attr.find_max) { + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + x_data, + conv1_input_max_data, + conv1_input_l3_data, + attr.conv1_input_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + conv1_filter_data, + conv1_filter_max_data, + conv1_filter_l3_data, + attr.conv1_filter_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + } + xpu_conv2d(dev_ctx.x_context(), + conv1_input_l3_data != nullptr ? conv1_input_l3_data : x_data, + conv1_filter_l3_data, + conv1_output_data, + conv1_input_max_data, + conv1_filter_max_data, + attr.conv1_input_shape, + attr.conv1_filter_shape, + attr.padding1, + attr.stride1, + attr.dilation1, + attr.group); + + // 3. bn1 + relu + if (!attr.global_stats) { + Tensor* saved_mean1 = ctx.Output("SavedMean1"); + Tensor* saved_invstd1 = ctx.Output("SavedInvstd1"); + Tensor* running_mean1 = ctx.Output("Mean1Out"); + Tensor* running_var1 = ctx.Output("Var1Out"); + + auto saved_mean1_data = saved_mean1->mutable_data(place); + auto saved_invstd1_data = saved_invstd1->mutable_data(place); + auto running_mean1_data = running_mean1->mutable_data(place); + auto running_var1_data = running_var1->mutable_data(place); + + r = xpu::batch_norm_fusion(dev_ctx.x_context(), + conv1_output_data, + conv2_input_data, + attr.conv1_output_shape[0], + attr.conv1_output_shape[1], + attr.conv1_output_shape[2], + attr.conv1_output_shape[3], + attr.eps, + attr.momentum, + scale1_data, + bias1_data, + saved_mean1_data, + saved_invstd1_data, + running_mean1_data, + running_var1_data, + true, + nullptr, + xpu::Activation_t::RELU, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); + } else { + // bn --> relu + auto bn1_output_data = RAII_GUARD.alloc(attr.conv1_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(bn1_output_data); + + const auto* mean1 = ctx.Input("Mean1"); + const auto* var1 = ctx.Input("Var1"); + const auto* mean_data = mean1->data(); + const auto* variance_data = var1->data(); + r = xpu::batch_norm_infer(dev_ctx.x_context(), + conv1_output_data, + bn1_output_data, + attr.conv1_output_shape[0], + attr.conv1_output_shape[1], + attr.conv1_output_shape[2], + attr.conv1_output_shape[3], + attr.eps, + scale1_data, + bias1_data, + mean_data, + variance_data, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); + + r = xpu::relu(dev_ctx.x_context(), + bn1_output_data, + conv2_input_data, + attr.conv1_output_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); + } + + // 4. conv2 + XPUT* conv2_input_l3_data = nullptr; + XPUT* conv2_filter_l3_data = + RAII_GUARD.alloc_l3(attr.conv2_filter_numel); + if (attr.find_max) { + Tensor* max_input2 = ctx.Output("MaxInput2"); + Tensor* max_filter2 = ctx.Output("MaxFilter2"); + conv2_input_max_data = max_input2->mutable_data(place); + conv2_filter_max_data = max_filter2->mutable_data(place); + + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + conv2_input_data, + conv2_input_max_data, + conv2_input_l3_data, + attr.conv2_input_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + conv2_filter_data, + conv2_filter_max_data, + conv2_filter_l3_data, + attr.conv2_filter_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + } + xpu_conv2d( + dev_ctx.x_context(), + conv2_input_l3_data != nullptr ? 
conv2_input_l3_data : conv2_input_data, + conv2_filter_l3_data, + conv2_output_data, + conv2_input_max_data, + conv2_filter_max_data, + attr.conv2_input_shape, + attr.conv2_filter_shape, + attr.padding2, + attr.stride2, + attr.dilation2, + attr.group); + + // 5. bn2 + if (!attr.global_stats) { + Tensor* saved_mean2 = ctx.Output("SavedMean2"); + Tensor* saved_var2 = ctx.Output("SavedInvstd2"); + Tensor* running_mean2 = ctx.Output("Mean2Out"); + Tensor* running_var2 = ctx.Output("Var2Out"); + + auto saved_mean2_data = saved_mean2->mutable_data(place); + auto saved_var2_data = saved_var2->mutable_data(place); + auto running_mean2_data = running_mean2->mutable_data(place); + auto running_var2_data = running_var2->mutable_data(place); + + r = xpu::batch_norm_fusion(dev_ctx.x_context(), + conv2_output_data, + output_data, + attr.conv2_output_shape[0], + attr.conv2_output_shape[1], + attr.conv2_output_shape[2], + attr.conv2_output_shape[3], + attr.eps, + attr.momentum, + scale2_data, + bias2_data, + saved_mean2_data, + saved_var2_data, + running_mean2_data, + running_var2_data, + true, + z_out_data, + xpu::Activation_t::RELU, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); + } else { + auto bn2_out_data = RAII_GUARD.alloc(attr.conv2_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(bn2_out_data); + + const auto* mean2 = ctx.Input("Mean2"); + const auto* var2 = ctx.Input("Var2"); + const auto* mean_data = mean2->data(); + const auto* variance_data = var2->data(); + r = xpu::batch_norm_infer(dev_ctx.x_context(), + conv2_output_data, + bn2_out_data, + attr.conv2_output_shape[0], + attr.conv2_output_shape[1], + attr.conv2_output_shape[2], + attr.conv2_output_shape[3], + attr.eps, + scale2_data, + bias2_data, + mean_data, + variance_data, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); + + r = xpu::add_activation_fusion(dev_ctx.x_context(), + bn2_out_data, + z_out_data, + output_data, + output->numel(), + nullptr, + nullptr, + nullptr, + xpu::Activation_t::RELU); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add_activation_fusion"); + } + } +}; + +template +class ResNetBasicBlockGradXPUKernel : public framework::OpKernel { + public: + using XPUT = typename XPUTypeTrait::Type; + + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(ctx.GetPlace()), + true, + platform::errors::PreconditionNotMet("It must use XPUPlace.")); + + const Tensor* y_grad = ctx.Input(framework::GradVarName("Y")); + const Tensor* y = ctx.Input("Y"); + + const Tensor* x = ctx.Input("X"); + const Tensor* filter1 = ctx.Input("Filter1"); + const Tensor* scale1 = ctx.Input("Scale1"); + const Tensor* filter2 = ctx.Input("Filter2"); + const Tensor* scale2 = ctx.Input("Scale2"); + const Tensor* saved_mean1 = ctx.Input("SavedMean1"); + const Tensor* saved_invstd1 = ctx.Input("SavedInvstd1"); + const Tensor* saved_mean2 = ctx.Input("SavedMean2"); + const Tensor* saved_invstd2 = ctx.Input("SavedInvstd2"); + const Tensor* conv1_out = ctx.Input("Conv1"); + const Tensor* conv2_out = ctx.Input("Conv2"); + const Tensor* conv2_input = ctx.Input("Conv2Input"); + + const Tensor* filter3 = ctx.Input("Filter3"); + const Tensor* conv3_out = ctx.Input("Conv3"); + const Tensor* scale3 = ctx.Input("Scale3"); + const Tensor* saved_mean3 = ctx.Input("SavedMean3"); + const Tensor* saved_invstd3 = ctx.Input("SavedInvstd3"); + + const Tensor* conv1_input_max = ctx.Input("MaxInput1"); + const Tensor* conv1_filter_max = ctx.Input("MaxFilter1"); + const Tensor* conv2_input_max 
= ctx.Input("MaxInput2"); + const Tensor* conv2_filter_max = ctx.Input("MaxFilter2"); + const Tensor* conv3_input_max = ctx.Input("MaxInput3"); + const Tensor* conv3_filter_max = ctx.Input("MaxFilter3"); + + Tensor* x_grad = ctx.Output(framework::GradVarName("X")); + Tensor* filter1_grad = + ctx.Output(framework::GradVarName("Filter1")); + Tensor* scale1_grad = ctx.Output(framework::GradVarName("Scale1")); + Tensor* bias1_grad = ctx.Output(framework::GradVarName("Bias1")); + Tensor* filter2_grad = + ctx.Output(framework::GradVarName("Filter2")); + Tensor* scale2_grad = ctx.Output(framework::GradVarName("Scale2")); + Tensor* bias2_grad = ctx.Output(framework::GradVarName("Bias2")); + Tensor* filter3_grad = + ctx.Output(framework::GradVarName("Filter3")); + Tensor* scale3_grad = ctx.Output(framework::GradVarName("Scale3")); + Tensor* bias3_grad = ctx.Output(framework::GradVarName("Bias3")); + + // attrs + ResnetBasicBlockGradAttr attr(ctx); + auto place = ctx.GetPlace(); + + const auto* y_grad_data = reinterpret_cast(y_grad->data()); + const auto* y_data = reinterpret_cast(y->data()); + const auto* x_data = reinterpret_cast(x->data()); + const auto* conv1_output_data = + reinterpret_cast(conv1_out->data()); + const auto* conv1_filter_data = + reinterpret_cast(filter1->data()); + const auto* conv2_input_data = + reinterpret_cast(conv2_input->data()); + const auto* conv2_output_data = + reinterpret_cast(conv2_out->data()); + const auto* conv2_filter_data = + reinterpret_cast(filter2->data()); + + const auto* scale2_data = scale2->data(); + const auto* saved_mean2_data = saved_mean2->data(); + const auto* saved_invstd2_data = saved_invstd2->data(); + const auto* scale1_data = scale1->data(); + const auto* saved_mean1_data = saved_mean1->data(); + const auto* saved_invstd1_data = saved_invstd1->data(); + auto* scale2_grad_data = scale2_grad->mutable_data(place); + auto* bias2_grad_data = bias2_grad->mutable_data(place); + + const float* conv1_input_max_data = nullptr; + const float* conv1_filter_max_data = nullptr; + const float* conv2_input_max_data = nullptr; + const float* conv2_filter_max_data = nullptr; + const float* conv3_input_max_data = nullptr; + const float* conv3_filter_max_data = nullptr; + if (attr.find_max) { + conv1_input_max_data = + reinterpret_cast(conv1_input_max->data()); + conv1_filter_max_data = + reinterpret_cast(conv1_filter_max->data()); + conv2_input_max_data = + reinterpret_cast(conv2_input_max->data()); + conv2_filter_max_data = + reinterpret_cast(conv2_filter_max->data()); + if (attr.has_shortcut) { + conv3_input_max_data = + reinterpret_cast(conv3_input_max->data()); + conv3_filter_max_data = + reinterpret_cast(conv3_filter_max->data()); + } + } + + auto& dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int r = XPU_SUCCESS; + + // 0. 
bn2, bn2_fusion grad + auto conv2_output_grad_data = + RAII_GUARD.alloc(attr.conv2_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(conv2_output_grad_data); + + XPUT* z_output_grad_data = nullptr; + XPUT* z_grad_data = nullptr; + if (!attr.has_shortcut) { + z_output_grad_data = RAII_GUARD.alloc(attr.conv1_input_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(z_output_grad_data); + z_grad_data = z_output_grad_data; + } else { + z_output_grad_data = RAII_GUARD.alloc(attr.conv3_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(z_output_grad_data); + + z_grad_data = RAII_GUARD.alloc(attr.conv1_input_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(z_grad_data); + } + + r = xpu::batch_norm_grad_fusion(dev_ctx.x_context(), + conv2_output_data, + y_data, + y_grad_data, + conv2_output_grad_data, + attr.conv2_output_shape[0], + attr.conv2_output_shape[1], + attr.conv2_output_shape[2], + attr.conv2_output_shape[3], + scale2_data, + saved_mean2_data, + saved_invstd2_data, + scale2_grad_data, + bias2_grad_data, + true, + z_output_grad_data, + xpu::Activation_t::RELU, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad_fusion"); + + if (attr.has_shortcut) { + // bn3 grad + const auto* conv3_output_data = + reinterpret_cast(conv3_out->data()); + const auto* scale3_data = scale3->data(); + const auto* saved_mean3_data = saved_mean3->data(); + const auto* saved_invstd3_data = saved_invstd3->data(); + auto* scale3_grad_data = scale3_grad->mutable_data(place); + auto* bias3_grad_data = bias3_grad->mutable_data(place); + auto* conv3_output_grad_data = + RAII_GUARD.alloc(attr.conv3_output_numel); + + r = xpu::batch_norm_grad(dev_ctx.x_context(), + conv3_output_data, + z_output_grad_data, + conv3_output_grad_data, + attr.conv3_output_shape[0], + attr.conv3_output_shape[1], + attr.conv3_output_shape[2], + attr.conv3_output_shape[3], + scale3_data, + saved_mean3_data, + saved_invstd3_data, + scale3_grad_data, + bias3_grad_data, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad"); + + // conv3 grad + auto* conv3_filter_grad_data = + reinterpret_cast(filter3_grad->mutable_data(place)); + auto* conv3_filter_data = + reinterpret_cast(filter3->data()); + xpu_conv2d_grad(dev_ctx.x_context(), + x_data, + conv3_filter_data, + conv3_output_grad_data, + z_grad_data, + conv3_filter_grad_data, + conv3_input_max_data, + conv3_filter_max_data, + attr.conv3_input_shape, + attr.conv3_filter_shape, + attr.padding3, + attr.stride3, + attr.dilation3, + attr.group); + } + + // 2. conv2_grad + auto* conv2_filter_grad_data = + reinterpret_cast(filter2_grad->mutable_data(place)); + auto* conv2_input_grad_data = + RAII_GUARD.alloc(attr.conv2_input_numel); + xpu_conv2d_grad(dev_ctx.x_context(), + conv2_input_data, + conv2_filter_data, + conv2_output_grad_data, + conv2_input_grad_data, + conv2_filter_grad_data, + conv2_input_max_data, + conv2_filter_max_data, + attr.conv2_input_shape, + attr.conv2_filter_shape, + attr.padding2, + attr.stride2, + attr.dilation2, + attr.group); + + // 3. 
b1 grad + auto* conv1_output_grad_data = + RAII_GUARD.alloc(attr.conv1_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(conv1_output_grad_data); + auto* scale1_grad_data = scale1_grad->mutable_data(ctx.GetPlace()); + auto* bias1_grad_data = bias1_grad->mutable_data(ctx.GetPlace()); + r = xpu::batch_norm_grad_fusion(dev_ctx.x_context(), + conv1_output_data, + conv2_input_data, + conv2_input_grad_data, + conv1_output_grad_data, + attr.conv1_output_shape[0], + attr.conv1_output_shape[1], + attr.conv1_output_shape[2], + attr.conv1_output_shape[3], + scale1_data, + saved_mean1_data, + saved_invstd1_data, + scale1_grad_data, + bias1_grad_data, + true, + nullptr, + xpu::Activation_t::RELU, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad_fusion"); + + // 4. conv1_grad + auto* x_grad_data = reinterpret_cast(x_grad->mutable_data(place)); + auto* conv1_filter_grad_data = + reinterpret_cast(filter1_grad->mutable_data(place)); + xpu_conv2d_grad(dev_ctx.x_context(), + x_data, + conv1_filter_data, + conv1_output_grad_data, + x_grad_data, + conv1_filter_grad_data, + conv1_input_max_data, + conv1_filter_max_data, + attr.conv1_input_shape, + attr.conv1_filter_shape, + attr.padding1, + attr.stride1, + attr.dilation1, + attr.group); + + // add z_grad to x_grad + r = xpu::add( + dev_ctx.x_context(), x_grad_data, z_grad_data, x_grad_data, x->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_XPU_KERNEL(resnet_basic_block, + ops::ResNetBasicBlockXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_basic_block_grad, + ops::ResNetBasicBlockGradXPUKernel); +#endif diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 4f4e0aa6ac29a..5852a5c04bde6 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -159,22 +159,28 @@ class ResNetUnitOp : public framework::OperatorWithKernel { bn_param_dims, bn_param_dims.size())); auto data_format = ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ( - data_format, - "NHWC", - platform::errors::InvalidArgument("The data format must equal to NHWC. 
" - "But received: the data format " - "= [%s]", - data_format)); + bool is_nchw = (data_format == "NCHW"); // Calculate the dims of outputs int batch = x_dims[0]; int output_channel = w_dims[0]; int filter_size = w_dims[2]; int stride = ctx->Attrs().Get("stride"); int padding = ctx->Attrs().Get("padding"); - int out_h = (x_dims[1] + padding * 2 - filter_size) / stride + 1; - int out_w = (x_dims[2] + padding * 2 - filter_size) / stride + 1; - std::vector out_shape = {batch, out_h, out_w, output_channel}; + std::vector out_shape; + out_shape.push_back(batch); + if (is_nchw) { + int out_h = (x_dims[2] + padding * 2 - filter_size) / stride + 1; + int out_w = (x_dims[3] + padding * 2 - filter_size) / stride + 1; + out_shape.push_back(output_channel); + out_shape.push_back(out_h); + out_shape.push_back(out_w); + } else { + int out_h = (x_dims[1] + padding * 2 - filter_size) / stride + 1; + int out_w = (x_dims[2] + padding * 2 - filter_size) / stride + 1; + out_shape.push_back(out_h); + out_shape.push_back(out_w); + out_shape.push_back(output_channel); + } auto y_dims = phi::make_ddim(out_shape); auto bitmask_dims = GetBitmaskDims(out_shape); diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc new file mode 100644 index 0000000000000..cce506c67abe2 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -0,0 +1,333 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ResNetUnitXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(place), + true, + platform::errors::PreconditionNotMet("It must use XPUPlace.")); + + bool is_nchw = (ctx.Attr("data_format") == "NCHW"); + // input x + const Tensor *input_x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *bias_x = ctx.Input("BiasX"); + + // output x + Tensor *conv_out_x = ctx.Output("ConvX"); + Tensor *saved_mean_x = ctx.Output("SavedMeanX"); + Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); + Tensor *running_mean_x = ctx.Output("RunningMeanX"); + Tensor *running_var_x = ctx.Output("RunningVarX"); + + Tensor *output = ctx.Output("Y"); + + // attrs + int padding = ctx.Attr("padding"); + int stride = ctx.Attr("stride"); + int stride_z = ctx.Attr("stride_z"); + int dilation = ctx.Attr("dilation"); + int group = ctx.Attr("group"); + float eps = ctx.Attr("epsilon"); + float momentum = ctx.Attr("momentum"); + bool has_shortcut = ctx.Attr("has_shortcut"); + bool fuse_add = ctx.Attr("fuse_add"); + bool use_global_stats = ctx.Attr("use_global_stats"); + bool is_test = ctx.Attr("is_test"); + bool is_train = !is_test && !use_global_stats; + std::string act_type = ctx.Attr("act_type"); + auto &dev_ctx = ctx.template device_context(); + + std::vector x_list = {input_x->data()}; + std::vector w_list = {filter_x->data()}; + std::vector conv_y_list = {conv_out_x->mutable_data(place)}; + + std::vector> x_shape_list = { + phi::vectorize(input_x->dims())}; + + auto filter_x_shape = phi::vectorize(filter_x->dims()); + std::vector ksize = {filter_x_shape[2], filter_x_shape[3]}; + if (!is_nchw) { + ksize[0] = filter_x_shape[1]; + ksize[1] = filter_x_shape[2]; + } + std::vector strides = {stride, stride}; + std::vector> ksize_list = {ksize}; + std::vector> stride_list = {strides}; + std::vector paddings = {padding, padding}; + std::vector dilations = {dilation, dilation}; + std::vector scale_list = {scale_x->data()}; + std::vector bias_list = {bias_x->data()}; + std::vector batch_mean_list = { + saved_mean_x->mutable_data(place)}; + std::vector batch_invstd_list = { + saved_invstd_x->mutable_data(place)}; + std::vector global_mean_list = { + running_mean_x->mutable_data(place)}; + std::vector global_var_list = { + running_var_x->mutable_data(place)}; + + std::vector x_maxlist = {nullptr}; + std::vector w_maxlist = {nullptr}; + if (has_shortcut) { + // input z + const Tensor *input_z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *bias_z = ctx.Input("BiasZ"); + + Tensor *conv_out_z = ctx.Output("ConvZ"); + Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); + Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); + Tensor *running_mean_z = ctx.Output("RunningMeanZ"); + Tensor *running_var_z = ctx.Output("RunningVarZ"); + + x_list.push_back(input_z->data()); + w_list.push_back(filter_z->data()); + conv_y_list.push_back(conv_out_z->mutable_data(place)); + + x_shape_list.push_back(phi::vectorize(input_z->dims())); + + auto filter_z_shape = phi::vectorize(filter_z->dims()); + 
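+      // Filters are assumed to follow data_format: for NCHW the kernel height and
+      // width sit at filter dims 2 and 3; otherwise (NHWC) they are taken from dims 1 and 2 below.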
std::vector ksize_z = {filter_z_shape[2], filter_z_shape[3]}; + if (!is_nchw) { + ksize_z[0] = filter_z_shape[1]; + ksize_z[1] = filter_z_shape[2]; + } + ksize_list.push_back(ksize_z); + stride_list.push_back({stride_z, stride_z}); + scale_list.push_back(scale_z->data()); + bias_list.push_back(bias_z->data()); + batch_mean_list.push_back(saved_mean_z->mutable_data(place)); + batch_invstd_list.push_back(saved_invstd_z->mutable_data(place)); + global_mean_list.push_back(running_mean_z->mutable_data(place)); + global_var_list.push_back(running_var_z->mutable_data(place)); + x_maxlist.push_back(nullptr); + w_maxlist.push_back(nullptr); + } else { + if (fuse_add) { + const Tensor *input_z = ctx.Input("Z"); + auto input_z_shape = phi::vectorize(input_z->dims()); + x_list.push_back(input_z->data()); + x_shape_list.push_back(input_z_shape); + x_maxlist.push_back(nullptr); + } + } + int r = xpu::resnet_unit_fusion( + dev_ctx.x_context(), + x_list, + w_list, + conv_y_list, + output->mutable_data(place), + x_shape_list, + filter_x_shape[0], + ksize_list, + stride_list, + paddings, + dilations, + group, + eps, + momentum, + x_maxlist, + w_maxlist, + scale_list, + bias_list, + batch_mean_list, + batch_invstd_list, + global_mean_list, + global_var_list, + xpu::Activation_t::RELU, + is_nchw, + has_shortcut, + fuse_add, + is_train); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_fusion"); + } +}; + +template +class ResNetUnitGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(place), + true, + platform::errors::PreconditionNotMet("It must use XPUPlace.")); + + bool is_nchw = (ctx.Attr("data_format") == "NCHW"); + const Tensor *y_grad = ctx.Input(framework::GradVarName("Y")); + const Tensor *x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); + const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); + const Tensor *conv_out_x = ctx.Input("ConvX"); + const Tensor *output = ctx.Input("Y"); + + Tensor *x_grad = ctx.Output(framework::GradVarName("X")); + Tensor *filter_x_grad = + ctx.Output(framework::GradVarName("FilterX")); + Tensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); + Tensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); + + int padding = ctx.Attr("padding"); + int stride = ctx.Attr("stride"); + int stride_z = ctx.Attr("stride_z"); + int dilation = ctx.Attr("dilation"); + int group = ctx.Attr("group"); + float eps = ctx.Attr("epsilon"); + bool has_shortcut = ctx.Attr("has_shortcut"); + bool fuse_add = ctx.Attr("fuse_add"); + std::string act_type = ctx.Attr("act_type"); + + auto &dev_ctx = ctx.template device_context(); + + std::vector x_list = {x->data()}; + std::vector w_list = {filter_x->data()}; + std::vector conv_y_list = {conv_out_x->data()}; + std::vector dx_list = {x_grad->mutable_data(place)}; + std::vector dw_list = {filter_x_grad->mutable_data(place)}; + + std::vector> x_shape_list = { + phi::vectorize(x->dims())}; + + auto filter_x_shape = phi::vectorize(filter_x->dims()); + std::vector x_ksize = {filter_x_shape[2], filter_x_shape[3]}; + if (!is_nchw) { + x_ksize[0] = filter_x_shape[1]; + x_ksize[1] = filter_x_shape[2]; + } + std::vector> ksize_list = {x_ksize}; + std::vector> stride_list = {{stride, stride}}; + std::vector paddings = {padding, padding}; + std::vector dilations = 
{dilation, dilation}; + + std::vector x_maxlist = {nullptr}; + std::vector w_maxlist = {nullptr}; + + std::vector scale_list = {scale_x->data()}; + std::vector batch_mean_list = {saved_mean_x->data()}; + std::vector batch_invstd_list = { + saved_invstd_x->data()}; + std::vector dscale_list = { + scale_x_grad->mutable_data(place)}; + std::vector dbias_list = {bias_x_grad->mutable_data(place)}; + + if (has_shortcut) { + // X Z + // | | + // NormConv NormConv + // | | + // BNStatsFinalize BNStatsFinalize + // \ / + // ScaleBiasAddRelu + // | + // Y + const Tensor *z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); + const Tensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); + const Tensor *conv_out_z = ctx.Input("ConvZ"); + + Tensor *z_grad = ctx.Output(framework::GradVarName("Z")); + Tensor *filter_z_grad = + ctx.Output(framework::GradVarName("FilterZ")); + Tensor *scale_z_grad = + ctx.Output(framework::GradVarName("ScaleZ")); + Tensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); + x_list.push_back(z->data()); + w_list.push_back(filter_z->data()); + conv_y_list.push_back(conv_out_z->data()); + dx_list.push_back(z_grad->mutable_data(place)); + dw_list.push_back(filter_z_grad->mutable_data(place)); + x_shape_list.push_back(phi::vectorize(z->dims())); + + auto filter_z_shape = phi::vectorize(filter_z->dims()); + std::vector ksize_z = {filter_z_shape[2], filter_z_shape[3]}; + if (!is_nchw) { + ksize_z[0] = filter_z_shape[1]; + ksize_z[1] = filter_z_shape[2]; + } + ksize_list.push_back(ksize_z); + stride_list.push_back({stride_z, stride_z}); + x_maxlist.push_back(nullptr); + w_maxlist.push_back(nullptr); + + scale_list.push_back(scale_z->data()); + batch_mean_list.push_back(saved_mean_z->data()); + batch_invstd_list.push_back(saved_invstd_z->data()); + dscale_list.push_back(scale_z_grad->mutable_data(place)); + dbias_list.push_back(bias_z_grad->mutable_data(place)); + } else { + if (fuse_add) { + auto z_grad = ctx.Output(framework::GradVarName("Z")); + dx_list.push_back(z_grad->mutable_data(place)); + } + } + + int r = + xpu::resnet_unit_grad_fusion(dev_ctx.x_context(), + x_list, + w_list, + y_grad->data(), + output->data(), + conv_y_list, + dx_list, + dw_list, + x_shape_list, + filter_x_shape[0], + ksize_list, + stride_list, + paddings, + dilations, + group, + x_maxlist, + w_maxlist, + scale_list, + batch_mean_list, + batch_invstd_list, + dscale_list, + dbias_list, + xpu::Activation_t::RELU, + eps, + is_nchw, + has_shortcut, + fuse_add); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_XPU_KERNEL(resnet_unit, ops::ResNetUnitXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_unit_grad, ops::ResNetUnitGradXPUKernel); diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cc b/paddle/fluid/operators/fused_softmax_mask_op.cc index 013e214b426e5..11c1fa4af8560 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_op.cc @@ -117,7 +117,6 @@ REGISTER_OPERATOR(fused_softmax_mask, ops::SoftmaxMaskFuseGradOpMaker, ops::SoftmaxMaskFuseGradOpMaker); REGISTER_OPERATOR(fused_softmax_mask_grad, ops::SoftmaxMaskFuseOpGrad); -REGISTER_OP_CPU_KERNEL( - fused_softmax_mask, - ops::SoftmaxMaskFuseCPUKernel, - ops::SoftmaxMaskFuseCPUKernel); 
+REGISTER_OP_CPU_KERNEL(fused_softmax_mask, + ops::SoftmaxMaskFuseCPUKernel, + ops::SoftmaxMaskFuseCPUKernel); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc index f1748ad931247..5992fa2dfc6e4 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc @@ -104,10 +104,7 @@ REGISTER_OPERATOR( ops::SoftmaxMaskFuseUpperTriangleGradOpMaker); REGISTER_OPERATOR(fused_softmax_mask_upper_triangle_grad, ops::SoftmaxMaskFuseUpperTriangleOpGrad); -REGISTER_OP_CPU_KERNEL(fused_softmax_mask_upper_triangle, - ops::SoftmaxMaskFuseUpperTriangleCPUKernel< - paddle::platform::CPUDeviceContext, - float>, - ops::SoftmaxMaskFuseUpperTriangleCPUKernel< - paddle::platform::CPUDeviceContext, - double>); +REGISTER_OP_CPU_KERNEL( + fused_softmax_mask_upper_triangle, + ops::SoftmaxMaskFuseUpperTriangleCPUKernel, + ops::SoftmaxMaskFuseUpperTriangleCPUKernel); diff --git a/paddle/fluid/operators/fused_token_prune_op.cc b/paddle/fluid/operators/fused_token_prune_op.cc new file mode 100644 index 0000000000000..50ca45967b7bd --- /dev/null +++ b/paddle/fluid/operators/fused_token_prune_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class FusedTokenPruneOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Attn", + "(Tensor)" + "The input of fused_token_prune op, whose shape should be [bsz, " + "num_head, max_seq_len, max_seq_len] and dtype should be " + "float32/float64," + "Attn is attention scores of input sequences which will be used " + "to sort another input tensor: X's indices so that " + "some elements of X with lower attention score will not be " + "considered after this op."); + + AddInput("X", + "(Tensor)" + "The input of fused_token_prune op, whose shape should be [bsz, " + "max_seq_len, c] and dtype should be float32/float64."); + + AddInput( + "Mask", + "(Tensor)" + "The input of fused_token_prune op, whose shape should be [bsz, " + "num_head, " + "max_seq_len, max_seq_len] and dtype should be float32/float64." + "Mask is corresponding to Attn's elemnts one by one. Elements of Attn " + "will be set to zero if their corresponding mask is smaller than 0." + "This process happens before sorting X by attn."); + + AddInput("NewMask", + "(Tensor)" + "The input of fused_token_prune op, whose shape should be [bsz, " + "num_head, slimmed_seq_len, slimmed_seq_len]." + "NewMask is just used to get slimmed_seq_len, so the value of " + "this input is not important in this op."); + + AddOutput("SlimmedX", + "(Tensor)" + "The output of fused_token_prune op, whose shape should be [bsz, " + "slimmed_seq_len, C]." 
+ "The tokens of X will be sorted by Attn firstly and then the " + "last (max_seq_len - slimmed_seq_len)" + "tokens will be deleted. SlimmedX is the remainning part of X. " + ""); + + AddOutput( + "CLSInds", + "(Tensor)" + "The output of fused_token_prune op, whose shape should be [bsz, " + "slimmed_seq_len] and dtype is int64. CLSInds contains token indices " + " of each batch after sorting and pruning. "); + + AddAttr("keep_first_token", + "If keep_first_token is True, the element located in " + "CLSInds[:, 1] must be 0.") + .SetDefault(true); + + AddAttr("keep_order", + "If keep_order is True, the relative order of SlimmedX and " + "CLSInds remains unchanged") + .SetDefault(false); + + AddComment(R"DOC( + fused_token_prune op is used to fuse multiple ops to perform token pruning. + In this op: + 1. Elements of Attn will be set to zero if their corresponding mask is smaller than 0. + 2. The second dimension of X will be sorted by Attn. + 3. The last (max_seq_len - slimmed_seq_len) lines of X will be pruned. + 4. The remainning part of sorted X will output. + )DOC"); + } +}; + +class FusedTokenPruneOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Attn"), "Input", "Attn", "FusedTokenPrune"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedTokenPrune"); + OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "FusedTokenPrune"); + OP_INOUT_CHECK( + ctx->HasInput("NewMask"), "Input", "NewMask", "FusedTokenPrune"); + OP_INOUT_CHECK( + ctx->HasOutput("SlimmedX"), "Output", "SlimmedX", "FusedTokenPrune"); + OP_INOUT_CHECK( + ctx->HasOutput("CLSInds"), "Output", "CLSInds", "FusedTokenPrune"); + + auto mask_dim = ctx->GetInputDim("Mask"); + auto attn_dim = ctx->GetInputDim("Attn"); + auto x_dim = ctx->GetInputDim("X"); + auto new_mask_dim = ctx->GetInputDim("NewMask"); + + // check input dims number + PADDLE_ENFORCE_EQ(mask_dim.size(), + 4, + platform::errors::InvalidArgument( + "The input mask must be 4-dimention")); + PADDLE_ENFORCE_EQ(attn_dim.size(), + 4, + platform::errors::InvalidArgument( + "The input attn must be 4-dimention")); + PADDLE_ENFORCE_EQ( + x_dim.size(), + 3, + platform::errors::InvalidArgument("The input x must be 4-dimention")); + PADDLE_ENFORCE_EQ(new_mask_dim.size(), + 4, + platform::errors::InvalidArgument( + "The input attn must be 4-dimention")); + + // check input dims relations + PADDLE_ENFORCE_EQ(mask_dim[0], + attn_dim[0], + platform::errors::InvalidArgument( + "The first dim of mask and attn should be the same" + "which is batch size")); + PADDLE_ENFORCE_EQ(mask_dim[1], + attn_dim[1], + platform::errors::InvalidArgument( + "The second dim of mask and attn should be the same" + "which is nb_head")); + PADDLE_ENFORCE_EQ(mask_dim[0], + x_dim[0], + platform::errors::InvalidArgument( + "The first dim of mask and x should be the same" + "which is batch size")); + PADDLE_ENFORCE_EQ( + mask_dim[2], + mask_dim[3], + platform::errors::InvalidArgument( + "The third dim and the fourth dim of mask should be the same" + "which is max seq len")); + PADDLE_ENFORCE_EQ( + attn_dim[2], + attn_dim[3], + platform::errors::InvalidArgument( + "The third dim and the fourth dim of mask should be the same" + "which is max seq len")); + PADDLE_ENFORCE_EQ(attn_dim[2], + mask_dim[2], + platform::errors::InvalidArgument( + "The third dim of mask and attn should be the same" + "which is max seq len")); + 
PADDLE_ENFORCE_EQ(attn_dim[2], + x_dim[1], + platform::errors::InvalidArgument( + "The third dim of attn and the second dim of x " + "should be the same, which is max seq len")); + + auto bsz = mask_dim[0]; + auto c = x_dim[2]; + auto slim_seq_len = new_mask_dim[2]; + + ctx->SetOutputDim("SlimmedX", {bsz, slim_seq_len, c}); + ctx->SetOutputDim("CLSInds", {bsz, slim_seq_len}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + fused_token_prune, + ops::FusedTokenPruneOp, + ops::FusedTokenPruneOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused_token_prune_op.cu b/paddle/fluid/operators/fused_token_prune_op.cu new file mode 100644 index 0000000000000..90044f30d8a6e --- /dev/null +++ b/paddle/fluid/operators/fused_token_prune_op.cu @@ -0,0 +1,287 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused_token_prune_op.cu.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +struct AttnMaskFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return b >= 0 ?
a : 0; + } +}; + +__global__ void FillIndex(int64_t* indices, int num_raws, int num_cols) { + int num_threads = num_raws * num_cols; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < num_threads; tid += stride) { + int col = tid % num_cols; + indices[tid] = (int64_t)col; + } +} + +template +__global__ void TakeAlongAxis(const T* src, + T* dst, + int64_t* indices, + int num_raws, + int src_num_cols, + int dst_num_cols, + int num_elements) { + int num_threads = num_raws * dst_num_cols; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < num_threads; tid += stride) { + int raw = tid / dst_num_cols; + int col = tid % dst_num_cols; + for (int i = 0; i < num_elements; ++i) { + dst[tid * num_elements + i] = + *(src + (raw * src_num_cols + indices[tid]) * num_elements + i); + } + } +} + +template +__global__ void MaximumFirst(T* mat, int num_raws, int num_cols, T max_value) { + int num_threads = num_raws; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < num_threads; tid += stride) { + mat[tid * num_cols] = max_value; + } +} + +template +class FusedTokenPruneOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = context.cuda_device_context(); + // Inouts + const Tensor* attn = context.Input("Attn"); + const Tensor* x = context.Input("X"); + const Tensor* mask = context.Input("Mask"); + const Tensor* new_mask = context.Input("NewMask"); + + // Input dims + auto attn_dims = attn->dims(); + auto x_dims = x->dims(); + auto new_mask_dims = new_mask->dims(); + + auto bsz = attn_dims[0]; + auto num_heads = attn_dims[1]; + auto max_seq_len = attn_dims[2]; + auto c = x_dims[2]; + int slimmed_x_len = new_mask_dims[2]; + + // Attrs + const bool keep_first_token = context.Attr("keep_first_token"); + const bool keep_order = context.Attr("keep_order"); + + // Outputs + Tensor* out_slimmed_x = context.Output("SlimmedX"); + Tensor* slimmed_indices = context.Output("CLSInds"); + auto* out_slimmed_x_data = + out_slimmed_x->mutable_data(context.GetPlace()); + auto* slimmed_indices_data = + slimmed_indices->mutable_data(context.GetPlace()); + + // Intermediate variable + Tensor attn_tmp; + auto* attn_tmp_data = + attn_tmp.mutable_data(attn_dims, context.GetPlace()); + Tensor attn_accu; + auto* attn_accu_data = + attn_accu.mutable_data({bsz, max_seq_len}, context.GetPlace()); + Tensor attn_accu_indices; + auto* attn_accu_indices_data = attn_accu_indices.mutable_data( + {bsz, max_seq_len}, context.GetPlace()); + Tensor sort_attn_accu; + auto* sort_attn_accu_data = + sort_attn_accu.mutable_data({bsz, max_seq_len}, context.GetPlace()); + Tensor sort_attn_accu_indices; + auto* sort_attn_accu_indices_data = + sort_attn_accu_indices.mutable_data({bsz, max_seq_len}, + context.GetPlace()); + Tensor temp_storage; + + // 1. Filter attn by mask + std::vector ins; + std::vector outs; + ins.emplace_back(attn); + ins.emplace_back(mask); + outs.emplace_back(&attn_tmp); + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, -1, AttnMaskFunctor()); + + // 2. Reduce sum + const std::vector reduce_dims{1, 2}; + phi::Reduce(dev_ctx, + attn_tmp, + false, + reduce_dims, + false, + attn_accu.dtype(), + &attn_accu); + // 3. 
Prepare token indices + phi::backends::gpu::GpuLaunchConfig config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * max_seq_len); + FillIndex<<>>(attn_accu_indices_data, bsz, max_seq_len); + + // 4. Sort token indices by attn + if (keep_first_token) { + T max = std::numeric_limits::max(); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz); + MaximumFirst + <<>>(attn_accu_data, bsz, max_seq_len, max); + } + size_t temp_storage_bytes = -1; + int num_items = bsz * max_seq_len; + int num_segments = bsz; + + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + segment_offsets_t(counting_iter, SegmentOffsetIter(max_seq_len)); + // Determine temporary device storage requirements + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceSegmentedRadixSort::SortPairsDescending( + nullptr, + temp_storage_bytes, + attn_accu_data, + sort_attn_accu_data, + attn_accu_indices_data, + sort_attn_accu_indices_data, + num_items, + num_segments, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + dev_ctx.stream())); + // Allocate temporary storage + int64_t temp_size = temp_storage_bytes; + auto* temp_storage_data = + temp_storage.mutable_data({temp_size}, context.GetPlace()); + // Run sorting operation + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceSegmentedRadixSort::SortPairsDescending( + temp_storage_data, + temp_storage_bytes, + attn_accu_data, + sort_attn_accu_data, + attn_accu_indices_data, + sort_attn_accu_indices_data, + num_items, + num_segments, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + dev_ctx.stream())); + // 5. Slice + auto slimmed_indices_tmp = + phi::funcs::Slice(dev_ctx, + sort_attn_accu_indices, + {1} /*axes*/, + {0} /*starts*/, + {slimmed_x_len} /*ends*/); + if (keep_order) { + // 6. reorder + num_items = bsz * slimmed_x_len; + temp_storage_bytes = -1; + cub::TransformInputIterator> + segment_offsets_t2(counting_iter, SegmentOffsetIter(slimmed_x_len)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortKeys( + nullptr, + temp_storage_bytes, + static_cast(slimmed_indices_tmp.data()), + static_cast(slimmed_indices->data()), + num_items, + num_segments, + segment_offsets_t2, + segment_offsets_t2 + 1, + 0, + sizeof(int64_t) * 8, + dev_ctx.stream())); + temp_size = temp_storage_bytes; + temp_storage.Resize({temp_size}); + temp_storage_data = + temp_storage.mutable_data(context.GetPlace()); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortKeys( + temp_storage_data, + temp_storage_bytes, + static_cast(slimmed_indices_tmp.data()), + static_cast(slimmed_indices->data()), + num_items, + num_segments, + segment_offsets_t2, + segment_offsets_t2 + 1, + 0, + sizeof(int64_t) * 8, + dev_ctx.stream())); + } else { + framework::TensorCopy( + slimmed_indices_tmp, context.GetPlace(), slimmed_indices); + } + // 7. 
Get slimmed X by indices + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * slimmed_x_len); + TakeAlongAxis<<>>(x->data(), + out_slimmed_x_data, + slimmed_indices->data(), + bsz, + max_seq_len, + slimmed_x_len, + c); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fused_token_prune, + ops::FusedTokenPruneOpCUDAKernel, + ops::FusedTokenPruneOpCUDAKernel); diff --git a/paddle/fluid/operators/fused_token_prune_op.cu.h b/paddle/fluid/operators/fused_token_prune_op.cu.h new file mode 100644 index 0000000000000..e1e73a5e3d9e2 --- /dev/null +++ b/paddle/fluid/operators/fused_token_prune_op.cu.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace paddle { +namespace operators { + +HOSTDEVICE inline int CeilDivide(int n, int m) { return (n + m - 1) / m; } + +inline int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +// Iter for move to next row +struct SegmentOffsetIter { + EIGEN_DEVICE_FUNC + explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { + return idx * num_cols_; + } + + int num_cols_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 676143bf01145..11c46d1772957 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -39,7 +39,7 @@ TEST(Gather, GatherData) { paddle::platform::CPUPlace()); auto* cpu_place = new paddle::platform::CPUPlace(); - paddle::platform::CPUDeviceContext ctx(*cpu_place); + phi::CPUContext ctx(*cpu_place); phi::funcs::CPUGather(ctx, *src, *index, output); delete cpu_place; cpu_place = NULL; diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cc b/paddle/fluid/operators/graph_khop_sampler_op.cc index 7f45b49518594..4702d66c3ccb3 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cc +++ b/paddle/fluid/operators/graph_khop_sampler_op.cc @@ -132,7 +132,7 @@ Graph Learning Sampling Neighbors operator, for graphsage sampling method. 
} // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR(graph_khop_sampler, ops::GraphKhopSamplerOP, diff --git a/paddle/fluid/operators/grid_sampler_op_xpu.cc b/paddle/fluid/operators/grid_sampler_op_xpu.cc new file mode 100644 index 0000000000000..2843a90492cec --- /dev/null +++ b/paddle/fluid/operators/grid_sampler_op_xpu.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_XPU + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class GridSamplerXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(context.GetPlace()), + true, + platform::errors::Unavailable("This kernel only runs on XPU.")); + + // input and output data + const Tensor* input = context.Input("X"); + const Tensor* grid = context.Input("Grid"); + Tensor* output = context.Output("Output"); + + int n = input->dims()[0]; + int c = input->dims()[1]; + int h = input->dims()[2]; + int w = input->dims()[3]; + int out_h = grid->dims()[1]; + int out_w = grid->dims()[2]; + + // attrs + // paddle.nn.functional.grid_sample(x, grid, mode='bilinear', + // padding_mode='zeros', align_corners=True, name=None) + const std::string mode = context.Attr("mode"); + const std::string padding_mode = context.Attr("padding_mode"); + bool align_corners_bool = context.Attr("align_corners"); + const std::string data_format = + paddle::framework::DataLayoutToString(input->layout()); + + // attr to real param + bool is_nearest_bool; + if (mode == "bilinear") { + is_nearest_bool = false; + } else if (mode == "nearest") { + is_nearest_bool = true; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "should not reach here: mode should be either 'bilinear' or " + "'nearest', but got %s.", + mode)); + } + + // attention: 0: zeros, 2: reflection, 1: border according to XDNN api.
+ int padding_mode_int; + if (padding_mode == "zeros") { + padding_mode_int = 0; + } else if (padding_mode == "reflection") { + padding_mode_int = 2; + } else if (padding_mode == "border") { + padding_mode_int = 1; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "should not reach here: padding_mode should be either 'zeros' or " + "'reflection' or 'border', but got %s.", + padding_mode)); + } + + bool is_nchw_bool; + if (data_format == "NCHW") { + is_nchw_bool = true; + } else if (data_format == "NHWC") { + is_nchw_bool = false; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "should not reach here: data_format should be either 'NCHW' or " + "'NHWC', but got %s.", + data_format)); + } + + // data pointers + const T* input_data = input->data(); + const T* grid_data = grid->data(); + T* output_data = + output->mutable_data({n, c, out_h, out_w}, context.GetPlace()); + + auto& dev_ctx = context.template device_context(); + // int grid_sample(Context* ctx, const T* x, const T* grid, T* y, int n, int + // c, int xh, int xw, int yh, int yw, bool is_nearest, bool align_corners, + // int padding_mode, bool is_nchw); + int r = xpu::grid_sample(dev_ctx.x_context(), + input_data, + grid_data, + output_data, + n, + c, + h, + w, + out_h, + out_w, + is_nearest_bool, + align_corners_bool, + padding_mode_int, + is_nchw_bool); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "grid_sampler"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + grid_sampler, + ops::GridSamplerXPUKernel); + +#endif diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index d3c06ea496f1a..f5cfd7a162c8d 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -318,7 +318,7 @@ template class GRUCPUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; using LodTensorPtr = LoDTensor*; bool is_test = context.Attr("is_test"); @@ -588,7 +588,6 @@ REGISTER_OPERATOR(gru_grad, REGISTER_OP_CPU_KERNEL(gru, ops::GRUCPUKernel, ops::GRUCPUKernel); -REGISTER_OP_CPU_KERNEL( - gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CPU_KERNEL(gru_grad, + ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index 404d434a88058..24d4771fac539 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -325,11 +325,9 @@ REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp, ops::GRUUnitGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_CPU_KERNEL( - gru_unit_grad, - ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CPU_KERNEL(gru_unit, + ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL(gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 8de20d53ba8fa..835312851b2e4 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -150,12 +150,10 @@ REGISTER_OPERATOR(hinge_loss, ops::HingeLossGradOpMaker, ops::HingeLossGradOpMaker); REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp); -REGISTER_OP_CPU_KERNEL( - hinge_loss, - ops::HingeLossKernel);
-REGISTER_OP_CPU_KERNEL( - hinge_loss_grad, - ops::HingeLossGradKernel); +REGISTER_OP_CPU_KERNEL(hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL(hinge_loss_grad, + ops::HingeLossGradKernel); REGISTER_OP_CUDA_KERNEL( hinge_loss, diff --git a/paddle/fluid/operators/identity_loss_op.cc b/paddle/fluid/operators/identity_loss_op.cc new file mode 100644 index 0000000000000..bc9986c7ffea1 --- /dev/null +++ b/paddle/fluid/operators/identity_loss_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +class IdentityLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + platform::CPUPlace()); + } +}; + +class IdentityLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of identity_loss op"); + AddOutput("Out", "(Tensor) The output of identity_loss op"); + AddAttr("reduction", "(int, default 1). The reduction type.") + .SetDefault(1) + .InEnum({0, 1, 2}); + AddComment(R"DOC( +IdentityLoss Operator marks the loss variable.
+ +)DOC"); + } +}; + +class IdentityLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(input_data_type, platform::CPUPlace()); + } +}; + +template +class IdentityLossGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("identity_loss_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(IdentityLossInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(IdentityLossGradInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(identity_loss, + IdentityLossInferShapeFunctor, + PD_INFER_META(phi::IdentityLossInferMeta)); + +REGISTER_OPERATOR(identity_loss, + ops::IdentityLossOp, + ops::IdentityLossOpMaker, + ops::IdentityLossGradMaker, + ops::IdentityLossGradMaker, + ops::IdentityLossInplaceInferer, + IdentityLossInferShapeFunctor); + +REGISTER_OPERATOR(identity_loss_grad, + ops::IdentityLossGradOp, + ops::IdentityLossGradInplaceInferer); diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index ae8c91e2444ab..dce0ca7a646fd 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -195,12 +195,10 @@ REGISTER_OPERATOR(im2sequence, ops::Im2SequenceGradMaker, ops::Im2SequenceGradMaker); REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp); -REGISTER_OP_CPU_KERNEL( - im2sequence, - ops::Im2SequenceKernel); -REGISTER_OP_CPU_KERNEL( - im2sequence_grad, - ops::Im2SequenceGradKernel); +REGISTER_OP_CPU_KERNEL(im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CPU_KERNEL(im2sequence_grad, + ops::Im2SequenceGradKernel); REGISTER_OP_CUDA_KERNEL( im2sequence, diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 7d62bf2d628d0..3ab6b9f9405ed 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -25,9 +25,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index dfff354c4bd5e..c1880d2a1a194 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -382,11 +382,9 @@ REGISTER_OPERATOR(inplace_abn, InplaceAbnOpInplaceInferer) REGISTER_OPERATOR(inplace_abn_grad, ops::InplaceABNGradOp) -REGISTER_OP_CPU_KERNEL( - inplace_abn, - ops::InplaceABNKernel, - ops::InplaceABNKernel); -REGISTER_OP_CPU_KERNEL( - inplace_abn_grad, - 
ops::InplaceABNGradKernel, - ops::InplaceABNGradKernel); +REGISTER_OP_CPU_KERNEL(inplace_abn, + ops::InplaceABNKernel, + ops::InplaceABNKernel); +REGISTER_OP_CPU_KERNEL(inplace_abn_grad, + ops::InplaceABNGradKernel, + ops::InplaceABNGradKernel); diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index fd8d88ac940de..ff474cfff9727 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -1201,8 +1201,8 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1279,8 +1279,8 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1393,8 +1393,8 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, dim_grad = {n, in_d, in_h, in_w, c}; } input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc index 3c84b7c983eff..e93ca5ad54035 100644 --- a/paddle/fluid/operators/inverse_op.cc +++ b/paddle/fluid/operators/inverse_op.cc @@ -137,11 +137,9 @@ REGISTER_OPERATOR(inverse, REGISTER_OPERATOR(inverse_grad, ops::InverseGradOp); -REGISTER_OP_CPU_KERNEL( - inverse, - ops::InverseKernel, - ops::InverseKernel); -REGISTER_OP_CPU_KERNEL( - inverse_grad, - ops::InverseGradKernel, - ops::InverseGradKernel); +REGISTER_OP_CPU_KERNEL(inverse, + ops::InverseKernel, + ops::InverseKernel); +REGISTER_OP_CPU_KERNEL(inverse_grad, + ops::InverseGradKernel, + ops::InverseGradKernel); diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index a7fc4865f78cb..77583fd2d30f1 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { @@ -125,64 +122,35 @@ namespace ops = paddle::operators; paddle::framework::EmptyGradOpMaker, \ paddle::framework::EmptyGradOpMaker) -#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, \ - ops::OverflowKernel, \ - ops::OverflowKernel, \ - ops::OverflowKernel, \ - ops::OverflowKernel); +#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \ + REGISTER_OP_CPU_KERNEL( \ + op_type, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel); REGISTER_OP_MAKER(isinf, "isinf(X)"); REGISTER_OP_MAKER(isnan, "isnan(X)"); REGISTER_OP_MAKER(isfinite, "isfinite(X)"); -REGISTER_OP_CPU_KERNEL(isinf, - ops::OverflowKernel, - ops::OverflowKernel, - 
ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL(isnan, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL(isfinite, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CPU_KERNEL( + isinf, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); + +REGISTER_OP_CPU_KERNEL( + isnan, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); + +REGISTER_OP_CPU_KERNEL( + isfinite, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 314bbf556aed6..65857b6d87db1 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -34,9 +34,6 @@ namespace operators { template class OverflowKernel; } // namespace operators -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 7ffdd6ff32ba7..50fd6056d84b0 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -22,7 +22,6 @@ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" // for UNUSED DEFINE_int32(burning, 10, "Burning times."); DEFINE_int32(repeat, 3000, "Repeat times."); diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index b006d21f3b558..9d0e47e826075 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -22,7 +22,7 @@ #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" // for UNUSED +#include "paddle/phi/core/macros.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 56e1c19721378..c7bf0d538bd97 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -93,11 +93,9 @@ REGISTER_OPERATOR(l1_norm, ops::L1NormGradMaker, ops::L1NormGradMaker); REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp); -REGISTER_OP_CPU_KERNEL( - l1_norm, ops::L1NormKernel); -REGISTER_OP_CPU_KERNEL( - l1_norm_grad, - ops::L1NormGradKernel); +REGISTER_OP_CPU_KERNEL(l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL(l1_norm_grad, + ops::L1NormGradKernel); REGISTER_OP_CUDA_KERNEL( l1_norm, ops::L1NormKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index ccd4db816bdce..873ab62a3d246 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -24,9 +24,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index e37f048235e7c..8ed706a5443af 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ 
-252,7 +252,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = mu_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { mu_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { @@ -289,7 +289,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = var_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { var_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index 397b26e119416..99c10e868a396 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -395,12 +395,10 @@ REGISTER_OPERATOR(linear_chain_crf, REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp, ops::LinearChainCRFGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); +REGISTER_OP_CPU_KERNEL(linear_chain_crf, + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); REGISTER_OP_CPU_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index 8d345b237bfc5..de6daf33f8426 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -129,8 +129,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { emission_row_max.mutable_data( phi::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); - auto& place = *ctx.template device_context() - .eigen_device(); + auto& place = + *ctx.template device_context().eigen_device(); auto x = framework::EigenMatrix::From(emission_weights_tmp); auto x_row_max = framework::EigenMatrix::From(emission_row_max); x_row_max.device(place) = @@ -325,21 +325,20 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor one_seq_beta = beta.Slice(start_pos, end_pos); Tensor one_seq_emission_grad = emission_grad_tmp.Slice(start_pos, end_pos); - BackwardOneSequence( - ctx.template device_context(), - ll_grad[i], - one_seq_emission_exps, - *transition_exps, - one_seq_alpha, - one_seq_label, - &one_seq_beta, - transition_grad, - &one_seq_emission_grad); + BackwardOneSequence(ctx.template device_context(), + ll_grad[i], + one_seq_emission_exps, + *transition_exps, + one_seq_alpha, + one_seq_label, + &one_seq_beta, + transition_grad, + &one_seq_emission_grad); } }; private: - void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + void BackwardOneSequence(const phi::CPUContext& ctx, const T ll_grad, const Tensor& emission_exps, const Tensor& transition_exps, diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index c38386365f3dc..fed71abe16637 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -79,7 +79,7 @@ TEST(LiteEngineOp, engine_op) { ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); #endif // Prepare variables. 
CreateTensor(&scope, "x", std::vector({2, 4}), true); diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index c1a67edbfd455..574b7cbec28ce 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -60,7 +60,7 @@ void serialize_params(std::string* str, platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); #else - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; #endif for (const auto& param : params) { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 65d8a03245f8f..94bfc44977fb3 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -86,10 +86,9 @@ REGISTER_OPERATOR(load_combine, REGISTER_OP_CPU_KERNEL( load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index c1a9782b97a4c..4eebda7d53a3c 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -68,10 +68,9 @@ REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker); REGISTER_OP_CPU_KERNEL( load, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel); + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 4c1a2deeaf480..d4b36f31e6201 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -44,7 +44,8 @@ struct LoDTensorToArrayFunctorImpl { void apply(); }; -struct LoDTensorToArrayFunctor : public boost::static_visitor { +struct LoDTensorToArrayFunctor + : public std::unary_function { std::vector ref_inputs_; mutable std::vector outputs_; const framework::Tensor &input_; @@ -62,7 +63,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor { auto &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = pool.Get(place); if (std::is_same::value) { - Apply(static_cast(dev_ctx)); + Apply(static_cast(dev_ctx)); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) Apply(static_cast(dev_ctx)); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 3ca39e621b2a4..31a3e40f12e82 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -116,8 +116,7 @@ class LookupTableKernel : public framework::OpKernel { table + id_index * row_width, row_width * sizeof(T)); } else { - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); blas.VCOPY(row_width, table + id_index * row_width, output + i * row_width); @@ -148,8 +147,7 @@ class LookupTableKernel : public framework::OpKernel { table + id_index * row_width, row_width * sizeof(T)); } else { - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); blas.VCOPY(row_width, table + id_index * row_width, output + i * row_width); diff --git 
a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 9f9dbe9b336bd..1e12b00ebb944 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -132,8 +132,7 @@ struct LookupTableV2CPUFunctor { table + id_index * row_width, row_width * sizeof(T)); } else { - auto blas = - phi::funcs::GetBlas(context_); + auto blas = phi::funcs::GetBlas(context_); blas.VCOPY(row_width, table + id_index * row_width, output + i * row_width); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 8ec7f3a142c30..73fe170f6d5e8 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -31,7 +31,7 @@ using framework::Tensor; using DataLayout = framework::DataLayout; template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, @@ -46,9 +46,9 @@ struct LRNFunctor { T beta, const DataLayout data_layout) { auto place = ctx.GetPlace(); - auto blas = phi::funcs::GetBlas(ctx); - phi::funcs::Transpose transpose; - auto& dev_ctx = ctx.template device_context(); + auto blas = phi::funcs::GetBlas(ctx); + phi::funcs::Transpose transpose; + auto& dev_ctx = ctx.template device_context(); Tensor in_transpose, mid_transpose, out_transpose; // if channel_last, transpose to channel_first if (data_layout == DataLayout::kNHWC) { @@ -116,11 +116,11 @@ struct LRNFunctor { } } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, @@ -183,8 +183,8 @@ struct LRNGradFunctor { } } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; class LRNOp : public framework::OperatorWithKernel { public: @@ -435,7 +435,5 @@ REGISTER_OPERATOR(lrn, ops::LRNGradOpMaker); REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad); -REGISTER_OP_CPU_KERNEL( - lrn, ops::LRNKernel); -REGISTER_OP_CPU_KERNEL( - lrn_grad, ops::LRNGradKernel); +REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL(lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 668200cf4ff5d..ba56eeddf89d1 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -356,11 +356,9 @@ REGISTER_OPERATOR(lstm, ops::LSTMGradOpMaker, ops::LSTMGradOpMaker); REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); -REGISTER_OP_CPU_KERNEL( - lstm, - ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CPU_KERNEL( - lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CPU_KERNEL(lstm, + ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL(lstm_grad, + ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index bc064fb61caa4..84e4e5cd2cdf8 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -400,11 +400,9 @@ REGISTER_OPERATOR(lstmp, ops::LSTMPGradMaker, ops::LSTMPGradMaker); REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp); -REGISTER_OP_CPU_KERNEL( - lstmp, - ops::LSTMPKernel, - ops::LSTMPKernel); -REGISTER_OP_CPU_KERNEL( - lstmp_grad, - ops::LSTMPGradKernel, - ops::LSTMPGradKernel); +REGISTER_OP_CPU_KERNEL(lstmp, + 
ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CPU_KERNEL(lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstsq_op.cc b/paddle/fluid/operators/lstsq_op.cc index 792ede9959f77..70ce5082ced30 100644 --- a/paddle/fluid/operators/lstsq_op.cc +++ b/paddle/fluid/operators/lstsq_op.cc @@ -150,7 +150,6 @@ This API processes Lstsq functor for general matrices. namespace ops = paddle::operators; REGISTER_OPERATOR(lstsq, ops::LstsqOp, ops::LstsqOpMaker) -REGISTER_OP_CPU_KERNEL( - lstsq, - ops::LstsqCPUKernel, - ops::LstsqCPUKernel); +REGISTER_OP_CPU_KERNEL(lstsq, + ops::LstsqCPUKernel, + ops::LstsqCPUKernel); diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index d0b44d0ec88f4..82a56af7eb4f1 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -100,7 +100,7 @@ class LstsqCUDAKernel : public framework::OpKernel { true, batch_count, m, - n, + nrhs, k, x_data, x_stride, @@ -137,14 +137,17 @@ class LstsqCUDAKernel : public framework::OpKernel { // Step 2, solve R^H Z = Y Tensor trans_r = dito.Transpose(new_x); + Tensor slice_r = dito.Slice(trans_r, {-2}, {0}, {min_mn}); + Tensor res_r = dito.TrilTriu(slice_r, 0, false); + phi::TriangularSolveKernel( - phi_dev_ctx, trans_r, new_y, true, true, false, solution); + phi_dev_ctx, res_r, new_y, true, true, false, solution); // Step 3, X <- Q Z BatchedOrgqr(dev_ctx, batch_count, n, - n, + m, min_mn, x_data, n, @@ -183,8 +186,6 @@ void BatchedOrmqr( auto handle = dev_ctx.cusolver_dn_handle(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSormqr_bufferSize( handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); auto info = memory::Alloc(dev_ctx, sizeof(int)); int* info_d = reinterpret_cast(info->ptr()); @@ -192,6 +193,11 @@ void BatchedOrmqr( float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; float* other_working_ptr = &other[i * other_stride]; + + handle = dev_ctx.cusolver_dn_handle(); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + // compute ormgr PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnSormqr(handle, @@ -249,8 +255,6 @@ void BatchedOrmqr( auto handle = dev_ctx.cusolver_dn_handle(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDormqr_bufferSize( handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); auto info = memory::Alloc(dev_ctx, sizeof(int)); int* info_d = reinterpret_cast(info->ptr()); @@ -258,6 +262,11 @@ void BatchedOrmqr( double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; double* other_working_ptr = &other[i * other_stride]; + + handle = dev_ctx.cusolver_dn_handle(); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + // compute ormgr PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDormqr(handle, diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index f99e027e9ced2..b3e5894a9451e 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -21,13 +21,13 @@ #include "paddle/fluid/operators/eig_op.h" #include 
"paddle/fluid/operators/math/eigen_values_vectors.h" -#include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" #define EPSILON 1e-6 diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc index 1021b157ba374..1f569950dad52 100644 --- a/paddle/fluid/operators/lu_op.cc +++ b/paddle/fluid/operators/lu_op.cc @@ -114,9 +114,7 @@ class LUKernel : public framework::OpKernel { "lu without pivoting is not implemented on the CPU, " "but got pivots=False")); - math::DeviceIndependenceTensorOperations - helper(ctx); + math::DeviceIndependenceTensorOperations helper(ctx); *out = helper.Transpose(*xin); auto outdims = out->dims(); @@ -235,5 +233,5 @@ REGISTER_OPERATOR(lu_grad, REGISTER_OP_CPU_KERNEL(lu, ops::LUKernel, ops::LUKernel); REGISTER_OP_CPU_KERNEL(lu_grad, - ops::LUGradKernel, - ops::LUGradKernel); + ops::LUGradKernel, + ops::LUGradKernel); diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc index b696f3fbd04bb..4c6b37ed3e55e 100644 --- a/paddle/fluid/operators/lu_unpack_op.cc +++ b/paddle/fluid/operators/lu_unpack_op.cc @@ -186,9 +186,8 @@ REGISTER_OPERATOR(lu_unpack_grad, ops::LU_UnpackGradOpVarTypeInference); REGISTER_OP_CPU_KERNEL(lu_unpack, - ops::LU_UnpackKernel, - ops::LU_UnpackKernel); -REGISTER_OP_CPU_KERNEL( - lu_unpack_grad, - ops::LU_UnpackGradKernel, - ops::LU_UnpackGradKernel); + ops::LU_UnpackKernel, + ops::LU_UnpackKernel); +REGISTER_OP_CPU_KERNEL(lu_unpack_grad, + ops::LU_UnpackGradKernel, + ops::LU_UnpackGradKernel); diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index 4b11497058e43..44f77afee0005 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -181,9 +181,7 @@ REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossGradMaker, ops::MarginRankLossGradMaker); REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); -REGISTER_OP_CPU_KERNEL( - margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_CPU_KERNEL( - margin_rank_loss_grad, - ops::MarginRankLossGradKernel); +REGISTER_OP_CPU_KERNEL(margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CPU_KERNEL(margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/masked_select_op_mlu.cc b/paddle/fluid/operators/masked_select_op_mlu.cc new file mode 100644 index 0000000000000..279096b762ca8 --- /dev/null +++ b/paddle/fluid/operators/masked_select_op_mlu.cc @@ -0,0 +1,204 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class MaskedSelectedMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("X"); + auto mask = ctx.Input("Mask"); + auto out = ctx.Output("Y"); + + auto input_dim = input->dims(); + auto mask_dim = mask->dims(); + PADDLE_ENFORCE_EQ( + input_dim, + mask_dim, + platform::errors::InvalidArgument( + "The dim size of input and mask in OP(masked_selected) " + "must be equal, but got input dim:(%ld), mask dim: " + "(%ld). Please check input " + "value.", + input_dim, + mask_dim)); + + Tensor number(framework::TransToPhiDataType(VT::INT32)); + void* number_ptr = number.mutable_data({1}, ctx.GetPlace()); + + out->Resize(mask->dims()); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc mask_desc(*mask); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Mask(ctx, + CNNL_MASKED_SELECT, + input_desc.get(), + GetBasePtr(input), + mask_desc.get(), + GetBasePtr(mask), + nullptr, + nullptr, + out_desc.get(), + GetBasePtr(out), + static_cast(number_ptr)); + } +}; + +template +class MaskedSelectedGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto mask = ctx.Input("Mask"); + auto y_grad = ctx.Input(framework::GradVarName("Y")); + auto x_grad = ctx.Output(framework::GradVarName("X")); + + auto& dev_ctx = + ctx.template device_context(); + Tensor mask_int32, out_size; + std::vector out_size_vec; + mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); + out_size.mutable_data({1}, ctx.GetPlace()); + + MLUCnnlTensorDesc mask_desc(*mask); + MLUCnnlTensorDesc mask_int32_desc(mask_int32); + MLUCnnlTensorDesc out_size_desc(out_size); + auto cast_type = GetCastDataType(mask->dtype(), DataType::INT32); + MLUCnnl::Cast(ctx, + cast_type, + mask_desc.get(), + GetBasePtr(mask), + mask_int32_desc.get(), + GetBasePtr(&mask_int32)); + + auto mask_int32_dim = phi::vectorize(mask_int32.dims()); + std::vector reduce_dims; + for (size_t i = 0; i < mask_int32_dim.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + + std::string reduce_name = "reduce_sum"; + cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name); + MLUCnnlReduceDesc reduce_desc(reduce_dims, + reduce_op, + ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, + CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(ctx, + true, + reduce_desc.get(), + nullptr, + mask_int32_desc.get(), + GetBasePtr(&mask_int32), + 0, + nullptr, + nullptr, + out_size_desc.get(), + GetBasePtr(&out_size)); + + paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); + dev_ctx.Wait(); + + Tensor mask_int32_tmp; + mask_int32_tmp.ShareDataWith(mask_int32); + mask_int32_tmp.Resize({mask_int32.numel()}); + Tensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)), + indices_int32(framework::TransToPhiDataType(VT::INT32)); + topk_v2_out.mutable_data({mask_int32.numel()}, ctx.GetPlace()); + indices_int32.mutable_data({mask_int32.numel()}, ctx.GetPlace()); + + MLUCnnlTensorDesc topk_v2_out_desc(topk_v2_out); + MLUCnnlTensorDesc indices_int32_desc(indices_int32); + MLUCnnlTensorDesc mask_int32_tmp_desc(mask_int32_tmp); + + const int dim = 0; + MLUCnnl::TopK(ctx, + mask_int32.numel(), + dim, + true, + false, + mask_int32_tmp_desc.get(), + GetBasePtr(&mask_int32_tmp), + 
topk_v2_out_desc.get(), + GetBasePtr(&topk_v2_out), + indices_int32_desc.get(), + GetBasePtr(&indices_int32)); + + auto stream = ctx.template device_context().stream(); + + Tensor indices_int32_out; + indices_int32_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); + memory::Copy(ctx.GetPlace(), + GetBasePtr(&indices_int32_out), + ctx.GetPlace(), + GetBasePtr(&indices_int32), + out_size_vec[0] * sizeof(int32_t), + stream); + + Tensor y_grad_tmp_out; + y_grad_tmp_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); + MLUCnnlTensorDesc y_grad_tmp_out_desc(y_grad_tmp_out); + memory::Copy(ctx.GetPlace(), + GetBasePtr(&y_grad_tmp_out), + ctx.GetPlace(), + GetBasePtr(y_grad), + out_size_vec[0] * sizeof(T), + stream); + + Tensor indices_int32_tmp; + indices_int32_tmp.ShareDataWith(indices_int32_out); + indices_int32_tmp.Resize({out_size_vec[0], 1}); + MLUCnnlTensorDesc indices_int32_tmp_desc(indices_int32_tmp); + + const cnnlScatterNdMode_t mode = CNNL_SCATTERND_UPDATE; + x_grad->Resize({x_grad->numel()}); + x_grad->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc x_grad_desc(*x_grad); + MLUCnnl::ScatterNd(ctx, + mode, + indices_int32_tmp_desc.get(), + GetBasePtr(&indices_int32_tmp), + y_grad_tmp_out_desc.get(), + GetBasePtr(&y_grad_tmp_out), + nullptr, + nullptr, + x_grad_desc.get(), + GetBasePtr(x_grad)); + x_grad->Resize(mask->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(masked_select, + ops::MaskedSelectedMLUKernel, + ops::MaskedSelectedMLUKernel, + ops::MaskedSelectedMLUKernel); + +REGISTER_OP_MLU_KERNEL(masked_select_grad, + ops::MaskedSelectedGradMLUKernel, + ops::MaskedSelectedGradMLUKernel, + ops::MaskedSelectedGradMLUKernel); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index d6a39faea519c..992d9e9f276c4 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -273,7 +273,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { memset( bottom_l_trans_data, 0.0, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); call_gemm(blas, CblasNoTrans, @@ -295,7 +295,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { const auto* l_t_data = bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; const auto* r_data = bottom_r_data + offset_r[b] * dim_in; - auto blas_2 = phi::funcs::GetBlas(ctx); + auto blas_2 = phi::funcs::GetBlas(ctx); call_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, @@ -388,7 +388,7 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { } } - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); auto* t_data = w->data(); auto* d_w = ctx.Output(framework::GradVarName("W")); @@ -456,10 +456,8 @@ REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad); REGISTER_OP_CPU_KERNEL( match_matrix_tensor, - ops::CPUMatchMatrixTensorOPKernel); + ops::CPUMatchMatrixTensorOPKernel); REGISTER_OP_CPU_KERNEL( match_matrix_tensor_grad, - ops::CPUMatchMatrixTensorOPGradKernel); + ops::CPUMatchMatrixTensorOPGradKernel); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 3f7206ac08bf2..927feedd1851e 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -54,7 +54,6 
@@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_solve) cc_test( selected_rows_functor_test diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index cda085aabe99b..2b607ade728c4 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -13,26 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" - +#include "paddle/phi/backends/cpu/cpu_context.h" namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { template -class BeamSearchFunctor { +class BeamSearchFunctor { public: - void operator()(const platform::CPUDeviceContext &context, + void operator()(const phi::CPUContext &context, const framework::LoDTensor *pre_ids, const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids, @@ -308,10 +301,10 @@ class BeamSearchFunctor { } }; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index b51b9ee0d675a..f6b0349f1ca28 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -230,8 +230,7 @@ void TestBeamSearch(); + TestBeamSearch(); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 4ce2db1e579db..603584629cc92 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #ifdef PADDLE_WITH_ASCEND_CL @@ -28,13 +29,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -44,9 +38,9 @@ namespace math { * each dimension must be the same, except the axis dimension. */ template -class ConcatFunctor { +class ConcatFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const std::vector& input, int axis, framework::Tensor* output) { @@ -60,9 +54,9 @@ class ConcatFunctor { * each dimension must be the same, except the axis dimension. 
*/ template -class SplitFunctor { +class SplitFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const std::vector& ref_inputs, const int axis, @@ -341,9 +335,9 @@ class SplitFunctor { }; #endif -#define DEFINE_FUNCTOR(type) \ - template class ConcatFunctor; \ - template class SplitFunctor; +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; FOR_ALL_TYPES(DEFINE_FUNCTOR); diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 3ae314e55d87d..4f0fee91e5919 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -493,8 +493,7 @@ void TestConcatMain(); + TestConcatMain(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TestConcatMain(); diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc index 927d610e2ce47..beee93ae0166c 100644 --- a/paddle/fluid/operators/math/context_project.cc +++ b/paddle/fluid/operators/math/context_project.cc @@ -13,19 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/context_project.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle +#include "paddle/phi/backends/cpu/cpu_context.h" namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc index 4a3da2ef86d37..0daf46d36fd21 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cc +++ b/paddle/fluid/operators/math/cos_sim_functor.cc @@ -14,19 +14,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { - template -struct CosSimDyFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct CosSimDyFunctor { + void operator()(const phi::CPUContext& ctx, const T* x_norm, const T* y_norm, const T* x, @@ -53,8 +46,8 @@ struct CosSimDyFunctor { } }; -template struct CosSimDyFunctor; -template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index 45c7e47b810ac..17ff6aff6f93d 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -17,12 +17,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -129,9 +123,6 @@ void CrossEntropyFunctor::operator()( } } -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; - template class CrossEntropyFunctor; template class CrossEntropyFunctor; } // namespace math diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index dcbd66c12b91e..a056341c3bf3c 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -66,7 +66,7 @@ struct MatrixEighFunctor { // symmetric matrices, and uses the variable has_vectors to // control whether to return the eigenvectors. template -struct MatrixEighFunctor { +struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, const Tensor &input, @@ -78,8 +78,7 @@ struct MatrixEighFunctor { auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = - math::DeviceIndependenceTensorOperations( - ctx); + math::DeviceIndependenceTensorOperations(ctx); Tensor input_trans; // lapack is a column-major storge, transpose make the input to diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index d8fa1b5a869b1..857d870847ee8 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -15,19 +15,13 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::CPUDeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::CPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -35,7 +29,7 @@ struct GRUUnitFunctor { const detail::ActivationType active_gate, bool origin_mode) { #if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -89,8 +83,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::CPUDeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::CPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -106,7 +100,7 @@ struct GRUUnitGradFunctor { batch_size, active_node, origin_mode); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, true, @@ -181,15 +175,15 @@ struct GRUUnitGradFunctor { }; template -struct GRUUnitFunctorV2 { - static void compute(const platform::CPUDeviceContext &context, +struct GRUUnitFunctorV2 { + static void compute(const phi::CPUContext &context, GRUMetaValue value, int frame_size, int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { #if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { 
blas.GEMM(CblasNoTrans, CblasTrans, @@ -232,8 +226,8 @@ struct GRUUnitFunctorV2 { }; template -struct GRUUnitGradFunctorV2 { - static void compute(const platform::CPUDeviceContext &context, +struct GRUUnitGradFunctorV2 { + static void compute(const phi::CPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -251,7 +245,7 @@ struct GRUUnitGradFunctorV2 { batch_size, active_node, active_gate); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (grad.prev_out_grad && value.prev_out_value) { // update prev_out_grad blas.GEMM(false, @@ -355,15 +349,15 @@ struct GRUUnitGradFunctorV2 { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; -template struct GRUUnitFunctorV2; -template struct GRUUnitFunctorV2; -template struct GRUUnitGradFunctorV2; -template struct GRUUnitGradFunctorV2; +template struct GRUUnitFunctorV2; +template struct GRUUnitFunctorV2; +template struct GRUUnitGradFunctorV2; +template struct GRUUnitGradFunctorV2; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index e7ed2cbf67563..9192badedcfff 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -16,12 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace phi { class CPUContext; } // namespace phi @@ -166,24 +160,12 @@ class Col2ImFunctor; -template class Im2ColFunctor; template class Im2ColFunctor; template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; template class Col2ImFunctor; @@ -353,24 +335,12 @@ class Col2ImFunctor; -template class Im2ColFunctor; template class Im2ColFunctor; template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; template class Col2ImFunctor; diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index fc045ba8be458..93ee9d3a15bad 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -341,7 +341,7 @@ void testIm2col(); + testIm2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) testIm2col(); @@ -350,7 +350,7 @@ TEST(math, im2col) { #define PREPARE_IM2COL_CPU \ paddle::platform::CPUPlace place; \ - paddle::platform::CPUDeviceContext context(place); \ + phi::CPUContext context(place); \ paddle::framework::Tensor input; \ paddle::framework::Tensor out; \ paddle::framework::Tensor ref; \ @@ -367,7 +367,7 @@ TEST(math, im2col) { ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ paddle::operators::math::Im2ColFunctor< \ paddle::operators::math::ColFormat::kCFO, \ - paddle::platform::CPUDeviceContext, \ + phi::CPUContext, \ float> \ im2col diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc deleted file mode 100644 index 5eff0a5d4575b..0000000000000 --- a/paddle/fluid/operators/math/math_function.cc +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/math_function.h" - -#ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" -#endif - -#ifdef PADDLE_USE_OPENBLAS -#include -#endif - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/math/math_function_impl.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace operators { -namespace math { - -using float16 = paddle::platform::float16; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -#ifdef PADDLE_WITH_XPU -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; -#endif - -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose, \ - RANK>; - -DEFINE_CPU_TRANS(1); -DEFINE_CPU_TRANS(2); -DEFINE_CPU_TRANS(3); -DEFINE_CPU_TRANS(4); -DEFINE_CPU_TRANS(5); -DEFINE_CPU_TRANS(6); - -template -struct TransposeNormal { - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& in, - framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = phi::stride(in.dims()); - auto out_stride = phi::stride(out->dims()); - const T* in_ptr = in.data(); - T* out_ptr = out->data(); - - auto transpose_helper = [&](int64_t beg, int64_t end) { - for (int64_t out_idx = beg; out_idx < end; ++out_idx) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - // calculate the input index - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride[i]; - tmp_idx -= coordinate * out_stride[i]; - in_idx += coordinate * in_stride[axis[i]]; - } 
- out_ptr[out_idx] = in_ptr[in_idx]; - } - }; - transpose_helper(0, out->numel()); - } -}; - -// define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal - -DEFINE_CPU_TRANS_NORMAL(platform::float16); -DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); -DEFINE_CPU_TRANS_NORMAL(float); -DEFINE_CPU_TRANS_NORMAL(double); -DEFINE_CPU_TRANS_NORMAL(int); -DEFINE_CPU_TRANS_NORMAL(int64_t); -DEFINE_CPU_TRANS_NORMAL(bool); -DEFINE_CPU_TRANS_NORMAL(int16_t); -DEFINE_CPU_TRANS_NORMAL(uint8_t); -DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(platform::complex); -DEFINE_CPU_TRANS_NORMAL(platform::complex); - -struct TensorSetConstantCPU { - TensorSetConstantCPU(framework::Tensor* tensor, float value) - : tensor_(tensor), value_(value) {} - template - void apply() const { - auto cpu = platform::CPUPlace(); - auto* begin = tensor_->mutable_data(cpu); - std::fill(begin, begin + tensor_->numel(), static_cast(value_)); - } - framework::Tensor* tensor_; - float value_; -}; - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW( - platform::errors::Unimplemented("NPUPinnedPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); -} - -struct TensorSetConstantWithPlace : public boost::static_visitor { - TensorSetConstantWithPlace(const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) - : context_(context), tensor_(tensor), value_(value) {} - - template - void operator()(Place place) const { - set_constant_with_place(context_, tensor_, value_); - } - - const platform::DeviceContext& context_; - framework::Tensor* tensor_; - float value_; -}; - -void set_constant(const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - TensorSetConstantWithPlace func(context, tensor, value); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // tensor->place().apply_visitor(func); - paddle::platform::VisitPlace(tensor->place(), func); -#else - 
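The TransposeNormal functor removed with math_function.cc walks every flattened output index, decodes its coordinates with the output strides, and re-encodes them with the permuted input strides. A self-contained sketch of that index arithmetic (plain std::vector, no Paddle tensors):

```cpp
// Stride-based transpose for an arbitrary permutation `axis`, mirroring the
// deleted transpose_helper loop: peel off one coordinate per dimension with
// the output strides, then accumulate the input offset with in_stride[axis[i]].
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> strides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> s(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i)
    s[i] = s[i + 1] * dims[i + 1];
  return s;
}

std::vector<float> transpose(const std::vector<float>& in,
                             const std::vector<int64_t>& in_dims,
                             const std::vector<int>& axis) {
  std::vector<int64_t> out_dims(in_dims.size());
  for (std::size_t i = 0; i < axis.size(); ++i) out_dims[i] = in_dims[axis[i]];
  const auto in_stride = strides(in_dims);
  const auto out_stride = strides(out_dims);
  std::vector<float> out(in.size());
  for (int64_t out_idx = 0; out_idx < static_cast<int64_t>(out.size()); ++out_idx) {
    int64_t tmp = out_idx, in_idx = 0;
    for (std::size_t i = 0; i < axis.size(); ++i) {
      const int64_t coord = tmp / out_stride[i];  // i-th output coordinate
      tmp -= coord * out_stride[i];
      in_idx += coord * in_stride[axis[i]];       // same coordinate in the input
    }
    out[out_idx] = in[in_idx];
  }
  return out;
}

int main() {
  // 2x3 matrix transposed to 3x2 with axis = {1, 0}.
  std::vector<float> m = {1, 2, 3, 4, 5, 6};
  for (float v : transpose(m, {2, 3}, {1, 0})) std::cout << v << ' ';
  std::cout << '\n';  // 1 4 2 5 3 6
}
```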
func(platform::CPUPlace()); -#endif -} - -template -struct RowwiseAdd { - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { - auto in_dims = input.dims(); - auto out_dims = output->dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ( - vector.numel(), - size, - platform::errors::InvalidArgument( - "The input vector size" - " should be equal to the size of each row of input tensor." - " Expected vector size=%d, but received %d", - size, - vector.numel())); - const char* in_dims_cstr = in_dims.to_str().c_str(); - const char* out_dims_cstr = out_dims.to_str().c_str(); - PADDLE_ENFORCE_EQ(out_dims, - in_dims, - platform::errors::InvalidArgument( - "The output tensor shape should be same as the input" - " tensor shape. Expected output tensor shape: %s," - " but received %s", - in_dims_cstr, - out_dims_cstr)); - - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(vector); - auto out = framework::EigenMatrix::From(*output); - - for (int64_t i = 0; i < in_dims[0]; ++i) { - out.chip(i, 0) = in.chip(i, 0) + vec; - } - } -}; - -template struct RowwiseAdd; -template struct RowwiseAdd; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -template -struct ElementwiseAddTo { - void operator()(platform::CPUDeviceContext* ctx, - const framework::Tensor& src, - framework::Tensor* dst) { - auto in = framework::EigenVector::Flatten(src); - auto out = framework::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } -}; - -template struct ElementwiseAddTo; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 133680ca9a8c7..0648f2497d9d7 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -19,7 +19,7 @@ namespace operators { namespace math { template -struct MatrixBitCodeFunctorAdd : public boost::static_visitor { +struct MatrixBitCodeFunctorAdd { const framework::Tensor &vec_; framework::Tensor *tmat_; @@ -51,7 +51,7 @@ void MatrixBitCodeFunctor::Add(const framework::Tensor &vec, } template -struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor { +struct MatrixBitCodeFunctorAddGrad { const framework::Tensor &tmat_; framework::Tensor *vec_; MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat, @@ -83,7 +83,7 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorSum : public boost::static_visitor { +struct MatrixBitCodeFunctorSum { const framework::Tensor &tmat_; framework::Tensor *sum_; T scale_sum_; @@ -125,7 +125,7 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorMul : public boost::static_visitor { +struct MatrixBitCodeFunctorMul { framework::Tensor *tmat_; const framework::Tensor &weight_; const framework::Tensor &input_; @@ -137,8 +137,7 @@ struct MatrixBitCodeFunctorMul : public boost::static_visitor { template void operator()(const CodeTable &code_table) { - auto blas = phi::funcs::GetBlas( - platform::CPUDeviceContext()); + auto blas = phi::funcs::GetBlas(phi::CPUContext()); size_t 
num_samples = tmat_->dims()[0]; size_t tmat_width = tmat_->dims()[1]; size_t input_width = input_.dims()[1]; @@ -175,7 +174,7 @@ class ReservedVector : public std::vector { }; template -struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor { +struct MatrixBitCodeFunctorMulGradWeight { const framework::Tensor &tmat_; framework::Tensor *weight_; const framework::Tensor &input_; @@ -185,8 +184,7 @@ struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor { : tmat_(tmat), weight_(weight), input_(input) {} template void operator()(const CodeTable &code_table) { - auto blas = phi::funcs::GetBlas( - platform::CPUDeviceContext()); + auto blas = phi::funcs::GetBlas(phi::CPUContext()); size_t num_samples = tmat_.dims()[0]; size_t input_width = input_.dims()[1]; size_t tmat_width = tmat_.dims()[1]; @@ -226,8 +224,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorMulGradWeightSR - : public boost::static_visitor { +struct MatrixBitCodeFunctorMulGradWeightSR { const framework::Tensor &tmat_; phi::SelectedRows *weight_; const framework::Tensor &input_; @@ -239,8 +236,7 @@ struct MatrixBitCodeFunctorMulGradWeightSR template void operator()(const CodeTable &code_table) { - auto blas = phi::funcs::GetBlas( - platform::CPUDeviceContext()); + auto blas = phi::funcs::GetBlas(phi::CPUContext()); size_t num_samples = tmat_.dims()[0]; size_t input_width = input_.dims()[1]; size_t tmat_width = tmat_.dims()[1]; @@ -283,7 +279,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor { +struct MatrixBitCodeFunctorMulGradError { const framework::Tensor &tmat_; const framework::Tensor &weight_; framework::Tensor *input_; @@ -327,7 +323,7 @@ void MatrixBitCodeFunctor::MulGradError(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorSub : public boost::static_visitor { +struct MatrixBitCodeFunctorSub { framework::Tensor *tmat_; explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {} diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 780003c1b451e..7c9d94aa8713b 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/phi/kernels/funcs/blas/blas.h" #if defined(_WIN32) diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc deleted file mode 100644 index 4d38dc7137935..0000000000000 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/matrix_solve.h" - -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixSolveFunctor { - public: - void operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* out) { - compute_solve_eigen(dev_ctx, a, b, out); - } -}; - -template class MatrixSolveFunctor; -template class MatrixSolveFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc deleted file mode 100644 index 41b14c07b7360..0000000000000 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ /dev/null @@ -1,189 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_solve.h" - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixSolveFunctor; - -template -class MatrixSolveFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* out) { -#ifndef PADDLE_WITH_HIP - - // solve the equation: Ax = B, - // use cuBlas cublasgetrfBatched funcion to performs the LU - // factorization of each matrix A, - // and then use cuBlas cublasgetriBatched function to solve the - // equation after LU factorization. - // ref: - // https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched - const auto& a_dims = a.dims(); - const int a_rank = a_dims.size(); - int n = a_dims[a_rank - 1]; - int lda = n; - int batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; - - const auto& b_dims = b.dims(); - const int b_rank = b_dims.size(); - int nrhs = b_dims[b_rank - 1]; - int ldb = b_dims[b_rank - 2]; - - // make sure the out dims is right - out->Resize(b_dims); - out->mutable_data(context.GetPlace()); - - // copy input A to a temporary tensor tmp_a, - // LU factorization, written back to original matrix A, so in the beginning, - // it's necessary to create a temporary tensor tmp_a. - Tensor tmp_a(a.dtype()); - tmp_a.Resize(a.dims()); - tmp_a.mutable_data(context.GetPlace()); - framework::TensorCopy(a, context.GetPlace(), &tmp_a); - - // copy input B to a temporary tensor tmp_b, and transpose tmp_b, - // because cuBlas assumes column-major while Paddle uses row-majar. 
- Tensor tmp_b(b.type()); - const auto& new_dims_vec = getNewDimsVec(b_dims); - tmp_b.Resize(phi::make_ddim(new_dims_vec)); - tmp_b.mutable_data(context.GetPlace()); - phi::funcs::TransposeNormal trans; - std::vector new_axis = getNewAxis(b_rank); - trans(context, b, &tmp_b, new_axis); - - const T* a_data_in_gpu = tmp_a.data(); - const T* b_data_in_gpu = tmp_b.data(); - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = a_data_in_gpu + i * n * n; - cpu_ptrs[i + batch_size] = b_data_in_gpu + i * n * nrhs; - } - - // Copy the addresses of A and tmp_b from host to device. - memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), - tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), - static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), - context.stream()); - - T** gpu_tmp_b_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for BatchedGETRF's info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - // only for singular checking - std::vector info; - info.resize(batch_size); - - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - - // This function performs the LU factorization of each matrix A by the - // equation A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - blas.BatchedGETRF(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, - gpu_info_ptr, - batch_size); - - // check whether BatchedGETRF is executed successfully or not - memory::Copy(platform::CPUPlace(), - info.data(), - context.GetPlace(), - gpu_info_ptr, - sizeof(int) * batch_size, - context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], - 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, - info[i], - info[i])); - } - - // hold the result code from BatchedGETRS - int host_info = 0; - - // to solve the equation after LU factorization - CBLAS_TRANSPOSE transA = CblasTrans; - blas.BatchedGETRS(transA, - n, - nrhs, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - lda, - gpu_pivot_ptr, - gpu_tmp_b_ptrs, - ldb, - &host_info, - batch_size); - - // check whether BatchedGETRS is executed successfully or not - PADDLE_ENFORCE_EQ(host_info, - 0, - platform::errors::InvalidArgument( - "The [%d]'th argument to cublas*getrsBatched had " - "an illegal value.", - -host_info)); - - // transpose tmp_b to get the final result in row-major form. 
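The comments in the deleted CUDA MatrixSolveFunctor describe a two-stage algorithm: batched LU factorization of A (getrfBatched) followed by batched triangular solves (getrsBatched), with transposes to bridge cuBLAS's column-major layout and Paddle's row-major tensors. For a single small system, the same LU-then-substitute flow looks like this in plain C++ (partial pivoting, no batching, purely illustrative, not the cuBLAS path):

```cpp
// Solve A x = b for a small n x n row-major A: LU factorize with partial
// pivoting (the "getrf" step), then forward/backward substitution (the
// "getrs" step). Only a zero-pivot check, so this is a sketch, not production code.
#include <cmath>
#include <iostream>
#include <utility>
#include <vector>

std::vector<double> lu_solve(std::vector<double> A, std::vector<double> b, int n) {
  // Step 1: LU factorization, L stored below the diagonal, U on and above it.
  for (int k = 0; k < n; ++k) {
    int p = k;
    for (int i = k + 1; i < n; ++i)
      if (std::fabs(A[i * n + k]) > std::fabs(A[p * n + k])) p = i;
    if (p != k) {
      for (int j = 0; j < n; ++j) std::swap(A[k * n + j], A[p * n + j]);
      std::swap(b[k], b[p]);
    }
    if (A[k * n + k] == 0.0) { std::cerr << "singular U\n"; return {}; }
    for (int i = k + 1; i < n; ++i) {
      const double m = A[i * n + k] / A[k * n + k];
      A[i * n + k] = m;
      for (int j = k + 1; j < n; ++j) A[i * n + j] -= m * A[k * n + j];
    }
  }
  // Step 2: forward substitution with unit-lower L, then back substitution with U.
  for (int i = 1; i < n; ++i)
    for (int j = 0; j < i; ++j) b[i] -= A[i * n + j] * b[j];
  for (int i = n - 1; i >= 0; --i) {
    for (int j = i + 1; j < n; ++j) b[i] -= A[i * n + j] * b[j];
    b[i] /= A[i * n + i];
  }
  return b;
}

int main() {
  // A = [[2, 1], [1, 3]], b = [3, 5]  ->  x = [0.8, 1.4]
  for (double v : lu_solve({2, 1, 1, 3}, {3, 5}, 2)) std::cout << v << ' ';
  std::cout << '\n';
}
```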
- phi::funcs::TransposeNormal trans2; - trans2(context, tmp_b, out, new_axis); - -#else - compute_solve_eigen(context, a, b, out); -#endif - } -}; - -template class MatrixSolveFunctor; -template class MatrixSolveFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 7729b86cc3e0b..2205ed51e1913 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -109,11 +109,6 @@ void MaxOutGradFunctor::operator()( } } -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; - template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc index 16342493e4597..18321cf9b9ece 100644 --- a/paddle/fluid/operators/math/sample_prob.cc +++ b/paddle/fluid/operators/math/sample_prob.cc @@ -14,19 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sample_prob.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { -namespace math { - -template class SampleWithProb; -template class SampleWithProb; - -} // namespace math +namespace math {} // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 81b0e9102bbac..9ec1172c410d8 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -25,8 +25,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::CPUDeviceContext& context, +struct SelectedRowsAdd { + void operator()(const phi::CPUContext& context, const phi::SelectedRows& input1, const phi::SelectedRows& input2, phi::SelectedRows* output) { @@ -106,12 +106,12 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; template -struct SelectedRowsAddTensor { - void operator()(const platform::CPUDeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const phi::CPUContext& context, const phi::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { @@ -156,7 +156,7 @@ struct SelectedRowsAddTensor { in1_row_numel, output->numel() / in1_height)); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -175,12 +175,12 @@ struct SelectedRowsAddTensor { } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::CPUDeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const phi::CPUContext& context, const phi::SelectedRows& input1, const int64_t input2_offset, phi::SelectedRows* input2) { @@ -225,14 +225,14 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct 
SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; template -struct SelectedRowsSumTo { - void operator()(const platform::CPUDeviceContext& context, +struct SelectedRowsSumTo { + void operator()(const phi::CPUContext& context, const std::vector& input1, const std::vector& input2_offsets, phi::SelectedRows* input2) { @@ -262,7 +262,7 @@ struct SelectedRowsSumTo { auto* in2_value = input2->mutable_value(); auto* in2_data = in2_value->data(); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); size_t offset = 0u; for (size_t i = 0u; i != input1.size(); ++i) { auto& in_value = input1[i]->value(); @@ -273,53 +273,8 @@ struct SelectedRowsSumTo { } }; -template struct SelectedRowsSumTo; -template struct SelectedRowsSumTo; - -template -struct SelectedRowsAddToTensor { - void operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input1, - framework::Tensor* input2) { - if (UNLIKELY(input1.rows().size() == 0)) { - LOG(WARNING) << "input selected rows is empty!"; - return; - } - auto in1_height = input1.height(); - const auto& in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ( - in1_height, - in2_dims[0], - platform::errors::InvalidArgument("The two inputs height must be equal." - "But received first input height = " - "[%d], second input height = [%d]", - in1_height, - in2_dims[0])); - - auto& in1_value = input1.value(); - auto& in1_rows = input1.rows(); - - int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ( - in1_row_numel, - input2->numel() / in1_height, - platform::errors::InvalidArgument( - "The two inputs width must be equal." - "But received first input width = [%d], second input width = [%d]", - in1_row_numel, - input2->numel() / in1_height)); - - auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); - - for (size_t i = 0; i < in1_rows.size(); i++) { - for (int64_t j = 0; j < in1_row_numel; j++) { - input2_data[in1_rows[i] * in1_row_numel + j] += - in1_data[i * in1_row_numel + j]; - } - } - } -}; +template struct SelectedRowsSumTo; +template struct SelectedRowsSumTo; template struct SelectedRowsAddToTensor { @@ -366,13 +321,6 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; - template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; @@ -582,34 +530,6 @@ struct MergeAddImpl { } }; -template -struct MergeAdd { - // unary functor, merge by adding duplicated rows in - // the input SelectedRows object. 
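The CPUDeviceContext specialization of SelectedRowsAddToTensor deleted here duplicated the body of the specialization that stays: it scatter-adds every sparse row into the matching row of a dense tensor. The same logic on flat std::vector buffers, not the SelectedRows API:

```cpp
// Scatter-add sparse rows into a dense [height, row_numel] buffer, mirroring
// the removed double loop over in1_rows and row_numel. Plain containers only.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

void selected_rows_add_to_tensor(const std::vector<int64_t>& rows,   // sparse row indices
                                 const std::vector<float>& values,   // rows.size() x row_numel values
                                 std::size_t row_numel,
                                 std::vector<float>* dense) {        // height x row_numel, updated in place
  for (std::size_t i = 0; i < rows.size(); ++i)
    for (std::size_t j = 0; j < row_numel; ++j)
      (*dense)[rows[i] * row_numel + j] += values[i * row_numel + j];
}

int main() {
  std::vector<float> dense(3 * 2, 1.0f);          // 3 x 2 dense tensor of ones
  selected_rows_add_to_tensor({0, 2, 0},          // row index 0 appears twice
                              {1, 1, 2, 2, 3, 3}, // three sparse rows of width 2
                              2, &dense);
  for (std::size_t i = 0; i < dense.size(); ++i)
    std::cout << dense[i] << ((i + 1) % 2 == 0 ? '\n' : ' ');
  // 5 5   (1 + 1 + 3)
  // 1 1
  // 3 3   (1 + 2)
}
```

The MergeAdd and MergeAverage functors further down follow the same idea, except duplicated row indices are first folded into a single output row before the reduction.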
- phi::SelectedRows operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input, - const bool sorted_result) { - return MergeAddImpl()( - context, input, sorted_result); - } - - void operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input, - phi::SelectedRows* output, - const bool sorted_result) { - MergeAddImpl()( - context, input, output, sorted_result); - } - - void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - phi::SelectedRows* output, - const bool sorted_result) { - MergeAddImpl()( - context, inputs, output, sorted_result); - } -}; - template struct MergeAdd { // unary functor, merge by adding duplicated rows in @@ -635,10 +555,8 @@ struct MergeAdd { } }; -#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \ - template struct MergeAddImpl; \ - template struct MergeAddImpl; \ - template struct MergeAdd; \ +#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \ + template struct MergeAddImpl; \ template struct MergeAdd; TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(float) @@ -816,15 +734,15 @@ struct MergeAdd { #endif template -struct MergeAverage { - phi::SelectedRows operator()(const platform::CPUDeviceContext& context, +struct MergeAverage { + phi::SelectedRows operator()(const phi::CPUContext& context, const phi::SelectedRows& input) { phi::SelectedRows out; (*this)(context, input, &out); return out; } - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const phi::SelectedRows& input, phi::SelectedRows* output) { std::vector inputs; @@ -832,7 +750,7 @@ struct MergeAverage { (*this)(context, inputs, output); } - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const std::vector& inputs, phi::SelectedRows* output) { if (inputs.size() == 0) { @@ -885,7 +803,7 @@ struct MergeAverage { out.set_rows(merge_rows); - phi::funcs::SetConstant constant_functor; + phi::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), 0.0); std::unordered_map rows_to_id; @@ -893,7 +811,7 @@ struct MergeAverage { rows_to_id[merge_rows[i]] = i; } - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); for (auto* input : inputs) { if (input->rows().size() == 0) { continue; @@ -923,14 +841,14 @@ struct MergeAverage { template struct MergeAdd; #endif -template struct MergeAverage; -template struct MergeAverage; -template struct MergeAverage; -template struct MergeAverage; +template struct MergeAverage; +template struct MergeAverage; +template struct MergeAverage; +template struct MergeAverage; template -struct UpdateToTensor { - void operator()(const platform::CPUDeviceContext& context, +struct UpdateToTensor { + void operator()(const phi::CPUContext& context, const ScatterOps& op, const phi::SelectedRows& input1, framework::Tensor* input2) { diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index d0383ee5fc21d..ecb8aa7824724 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -19,8 +19,8 @@ limitations under the License. 
*/ TEST(selected_rows_functor, cpu_add) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -48,9 +48,7 @@ TEST(selected_rows_functor, cpu_add) { // simplely concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); - paddle::operators::math::SelectedRowsAdd - add_functor; + paddle::operators::math::SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -90,9 +88,8 @@ TEST(selected_rows_functor, cpu_add) { new paddle::framework::Tensor()}; tensor2->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); - paddle::operators::math:: - SelectedRowsAddTensor - add_tensor_functor; + paddle::operators::math::SelectedRowsAddTensor + add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); auto* tensor2_data = tensor2->data(); @@ -114,8 +111,8 @@ TEST(selected_rows_functor, cpu_add) { TEST(selected_rows_functor, cpu_add_to) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -144,8 +141,7 @@ TEST(selected_rows_functor, cpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); - paddle::operators::math::SelectedRowsAddTo + paddle::operators::math::SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -183,9 +179,8 @@ TEST(selected_rows_functor, cpu_add_to) { tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - paddle::operators::math:: - SelectedRowsAddToTensor - add_to_tensor_functor; + paddle::operators::math::SelectedRowsAddToTensor + add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); auto* tensor1_data = tensor1->data(); @@ -207,8 +202,8 @@ TEST(selected_rows_functor, cpu_add_to) { TEST(selected_rows_functor, cpu_merge_average_float) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -221,9 +216,8 @@ TEST(selected_rows_functor, cpu_merge_average_float) { cpu_place); functor(ctx, in_value, 1.0); - paddle::operators::math::scatter:: - MergeAverage - merge_average_functor; + paddle::operators::math::scatter::MergeAverage + merge_average_functor; phi::SelectedRows output = merge_average_functor(ctx, *selected_rows); auto out_height = output.height(); @@ -243,8 +237,8 @@ TEST(selected_rows_functor, cpu_merge_average_float) { TEST(selected_rows_functor, cpu_merge_add_float) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -259,8 +253,7 @@ TEST(selected_rows_functor, cpu_merge_add_float) { std::unique_ptr output{new phi::SelectedRows()}; - paddle::operators::math::scatter::MergeAdd + paddle::operators::math::scatter::MergeAdd merge_add_functor; merge_add_functor(ctx, 
*selected_rows, output.get()); @@ -281,8 +274,8 @@ TEST(selected_rows_functor, cpu_merge_add_float) { TEST(selected_rows_functor, cpu_merge_add_int) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -297,8 +290,7 @@ TEST(selected_rows_functor, cpu_merge_add_int) { std::unique_ptr output{new phi::SelectedRows()}; - paddle::operators::math::scatter::MergeAdd + paddle::operators::math::scatter::MergeAdd merge_add_functor; merge_add_functor(ctx, *selected_rows, output.get()); @@ -319,8 +311,8 @@ TEST(selected_rows_functor, cpu_merge_add_int) { TEST(selected_rows_functor, cpu_merge_add_multi) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant set_const; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -345,8 +337,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { std::unique_ptr output{new phi::SelectedRows()}; output->set_height(height); - paddle::operators::math::scatter::MergeAdd + paddle::operators::math::scatter::MergeAdd merge_add_functor; std::vector inputs; @@ -370,8 +361,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant set_const; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -396,8 +387,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { std::unique_ptr output{new phi::SelectedRows()}; output->set_height(height); - paddle::operators::math::scatter::MergeAdd + paddle::operators::math::scatter::MergeAdd merge_add_functor; std::vector inputs; @@ -427,8 +417,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { TEST(selected_rows_functor, cpu_sum_to) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; std::vector rows1{0, 4, 7}; @@ -454,8 +444,7 @@ TEST(selected_rows_functor, cpu_sum_to) { auto* out_value = output->mutable_value(); // simplely concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); - paddle::operators::math::SelectedRowsSumTo + paddle::operators::math::SelectedRowsSumTo sum_to_functor; sum_to_functor(ctx, std::vector( @@ -490,9 +479,8 @@ TEST(selected_rows_functor, cpu_sum_to) { new paddle::framework::Tensor()}; tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - paddle::operators::math:: - SelectedRowsAddToTensor - add_to_tensor_functor; + paddle::operators::math::SelectedRowsAddToTensor + add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); auto* tensor1_data = tensor1->data(); // row0: 1.0 + 2.0 + 3.0 diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 1a952bbb62d52..826eda5559a46 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -20,13 +20,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace 
framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -101,66 +94,6 @@ static void fast_mem_init(void* dest, } } -template -class PaddingLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& seq_tensor, - framework::LoDTensor* pad_tensor, - const framework::LoDTensor& pad_value, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_lod = seq_tensor.lod(); - const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; - const auto& seq_tensor_dims = seq_tensor.dims(); - const auto& pad_tensor_dims = pad_tensor->dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor.numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - - PADDLE_ENFORCE_EQ( - pad_value.numel() == 1 || pad_value.numel() == step_width, - true, - platform::errors::InvalidArgument( - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width', but got %ld != 1 and %ld. Please check the input " - "value.", - pad_value.numel(), - step_width)); - - // fill padding value - T* pad_data = pad_tensor->data(); - const T* pad_value_data = pad_value.data(); - if (pad_value.numel() == 1) { - fast_mem_init( - pad_data, pad_tensor->numel(), pad_value_data, sizeof(T)); - } else { - for (int i = 0; i < pad_tensor->numel(); i += step_width) { - memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); - } - } - - CopyValidData(pad_tensor, - &seq_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kSeqToPad, - layout); - } -}; - template class PaddingLoDTensorFunctor { public: @@ -221,42 +154,6 @@ class PaddingLoDTensorFunctor { } }; -template -class UnpaddingLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& pad_tensor, - framework::LoDTensor* seq_tensor, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; - const auto& seq_tensor_dims = seq_tensor->dims(); - const auto& pad_tensor_dims = pad_tensor.dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor->numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - - CopyValidData(seq_tensor, - &pad_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kPadToSeq, - layout); - } -}; - template class UnpaddingLoDTensorFunctor { public: @@ -293,16 +190,6 @@ class UnpaddingLoDTensorFunctor { } }; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; - -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; - template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc 
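The removed CPU PaddingLoDTensorFunctor (a duplicate of the phi::CPUContext version that stays) pads variable-length sequences into a dense batch: it derives the pad length from the LoD offsets, fills the whole buffer with the pad value, then copies each sequence's valid rows. A simplified stand-in on flat buffers, not the LoDTensor API, assuming a [batch, max_len, width] layout:

```cpp
// Pad a concatenated batch of sequences described by absolute offsets into a
// dense buffer: fill with pad_value first, then copy valid rows per sequence.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> pad_sequences(const std::vector<float>& data,             // concatenated rows
                                 const std::vector<std::size_t>& offsets,    // size = batch + 1
                                 std::size_t width, float pad_value) {
  const std::size_t batch = offsets.size() - 1;
  std::size_t max_len = 0;
  for (std::size_t b = 0; b < batch; ++b)
    max_len = std::max(max_len, offsets[b + 1] - offsets[b]);

  std::vector<float> padded(batch * max_len * width, pad_value);  // pad value everywhere first
  for (std::size_t b = 0; b < batch; ++b) {
    const std::size_t len = offsets[b + 1] - offsets[b];
    for (std::size_t t = 0; t < len; ++t)
      for (std::size_t w = 0; w < width; ++w)
        padded[(b * max_len + t) * width + w] = data[(offsets[b] + t) * width + w];
  }
  return padded;
}

int main() {
  // Two sequences of lengths 2 and 3, width 1, padded to length 3.
  std::vector<float> data = {1, 2, 10, 20, 30};
  for (float v : pad_sequences(data, {0, 2, 5}, 1, 0.0f)) std::cout << v << ' ';
  std::cout << '\n';  // 1 2 0 10 20 30
}
```

The unpadding functor kept below runs the copy in the opposite direction, gathering only the valid rows back out of the padded buffer.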
b/paddle/fluid/operators/math/sequence_padding_test.cc index 3b30f9358a3a0..06eca480ec622 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -101,18 +101,16 @@ void TestSequencePadding(const DeviceContext &context, TEST(Seq2BatchPadding, CPU) { auto place = paddle::platform::CPUPlace(); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePadding( - *context, lod1, 16); + TestSequencePadding(*context, lod1, 16); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePadding( - *context, lod2, 128); + TestSequencePadding(*context, lod2, 128); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 69675f5e9219a..a600c37a89108 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -38,7 +38,7 @@ using EigenMatrix = framework::EigenMatrix; template class MaxSeqPoolFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output, @@ -117,7 +117,7 @@ class MaxSeqPoolFunctor { template class MaxSeqPoolFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output, @@ -178,7 +178,7 @@ class MaxSeqPoolFunctor { template class MaxSeqPoolGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -224,7 +224,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -241,7 +241,7 @@ class MaxSeqPoolGradFunctor { template class LastSeqPoolFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output) { @@ -275,7 +275,7 @@ class LastSeqPoolFunctor { template class FirstSeqPoolFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output) { @@ -309,7 +309,7 @@ class FirstSeqPoolFunctor { template class SumSeqPoolGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad) { auto lod_level = in_grad->lod().size(); @@ -328,7 +328,7 @@ class SumSeqPoolGradFunctor { out_w)); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->mutable_data(context.GetPlace()); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); if (h == 0) 
continue; @@ -343,10 +343,10 @@ class SumSeqPoolGradFunctor { }; template -class SequencePoolFunctor { +class SequencePoolFunctor { public: /* max pool has index output */ - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const std::string pooltype, T pad_value, const framework::LoDTensor& input, @@ -435,9 +435,9 @@ class SequencePoolFunctor { }; template -class SequencePoolGradFunctor { +class SequencePoolGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const std::string pooltype, const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, @@ -451,7 +451,7 @@ class SequencePoolGradFunctor { if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; functor(context, in_grad, 0); } @@ -495,10 +495,10 @@ class SequencePoolGradFunctor { } }; -template class SequencePoolFunctor; -template class SequencePoolFunctor; -template class SequencePoolGradFunctor; -template class SequencePoolGradFunctor; +template class SequencePoolFunctor; +template class SequencePoolFunctor; +template class SequencePoolGradFunctor; +template class SequencePoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index ec82a2439a6c3..63d922b7ebb80 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -117,18 +117,16 @@ void TestSequencePoolingSum(const DeviceContext &context, TEST(SequencePoolingGrad, CPU_SUM) { auto place = paddle::platform::CPUPlace(); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum( - *context, lod1, 128); + TestSequencePoolingSum(*context, lod1, 128); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum( - *context, lod2, 128); + TestSequencePoolingSum(*context, lod2, 128); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index cd91b2eb53149..8faf9572bef0d 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -24,29 +24,6 @@ namespace paddle { namespace operators { namespace math { -template -class ScaleLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const T* scales, - framework::LoDTensor* seq) { - const size_t level = 0; - auto lod = seq->lod(); - const size_t num_seq = lod[level].size() - 1; - size_t seq_width = seq->dims()[1]; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - T* seq_data = seq->mutable_data(context.GetPlace()); - for (size_t i = 0; i < num_seq; ++i) { - for (size_t j = lod[level][i] * seq_width; - j < lod[level][i + 1] * seq_width; - ++j) { - seq_data[j] *= scales[i]; - } - } - } -}; - template class ScaleLoDTensorFunctor { public: @@ -70,9 +47,6 @@ class ScaleLoDTensorFunctor { } }; -template class ScaleLoDTensorFunctor; -template class ScaleLoDTensorFunctor; - template class ScaleLoDTensorFunctor; template class ScaleLoDTensorFunctor; diff --git 
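The sequence pooling functors retargeted above all follow the same loop: walk the LoD offsets and reduce each [offsets[i], offsets[i+1]) slice of input rows into one output row, writing the pad value for empty sequences. A sum-pooling sketch on flat buffers (not the LoDTensor interface):

```cpp
// Sum-pool each sequence segment of a concatenated [rows, width] buffer into
// one output row per sequence; empty sequences produce a row of pad_value.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> sum_seq_pool(const std::vector<float>& data,            // concatenated rows
                                const std::vector<std::size_t>& offsets,   // size = num_seq + 1
                                std::size_t width, float pad_value) {
  const std::size_t num_seq = offsets.size() - 1;
  std::vector<float> out(num_seq * width, 0.0f);
  for (std::size_t i = 0; i < num_seq; ++i) {
    if (offsets[i + 1] == offsets[i]) {  // empty sequence -> pad value
      std::fill(out.begin() + i * width, out.begin() + (i + 1) * width, pad_value);
      continue;
    }
    for (std::size_t r = offsets[i]; r < offsets[i + 1]; ++r)
      for (std::size_t w = 0; w < width; ++w)
        out[i * width + w] += data[r * width + w];
  }
  return out;
}

int main() {
  // Two sequences ({1,2} and {3,4,5}) of width 1: sums are 3 and 12.
  std::vector<float> data = {1, 2, 3, 4, 5};
  for (float v : sum_seq_pool(data, {0, 2, 5}, 1, 0.0f)) std::cout << v << ' ';
  std::cout << '\n';
}
```

MAX, FIRST, and LAST pooling only change the per-segment reduction; the gradient functors replay the same offsets to spread the output gradient back over the segment.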
a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index adea86a6c5a87..730dcbf59a605 100644 --- a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -21,13 +21,6 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; - template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 93ae9cad7674e..18cd3e7261dd7 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -226,7 +226,7 @@ void SoftmaxFunctor::operator()( template using enable_if_CPU = typename std::enable_if< - std::is_same::value>::type; + std::is_same::value>::type; template class SoftmaxFunctor> { diff --git a/paddle/fluid/operators/math/squared_l2_norm.h b/paddle/fluid/operators/math/squared_l2_norm.h index ba584953a0d1e..3054d5f8f0029 100644 --- a/paddle/fluid/operators/math/squared_l2_norm.h +++ b/paddle/fluid/operators/math/squared_l2_norm.h @@ -34,7 +34,7 @@ namespace operators { namespace math { template -void SquaredL2Norm(const platform::CPUDeviceContext& ctx, +void SquaredL2Norm(const phi::CPUContext& ctx, const T1* x, T2* y, size_t numel, diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index fae1122fa0596..70f377e42e59f 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -84,9 +84,9 @@ void Tree2ColUtil::construct_tree(const framework::Tensor &EdgeSet, } template -class Tree2ColFunctor { +class Tree2ColFunctor { public: - void operator()(const platform::CPUDeviceContext &context, + void operator()(const phi::CPUContext &context, const framework::Tensor &EdgeSet, const framework::Tensor &node_features, framework::Tensor *patch, @@ -94,7 +94,7 @@ class Tree2ColFunctor { std::vector> tr; const auto &feature_dims = node_features.dims(); auto cpu_place = context.GetPlace(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); size_t node_count = 0, patch_count = 0, patch_size; @@ -138,9 +138,9 @@ class Tree2ColFunctor { } }; template -class Col2TreeFunctor { +class Col2TreeFunctor { public: - void operator()(const platform::CPUDeviceContext &context, + void operator()(const phi::CPUContext &context, const framework::Tensor &EdgeSet, const framework::Tensor &out_grad, framework::Tensor *in_grad, @@ -148,7 +148,7 @@ class Col2TreeFunctor { std::vector> tr; const auto &output_dims = out_grad.dims(); auto cpu_place = context.GetPlace(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; int64_t output_size = output_dims[1]; size_t grad_elem_size = 3 * static_cast(output_size); size_t node_count = 0, grad_count = 0; @@ -195,10 +195,10 @@ class Col2TreeFunctor { } }; -template class Tree2ColFunctor; -template class Tree2ColFunctor; -template class Col2TreeFunctor; -template class Col2TreeFunctor; +template class Tree2ColFunctor; +template class Tree2ColFunctor; +template class Col2TreeFunctor; +template class Col2TreeFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cc 
b/paddle/fluid/operators/math/unpooling.cc index c834a03f9731b..d119e814585b5 100644 --- a/paddle/fluid/operators/math/unpooling.cc +++ b/paddle/fluid/operators/math/unpooling.cc @@ -18,9 +18,9 @@ namespace paddle { namespace operators { namespace math { template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -61,9 +61,9 @@ class Unpool2dMaxFunctor { } }; template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -107,9 +107,9 @@ class Unpool2dMaxGradFunctor { }; template -class Unpool3dMaxFunctor { +class Unpool3dMaxFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -153,9 +153,9 @@ class Unpool3dMaxFunctor { } }; template -class Unpool3dMaxGradFunctor { +class Unpool3dMaxGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -201,14 +201,14 @@ class Unpool3dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxFunctor; -template class Unpool3dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 7b687909306c0..36ce3e6474254 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -16,12 +16,6 @@ limitations under the License. 
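// The hunks in the sequence_*, softmax, tree2col, unpooling and vol2col files
// all apply the same mechanical change: the math functors keep their
// DeviceContext template parameter, the platform::CPUDeviceContext
// specializations are deleted, and only the phi::CPUContext specializations
// remain. A minimal sketch of that pattern, assuming a hypothetical
// ExampleScaleFunctor (the name is illustrative and not part of the patch):
#include <cstddef>
#include "paddle/phi/backends/cpu/cpu_context.h"

namespace example {

template <typename DeviceContext, typename T>
class ExampleScaleFunctor {
 public:
  void operator()(const DeviceContext& /*context*/, T scale, T* data,
                  std::size_t n) const {
    // CPU path: a plain loop; the context type is the only thing the
    // migration touches.
    for (std::size_t i = 0; i < n; ++i) {
      data[i] *= scale;
    }
  }
};

// Before this patch each of these instantiations had a
// platform::CPUDeviceContext twin; afterwards only the phi::CPUContext ones
// are kept.
template class ExampleScaleFunctor<phi::CPUContext, float>;
template class ExampleScaleFunctor<phi::CPUContext, double>;

}  // namespace example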
*/ #include "paddle/phi/backends/cpu/cpu_context.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -32,126 +26,6 @@ namespace math { * [input_channels, filter_depth, filter_height, filter_width, * output_depth, output_height, output_width] */ -template -class Vol2ColFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& vol, - const std::vector& dilations, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* col, - const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol.dims().size(), - 4, - platform::errors::InvalidArgument( - "The dimension of vol should be 4, but received %d.", - vol.dims().size())); - - PADDLE_ENFORCE_EQ(col->dims().size(), - 7, - platform::errors::InvalidArgument( - "The dimension of col should be 7, but received %d.", - col->dims().size())); - - int input_channels = - (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); - int input_depth = - (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); - int input_height = - (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); - int input_width = - (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); - int filter_depth = col->dims()[1]; - int filter_height = col->dims()[2]; - int filter_width = col->dims()[3]; - int output_depth = col->dims()[4]; - int output_height = col->dims()[5]; - int output_width = col->dims()[6]; - int channels_col = - input_channels * filter_depth * filter_height * filter_width; - - // changed - bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; - int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; - int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; - int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; - int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; - int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1; - PADDLE_ENFORCE_EQ( - input_depth_tmp, - output_depth, - platform::errors::InvalidArgument( - "input_depth(%d) and output_depth(%d) are mismatching.", - input_depth_tmp, - output_depth)); - auto input_height_tmp = (input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1; - PADDLE_ENFORCE_EQ( - input_height_tmp, - output_height, - platform::errors::InvalidArgument( - "input_height(%d) and output_height(%d) are mismatching.", - input_height_tmp, - output_height)); - auto input_width_tmp = (input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1; - PADDLE_ENFORCE_EQ( - input_width_tmp, - output_width, - platform::errors::InvalidArgument( - "input_width(%d) and output_width(%d) are mismatching.", - input_width_tmp, - output_width)); - const T* vol_data = vol.data(); - T* col_data = col->data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int d_offset = (c / filter_width / filter_height) % filter_depth; - int c_in = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; - - int col_idx = - ((c * output_depth + d) * output_height + h) * output_width + w; - int vol_idx; - if (data_layout != DataLayout::kNHWC) { - vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * - input_width + - w_pad; - } else { - vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * - input_channels + - c_in; - } - col_data[col_idx] = - (h_pad < 0 || h_pad >= input_height || w_pad < 0 || - w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) - ? static_cast(0) - : vol_data[vol_idx]; - } - } - } - } - } -}; - template class Vol2ColFunctor { public: @@ -278,126 +152,6 @@ class Vol2ColFunctor { * [input_channels, filter_depth, filter_height, filter_width, * output_depth, output_height, output_width] */ -template -class Col2VolFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, - const std::vector& dilations, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* vol, - const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol->dims().size(), - 4, - platform::errors::InvalidArgument( - "The dimension of vol should be 4, but received %d.", - vol->dims().size())); - - PADDLE_ENFORCE_EQ(col.dims().size(), - 7, - platform::errors::InvalidArgument( - "The dimension of col should be 7, but received %d.", - col.dims().size())); - - int input_channels = - (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); - int input_depth = - (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); - int input_height = - (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]); - int input_width = - (data_layout != DataLayout::kNHWC ? 
vol->dims()[3] : vol->dims()[2]); - int filter_depth = col.dims()[1]; - int filter_height = col.dims()[2]; - int filter_width = col.dims()[3]; - int output_depth = col.dims()[4]; - int output_height = col.dims()[5]; - int output_width = col.dims()[6]; - int channels_col = - input_channels * filter_depth * filter_height * filter_width; - - bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; - int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; - int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; - int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; - int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; - int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; - - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1; - PADDLE_ENFORCE_EQ( - input_depth_tmp, - output_depth, - platform::errors::InvalidArgument( - "input_depth(%d) and output_depth(%d) are mismatching.", - input_depth_tmp, - output_depth)); - auto input_height_tmp = (input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1; - PADDLE_ENFORCE_EQ( - input_height_tmp, - output_height, - platform::errors::InvalidArgument( - "input_height(%d) and output_height(%d) are mismatching.", - input_height_tmp, - output_height)); - auto input_width_tmp = (input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1; - PADDLE_ENFORCE_EQ( - input_width_tmp, - output_width, - platform::errors::InvalidArgument( - "input_width(%d) and output_width(%d) are mismatching.", - input_width_tmp, - output_width)); - T* vol_data = vol->data(); - const T* col_data = col.data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int d_offset = (c / filter_width / filter_height) % filter_depth; - int cIm = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; - - if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && - w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { - int vol_idx; - if (data_layout != DataLayout::kNHWC) { - vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * - input_width + - w_pad; - } else { - vol_idx = - ((d_pad * input_height + h_pad) * input_width + w_pad) * - input_channels + - cIm; - } - int col_idx = - ((c * output_depth + d) * output_height + h) * output_width + - w; - vol_data[vol_idx] += col_data[col_idx]; - } - } - } - } - } - } -}; - template class Col2VolFunctor { public: @@ -518,13 +272,9 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; template class Vol2ColFunctor; template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; template class Col2VolFunctor; template class Col2VolFunctor; diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index f2c5fa88fda60..ec3926b95ee87 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ 
b/paddle/fluid/operators/math/vol2col_test.cc @@ -254,7 +254,7 @@ void testVol2col(); + testVol2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) testVol2col(); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 13df41852dd5a..c79073861ab6e 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -1041,19 +1041,16 @@ REGISTER_OPERATOR(matmul_grad, ops::MatMulOpDoubleGradMaker, ops::MatMulOpDoubleGradMaker); REGISTER_OPERATOR(matmul_grad_grad, ops::MatMulOpDoubleGrad); -REGISTER_OP_CPU_KERNEL( - matmul, - ops::MatMulKernel, - ops::MatMulKernel); -REGISTER_OP_CPU_KERNEL( - matmul_grad, - ops::MatMulGradKernel, - ops::MatMulGradKernel); - -REGISTER_OP_CPU_KERNEL( - matmul_grad_grad, - ops::MatMulDoubleGradKernel, - ops::MatMulDoubleGradKernel); +REGISTER_OP_CPU_KERNEL(matmul, + ops::MatMulKernel, + ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL(matmul_grad, + ops::MatMulGradKernel, + ops::MatMulGradKernel); + +REGISTER_OP_CPU_KERNEL(matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index 945546a502aeb..efad516cdbfe5 100644 --- a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -20,276 +20,40 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/xpu_api_wrapper.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { using framework::Tensor; -static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) { - if (x_dim.size() > 1) { - return x_dim; - } - return phi::make_ddim({1, x_dim[0]}); -} - -static framework::Tensor FoldInitDims(const framework::Tensor &input) { - auto output = input; - auto in_dims = input.dims(); - if (in_dims.size() == 3) { - output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); - } - return output; -} -/** - * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the - * original y_dim is returned. - */ -static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { - if (y_dim.size() > 1) { - return y_dim; - } - return phi::make_ddim({y_dim[0], 1}); -} - -static void ReshapeTensorIntoMatrixSequence( - framework::Tensor *x, const phi::funcs::MatDescriptor &descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} -/** - * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor - * Out = matmul(x, y) - * - * This method will first calculate X,Y matrix sequence, and then calculate - * the out shape. - * - * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] - * The out = [BatchSize, H1, W2] - * - * If there is no batch size in `X` and `Y`, the out will be [H1, W2] - * If any of `X` and `Y` has batch size BatchSize, the out will have the - * BatchSize. 
- */ -static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, - framework::Tensor *y, - framework::Tensor *out, - bool trans_x, - bool trans_y) { - auto x_dim = RowMatrixFromVector(x->dims()); - auto y_dim = ColumnMatrixFromVector(y->dims()); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, - mat_dim_y.width_}); - } - - ReshapeTensorIntoMatrixSequence(x, mat_dim_x); - ReshapeTensorIntoMatrixSequence(y, mat_dim_y); -} - -template -static void MatMulXPUFunction(const Tensor *x, - const Tensor *y, - Tensor *out, - bool trans_x, - bool trans_y, - const paddle::framework::ExecutionContext &ctx) { - using XPUType = typename XPUTypeTrait::Type; - const auto &x_dims = x->dims(); - const auto &y_dims = y->dims(); - auto &dev_ctx = - ctx.template device_context(); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dims), 0, trans_x); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( - ColumnMatrixFromVector(y_dims), 0, trans_y); - - if (x_dims.size() == 3 && y_dims.size() <= 2) { - // if transpose_X is true, the transpose cost much time - if (!trans_x) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } else { - mat_dim_b.batch_size_ = mat_dim_a.batch_size_; - mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; - } - } - - if (mat_dim_a.width_ == mat_dim_b.height_) { - if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) { - mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; - } - if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) { - mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; - } - } - - PADDLE_ENFORCE_EQ(mat_dim_a.width_, - mat_dim_b.height_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_op, the " - "first tensor width must be same as " - "second tensor height, but received " - "width:%d, height:%d x_dims = %s , y_dims = %s", - mat_dim_a.width_, - mat_dim_b.height_, - x_dims.to_str().c_str(), - y_dims.to_str().c_str())); - PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, - mat_dim_b.batch_size_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_op, the two input" - "tensor batch_size must be same, but received first " - "tensor batch_size:%d, second " - "tensor batch_size:%d, x_dims = %s , y_dims = %s", - mat_dim_a.batch_size_, - mat_dim_b.batch_size_, - x_dims.to_str().c_str(), - y_dims.to_str().c_str())); - - float alpha = static_cast(ctx.Attr("alpha")); - T *data_c = out->data(); - int m = mat_dim_a.height_; - int n = mat_dim_b.width_; - int k = mat_dim_a.width_; - int batch_size = mat_dim_a.batch_size_; - int ldx = mat_dim_a.trans_ ? m : k; - int ldy = mat_dim_b.trans_ ? 
k : n; - int ldout = n; - if (batch_size <= 1) { - int r = 0; - r = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(x->data()), - reinterpret_cast(y->data()), - reinterpret_cast(data_c), - m, - n, - k, - mat_dim_a.trans_, - mat_dim_b.trans_, - nullptr, - nullptr, - nullptr, - ldx, - ldy, - ldout, - alpha, - 0, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU fc kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); - } else { - // batch matmul - int r = xpu::fc_batched( - dev_ctx.x_context(), // Context* ctx, - batch_size, // int batch_size, - mat_dim_a.trans_, // bool x_trans, - mat_dim_b.trans_, // bool w_trans, - m, // int m, - n, // int n, - k, // int k, - alpha, // float alpha, - reinterpret_cast(x->data()), // const TX* x, - mat_dim_a.stride_, // int stride_a, - reinterpret_cast(y->data()), // const TW* w, - mat_dim_b.stride_, // int stride_b, - 0.0, // float beta, - reinterpret_cast(data_c), // TY* y, - m * n, // int stride_c, - nullptr, // const float* x_maxptr, - nullptr); // const float* w_maxptr - - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU fc_batched kernel return wrong value[%d %s] " - "x_dims = %s , y_dims = %s", - r, - XPUAPIErrorMsg[r], - x_dims.to_str().c_str(), - y_dims.to_str().c_str())); - } -} - template class MatMulXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *y = context.Input("Y"); - auto *out = context.Output("Out"); + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); bool trans_x = context.Attr("transpose_X"); bool trans_y = context.Attr("transpose_Y"); - if (std::is_same::value) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } else { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } - } + float alpha = static_cast(context.Attr("alpha")); + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + XPUType* out_ptr = reinterpret_cast(out->data()); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + XpuFcInfo fc_info; + GetFCInfo(x_dims, y_dims, trans_x, trans_y, &fc_info); + auto& dev_ctx = + context.template device_context(); + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + MatMulXPUFunction(xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, alpha); } }; -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. 
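// A sketch of the refactored XPU forward path used by MatMulXPUKernel above,
// assuming the helpers declared in xpu_api_wrapper.h (XpuFcInfo, GetFCInfo,
// MatMulXPUFunction) and the XPUTypeTrait mapping behave as the call sites in
// this patch suggest. ComputeMatmulForwardSketch is a hypothetical free
// function written only to make the call order explicit.
template <typename T>
void ComputeMatmulForwardSketch(xpu::Context* xpu_ctx,
                                const framework::DDim& x_dims,
                                const framework::DDim& y_dims,
                                const T* x_data,
                                const T* y_data,
                                T* out_data,
                                bool trans_x,
                                bool trans_y,
                                float alpha) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  // Shape handling (vector promotion, batching, transposes) is folded into a
  // single descriptor, replacing the removed RowMatrixFromVector /
  // ColumnMatrixFromVector helpers and the hand-rolled fc_batched branch.
  XpuFcInfo fc_info;
  GetFCInfo(x_dims, y_dims, trans_x, trans_y, &fc_info);
  // One wrapped FC call replaces the old per-dtype dispatch that keyed on the
  // XPU_PADDLE_FC_INT32 / XPU_PADDLE_FC_LOCAL_INT16 environment variables.
  MatMulXPUFunction<XPUType>(xpu_ctx,
                             reinterpret_cast<const XPUType*>(x_data),
                             reinterpret_cast<const XPUType*>(y_data),
                             reinterpret_cast<XPUType*>(out_data),
                             fc_info,
                             alpha);
}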
-template -static framework::Tensor XPUFoldHeadAndLastDims( - const DeviceContext &context, const framework::Tensor &input) { - using XPUType = typename XPUTypeTrait::Type; - auto in_dims = input.dims(); - if (in_dims.size() != 3) { - return input; - } - - framework::Tensor output; - output.Resize({in_dims[1], in_dims[0], in_dims[2]}); - output.mutable_data(context.GetPlace()); - std::vector in_shape_host = {static_cast(in_dims[0]), - static_cast(in_dims[1]), - static_cast(in_dims[2])}; - std::vector axis_host = {1, 0, 2}; - int r = xpu::transpose(context.x_context(), - reinterpret_cast(input.data()), - reinterpret_cast(output.data()), - in_shape_host, - axis_host); - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU transpose kernel return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); - - return output; -} - // Using dimensional constraints on matrix multiplication, it is // straight-forward to check the following table for when X and Y // are both matrices. @@ -317,107 +81,68 @@ static framework::Tensor XPUFoldHeadAndLastDims( // to X: (P * M) x K, dOut: (P * M) x N. template class MatMulGradXPUKernel : public framework::OpKernel { - public: - void MatMul(const framework::ExecutionContext &context, - const framework::Tensor &a, - bool trans_a, - const framework::Tensor &b, - bool trans_b, - framework::Tensor *out) const { - out->mutable_data(context.GetPlace()); - if (std::is_same::value) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } else { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } - } - } - - void CalcInputGrad(const framework::ExecutionContext &context, - const framework::Tensor &a, - bool trans_a, - bool is_fold_init_dims_a, - const framework::Tensor &b, - bool trans_b, - bool is_fold_init_dims_b, - framework::Tensor *out) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, out); - } else { - auto &dev_ctx = context.template device_context(); - MatMul(context, - is_fold_init_dims_a - ? FoldInitDims(a) - : XPUFoldHeadAndLastDims(dev_ctx, a), - trans_a, - is_fold_init_dims_b - ? 
FoldInitDims(b) - : XPUFoldHeadAndLastDims(dev_ctx, b), - trans_b, - out); - } - } + using XPUType = typename XPUTypeTrait::Type; - void Compute(const framework::ExecutionContext &context) const override { + public: + void Compute(const framework::ExecutionContext& context) const override { auto x = *context.Input("X"); auto y = *context.Input("Y"); auto dout = *context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dy = context.Output(framework::GradVarName("Y")); + auto* dx = context.Output(framework::GradVarName("X")); + auto* dy = context.Output(framework::GradVarName("Y")); bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); - - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - framework::DDim dx_dims; + float alpha = static_cast(context.Attr("alpha")); if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } + dx->mutable_data(context.GetPlace()); } - - framework::DDim dy_dims; if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - if (transpose_x && transpose_y) { - CalcInputGrad(context, y, true, true, dout, true, false, dx); - CalcInputGrad(context, dout, true, true, x, true, false, dy); - } else if (transpose_x) { - CalcInputGrad(context, y, false, false, dout, true, false, dx); - CalcInputGrad(context, x, false, false, dout, false, true, dy); - } else if (transpose_y) { - CalcInputGrad(context, dout, false, false, y, false, true, dx); - CalcInputGrad(context, dout, true, true, x, false, true, dy); - } else { - CalcInputGrad(context, dout, false, false, y, true, false, dx); - CalcInputGrad(context, x, true, true, dout, false, true, dy); + dy->mutable_data(context.GetPlace()); } - + auto& dev_ctx = + context.template device_context(); + + const XPUType* dout_ptr = reinterpret_cast(dout.data()); + const XPUType* x_ptr = reinterpret_cast(x.data()); + const XPUType* y_ptr = reinterpret_cast(y.data()); + + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + XpuFcInfo info_forward; + GetFCInfo(x.dims(), y.dims(), transpose_x, transpose_y, &info_forward); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + // begin calculate + const XPUType* a_1 = reinterpret_cast(NULL); + const XPUType* b_1 = reinterpret_cast(NULL); + const XPUType* a_2 = reinterpret_cast(NULL); + const XPUType* b_2 = reinterpret_cast(NULL); + XPUType* c_1 = (dx == NULL) ? reinterpret_cast(NULL) + : reinterpret_cast(dx->data()); + XPUType* c_2 = (dy == NULL) ? reinterpret_cast(NULL) + : reinterpret_cast(dy->data()); + XpuFcInfo info_dx; + XpuFcInfo info_dy; + std::tuple + fc_info = MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + transpose_x, + transpose_y, + x_ptr, + y_ptr, + dout_ptr); + std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info; if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } + MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, alpha); } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } + MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_dy, alpha); } } }; diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 36267b9f9a391..8e436dd6afbfb 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -21,7 +21,6 @@ limitations under the License. 
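// Fragment-level sketch of the shared backward path used by MatMulGradXPUKernel
// above (and by MatMulV2XPUGradKernel later in this patch, with alpha fixed to
// 1.0f), assuming MatmulGradFcInfo returns a std::tuple<XpuFcInfo, XpuFcInfo,
// const XPUType*, const XPUType*, const XPUType*, const XPUType*>; the element
// types are inferred from the std::tie in the kernel and are an assumption
// here. Variables such as xpu_ctx, RAII_GUARD, info_forward, x_ptr, y_ptr,
// dout_ptr, c_1 and c_2 are the ones already prepared in the kernel body.
XpuFcInfo info_dx;
XpuFcInfo info_dy;
const XPUType* a_1 = nullptr;
const XPUType* b_1 = nullptr;
const XPUType* a_2 = nullptr;
const XPUType* b_2 = nullptr;
std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) =
    MatmulGradFcInfo(xpu_ctx, &RAII_GUARD, info_forward,
                     transpose_x, transpose_y, x_ptr, y_ptr, dout_ptr);
// Each gradient is then a single FC launch; the removed path instead resized
// dx/dy, reshaped the operands and dispatched through CalcInputGrad and
// XPUFoldHeadAndLastDims.
if (dx) {
  MatMulXPUFunction<XPUType>(xpu_ctx, a_1, b_1, c_1, info_dx, alpha);
}
if (dy) {
  MatMulXPUFunction<XPUType>(xpu_ctx, a_2, b_2, c_2, info_dy, alpha);
}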
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dot_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index e9cb665f4fc0e..7b4195c1c19fa 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -16,146 +16,17 @@ #include #include - #include "paddle/fluid/operators/matmul_v2_op.h" + #include "paddle/fluid/operators/xpu_api_wrapper.h" namespace paddle { namespace operators { -template -static void MatMulXPUFunction(const Tensor* x, - const Tensor* y, - Tensor* out, - bool trans_x, - bool trans_y, - const paddle::framework::ExecutionContext& ctx) { - using XPUType = typename XPUTypeTrait::Type; - const auto& x_dims = x->dims(); - const auto& y_dims = y->dims(); - auto& dev_ctx = - ctx.template device_context(); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dims), 0, trans_x); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( - ColumnMatrixFromVector(y_dims), 0, trans_y); - - if (x_dims.size() >= 3 && y_dims.size() <= 2) { - // if transpose_X is true, the transpose cost much time - if (!trans_x) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } else { - mat_dim_b.batch_size_ = mat_dim_a.batch_size_; - mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; - } - } - - if (mat_dim_a.width_ == mat_dim_b.height_) { - if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) { - mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; - } - if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) { - mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; - } - } - - PADDLE_ENFORCE_EQ(mat_dim_a.width_, - mat_dim_b.height_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_v2_op xdims = %s ydims = %s " - "x_trans = %d y_trans = %d", - x_dims.to_str(), - y_dims.to_str(), - mat_dim_a.trans_, - mat_dim_b.trans_)); - PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, - mat_dim_b.batch_size_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_v2_op xdims = %s ydims = %s " - "x_trans = %d y_trans = %d", - x_dims.to_str(), - y_dims.to_str(), - mat_dim_a.trans_, - mat_dim_b.trans_)); - - T* data_c = out->data(); - int m = mat_dim_a.height_; - int n = mat_dim_b.width_; - int k = mat_dim_a.width_; - int batch_size = mat_dim_a.batch_size_; - int ldx = mat_dim_a.trans_ ? m : k; - int ldy = mat_dim_b.trans_ ? 
k : n; - int ldout = n; - if (batch_size <= 1) { - int r = 0; - r = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(x->data()), - reinterpret_cast(y->data()), - reinterpret_cast(data_c), - m, - n, - k, - mat_dim_a.trans_, - mat_dim_b.trans_, - nullptr, - nullptr, - nullptr, - ldx, - ldy, - ldout, - 1.0, - 0, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU fc kernel return wrong value[%d %s] , m = %d, n = " - "%d, " - "k = %d, a_tr = %d, b_tr = %d", - r, - XPUAPIErrorMsg[r], - m, - n, - k, - mat_dim_a.trans_, - mat_dim_b.trans_)); - } else { - // batch matmul - int r = xpu::fc_batched( - dev_ctx.x_context(), // Context* ctx, - batch_size, // int batch_size, - mat_dim_a.trans_, // bool x_trans, - mat_dim_b.trans_, // bool w_trans, - m, // int m, - n, // int n, - k, // int k, - 1.0, // float alpha, - reinterpret_cast(x->data()), // const TX* x, - mat_dim_a.stride_, // int stride_a, - reinterpret_cast(y->data()), // const TW* w, - mat_dim_b.stride_, // int stride_b, - 0.0, // float beta, - reinterpret_cast(data_c), // TY* y, - m * n, // int stride_c, - nullptr, // const float* x_maxptr, - nullptr); // const float* w_maxptr - - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU fc_batched kernel return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - } -} - template class MatMulV2XPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); @@ -164,160 +35,84 @@ class MatMulV2XPUKernel : public framework::OpKernel { bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); out->mutable_data(ctx.GetPlace()); - if (std::is_same::value) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } - } + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + XPUType* out_ptr = reinterpret_cast(out->data()); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + XpuFcInfo fc_info; + GetFCInfo(x_dims, y_dims, trans_x, trans_y, &fc_info); + auto& dev_ctx = + ctx.template device_context(); + xpu::Context* xpu_ctx = dev_ctx.x_context(); + MatMulXPUFunction(xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, 1.0f); } }; -template -static framework::Tensor XPUFoldHeadAndLastDims( - const DeviceContext& context, const framework::Tensor& input) { - using XPUType = typename XPUTypeTrait::Type; - auto in_dims = input.dims(); - if (in_dims.size() != 3) { - return input; - } - - framework::Tensor output; - output.Resize({in_dims[1], in_dims[0], in_dims[2]}); - output.mutable_data(context.GetPlace()); - std::vector in_shape_host = {static_cast(in_dims[0]), - static_cast(in_dims[1]), - static_cast(in_dims[2])}; - std::vector axis_host = {1, 0, 2}; - - int r = xpu::transpose(context.x_context(), - reinterpret_cast(input.data()), - reinterpret_cast(output.data()), - in_shape_host, - axis_host); - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU transpose kernel return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); - - return output; 
-} - template class MatMulV2XPUGradKernel : public framework::OpKernel { - public: - void MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& a, - bool trans_a, - const framework::Tensor& b, - bool trans_b, - framework::Tensor* out) const { - out->mutable_data(ctx.GetPlace()); - if (std::is_same::value) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } - } - } - - void CalcInputGrad(const framework::ExecutionContext& context, - const framework::Tensor& a, - bool trans_a, - bool is_fold_init_dims_a, - const framework::Tensor& b, - bool trans_b, - bool is_fold_init_dims_b, - framework::Tensor* out) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, out); - } else { - auto& dev_ctx = - context.template device_context(); - MatMul( - context, - is_fold_init_dims_a - ? FoldInitDims(a) - : XPUFoldHeadAndLastDims( - dev_ctx, a), - trans_a, - is_fold_init_dims_b - ? FoldInitDims(b) - : XPUFoldHeadAndLastDims( - dev_ctx, b), - trans_b, - out); - } - } + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { bool transpose_x = context.Attr("trans_x"); bool transpose_y = context.Attr("trans_y"); - auto x = *context.Input("X"); auto y = *context.Input("Y"); auto dout = *context.Input(framework::GradVarName("Out")); auto* dx = context.Output(framework::GradVarName("X")); auto* dy = context.Output(framework::GradVarName("Y")); - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - framework::DDim dx_dims; if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } + dx->mutable_data(context.GetPlace()); } - - framework::DDim dy_dims; if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - if (transpose_x && transpose_y) { - CalcInputGrad(context, y, true, true, dout, true, false, dx); - CalcInputGrad(context, dout, true, true, x, true, false, dy); - } else if (transpose_x) { - CalcInputGrad(context, y, false, false, dout, true, false, dx); - CalcInputGrad(context, x, false, false, dout, false, true, dy); - } else if (transpose_y) { - CalcInputGrad(context, dout, false, false, y, false, true, dx); - CalcInputGrad(context, dout, true, true, x, false, true, dy); - } else { - CalcInputGrad(context, dout, false, false, y, true, false, dx); - CalcInputGrad(context, x, true, true, dout, false, true, dy); + dy->mutable_data(context.GetPlace()); } - + auto& dev_ctx = + context.template device_context(); + + const XPUType* dout_ptr = reinterpret_cast(dout.data()); + const XPUType* x_ptr = reinterpret_cast(x.data()); + const XPUType* y_ptr = reinterpret_cast(y.data()); + + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + XpuFcInfo info_forward; + GetFCInfo(x.dims(), y.dims(), transpose_x, transpose_y, &info_forward); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + // begin calculate + const XPUType* a_1 = reinterpret_cast(NULL); + const XPUType* b_1 = reinterpret_cast(NULL); + const XPUType* a_2 = reinterpret_cast(NULL); + const XPUType* b_2 = 
reinterpret_cast(NULL); + XPUType* c_1 = (dx == NULL) ? reinterpret_cast(NULL) + : reinterpret_cast(dx->data()); + XPUType* c_2 = (dy == NULL) ? reinterpret_cast(NULL) + : reinterpret_cast(dy->data()); + XpuFcInfo info_dx; + XpuFcInfo info_dy; + std::tuple + fc_info = MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + transpose_x, + transpose_y, + x_ptr, + y_ptr, + dout_ptr); + std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info; if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } + MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f); } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } + MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_dy, 1.0f); } } }; diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h index 88f8be63f26c8..b5ac6fd677bac 100644 --- a/paddle/fluid/operators/mean_iou_op.h +++ b/paddle/fluid/operators/mean_iou_op.h @@ -31,8 +31,8 @@ template class MeanIoUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context() - .eigen_device(); + auto& place = + *ctx.template device_context().eigen_device(); // get input and output tensor auto* predictions = ctx.Input("Predictions"); auto* labels = ctx.Input("Labels"); diff --git a/paddle/fluid/operators/memcpy_d2h_op.cc b/paddle/fluid/operators/memcpy_d2h_op.cc index 26ae22f8ea380..f3f15f000d50a 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.cc +++ b/paddle/fluid/operators/memcpy_d2h_op.cc @@ -233,3 +233,31 @@ REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_d2h, int16_t, ops::MemcpyD2HKernel); #endif + +#ifdef PADDLE_WITH_IPU +REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_d2h, + float, + ops::MemcpyD2HKernel, + double, + ops::MemcpyD2HKernel, + int8_t, + ops::MemcpyD2HKernel, + uint8_t, + ops::MemcpyD2HKernel, + int, + ops::MemcpyD2HKernel, + int64_t, + ops::MemcpyD2HKernel, + bool, + ops::MemcpyD2HKernel, + paddle::platform::bfloat16, + ops::MemcpyD2HKernel, + paddle::platform::complex, + ops::MemcpyD2HKernel, + paddle::platform::complex, + ops::MemcpyD2HKernel, + plat::float16, + ops::MemcpyD2HKernel, + int16_t, + ops::MemcpyD2HKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 411841c4502fa..ff7b786d04018 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -94,14 +94,14 @@ class MemcpyH2DOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(LoDTensor) The type of output " "is the same as input X."); - AddAttr( - "dst_place_type", - "Determine the dst place of tensor copy. " - "By Now it ONLY support CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace " - "Other place type is Unimplemented and will cause ERROR." - "0: dst is on CUDAPlace. " - "1: dst is on NPUPlace. " - "2: dst is on XPUPlace. "); + AddAttr("dst_place_type", + "Determine the dst place of tensor copy. " + "By Now it support:" + "0. CUDAPinnedPlace/CPU <->CUDAPlace" + "1. NPUPinnedPlace/CPU <-> NPUPlace" + "2. CPU <->XPUPlace" + "3. CPU <->IPUPlace" + "Other place type is Unimplemented and will cause ERROR."); AddComment(R"DOC( MemcpyD2H Operator. By now, it ONLY supports the memcopy between CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace. 
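// Sketch of the dst_place_type convention documented in the AddAttr string
// above, assuming the integer attribute keeps the 0..3 mapping it lists.
// MemcpyDst is a hypothetical enum introduced only to spell the mapping out;
// the operator itself keeps a plain int attribute.
enum class MemcpyDst : int {
  kCUDAPlace = 0,  // CUDAPinnedPlace/CPU <-> CUDAPlace
  kNPUPlace = 1,   // NPUPinnedPlace/CPU <-> NPUPlace
  kXPUPlace = 2,   // CPU <-> XPUPlace
  kIPUPlace = 3,   // CPU <-> IPUPlace
};

inline bool IsSupportedMemcpyDst(int dst_place_type) {
  // Mirrors the widened range check that MemcpyH2DFunctor applies below;
  // previously only the values 0, 1 and 2 were accepted.
  return dst_place_type >= 0 && dst_place_type <= 3;
}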
@@ -234,3 +234,31 @@ REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d, int16_t, ops::MemcpyH2DKernel); #endif + +#ifdef PADDLE_WITH_IPU +REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d, + float, + ops::MemcpyH2DKernel, + double, + ops::MemcpyH2DKernel, + int8_t, + ops::MemcpyH2DKernel, + uint8_t, + ops::MemcpyH2DKernel, + int, + ops::MemcpyH2DKernel, + int64_t, + ops::MemcpyH2DKernel, + bool, + ops::MemcpyH2DKernel, + paddle::platform::bfloat16, + ops::MemcpyH2DKernel, + paddle::platform::complex, + ops::MemcpyH2DKernel, + paddle::platform::complex, + ops::MemcpyH2DKernel, + plat::float16, + ops::MemcpyH2DKernel, + int16_t, + ops::MemcpyH2DKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 3fcc4b89eefe8..8cd84f4b59e8c 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -50,7 +50,7 @@ class MemcpyH2DFunctor { lod_tensor.dtype(), phi::Stream(reinterpret_cast(stream))); - if (dst_place_type_ == 0 || dst_place_type_ == 1 || dst_place_type_ == 2) { + if (dst_place_type_ >= 0 && dst_place_type_ <= 3) { framework::TensorCopy( lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); } else { diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc index fc1944b2ad6fb..ef89a730a0ff9 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.cc +++ b/paddle/fluid/operators/merge_selected_rows_op.cc @@ -100,7 +100,6 @@ REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOpMaker, ops::MergeSelectedRowsOpInferVarType); -REGISTER_OP_CPU_KERNEL( - merge_selected_rows, - ops::MergeSelectedRowsKernel, - ops::MergeSelectedRowsKernel); +REGISTER_OP_CPU_KERNEL(merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 5edc39f8fc7b8..1e369c81538ed 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -153,8 +153,7 @@ REGISTER_OPERATOR(minus, ops::MinusOpMaker, ops::MinusGradDescMaker, ops::MinusGradMaker); -REGISTER_OP_CPU_KERNEL( - minus, ops::MinusKernel); +REGISTER_OP_CPU_KERNEL(minus, ops::MinusKernel); REGISTER_OP_CUDA_KERNEL( minus, ops::MinusKernel); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index c9b4514995290..eb0d03ce00a97 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -20,13 +20,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index ee83ffffd9786..f41068dd5f1ae 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -19,13 +19,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 
cefa4fc1b995b..837d4357737a2 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -77,7 +77,24 @@ class ConcatMKLDNNHandler } auto dst_dims = phi::vectorize(output->dims()); - auto dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any); + + dnnl::memory::desc dst_md; + + // if concat is being used as a stack op (all source memories dims on + // concat_axis are equal to 1), then it may choose a non-optimal memory + // format tag for the destination, because the concat primitive is choosing it based + // on source memory descriptors and e.g. 200x1x10 can be described as both + // abc and bac and both would be using the exact same physical layout, but in + // that scenario bac will be chosen for the destination no matter which + // formats are being set in inputs. In that scenario we enforce using + // a dense format, because it is the most common one and should be the best + // in terms of performance + if (dst_dims[concat_axis] == static_cast(srcs_md.size())) { + dst_md = memory::desc( + dst_dims, dt, platform::GetPlainMKLDNNFormat(dst_dims.size())); + } else { + dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any); + } this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md); } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 17d4c2fad96b8..8ee97c281e3f4 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -553,10 +553,6 @@ class ConvMKLDNNHandlerT dnnl::primitive_attr conv_attr; dnnl::post_ops post_operations; - const std::string fuse_activation = - ctx.Attr("fuse_activation"); - const float fuse_alpha = ctx.Attr("fuse_alpha"); - const float fuse_beta = ctx.Attr("fuse_beta"); const bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); float sum_scale = 1.0f; @@ -587,19 +583,7 @@ class ConvMKLDNNHandlerT post_operations.append_sum(sum_scale); } - if (fuse_activation == "hard_sigmoid") { - post_operations.append_eltwise(activation_scale, - dnnl::algorithm::eltwise_linear, - fuse_alpha, - fuse_beta); - post_operations.append_eltwise( - activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); - } else if (fuse_activation != "") { - const auto activation_algorithm = - platform::AcquireActivationAlgorithm(fuse_activation); - post_operations.append_eltwise( - activation_scale, activation_algorithm, fuse_alpha, fuse_beta); - } + platform::AppendActivation(ctx, post_operations, activation_scale); conv_attr.set_post_ops(post_operations); return conv_attr; diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 7a297b3daefd7..cd81168753bed 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -12,7 +12,6 @@ See the License for the specific language governing permissions and limitations under the License.
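// The conv hunk above drops the per-op activation bookkeeping (the
// fuse_activation / fuse_alpha / fuse_beta attributes and the hard_sigmoid
// special case) in favor of one helper; the same helper reappears for the
// oneDNN matmul and softplus kernels later in this patch. A sketch of how a
// handler assembles its attributes now, assuming platform::AppendActivation
// reads the fuse attributes from the ExecutionContext and that the scale
// argument is optional (both assumptions are based only on the call sites in
// this diff). CreateConvAttrsSketch is a hypothetical name.
dnnl::primitive_attr CreateConvAttrsSketch(
    const framework::ExecutionContext& ctx,
    bool fuse_residual_conn,
    float sum_scale,
    float activation_scale) {
  dnnl::primitive_attr conv_attr;
  dnnl::post_ops post_operations;
  if (fuse_residual_conn) {
    post_operations.append_sum(sum_scale);
  }
  // A single call now covers hard_sigmoid (eltwise_linear followed by
  // eltwise_clip) and every other fused activation that the removed if/else
  // ladder enumerated explicitly.
  platform::AppendActivation(ctx, post_operations, activation_scale);
  conv_attr.set_post_ops(post_operations);
  return conv_attr;
}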
*/ -#include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 1755b0f208207..7404972ea7cca 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -21,13 +21,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { @@ -170,6 +163,9 @@ class FCPrimitiveFactory { // In case of 2 dims, we set the only possible format, nc if (dim_num == 2) { out->set_format(MKLDNNMemoryFormat::nc); + out->set_mem_desc({phi::vectorize(out->dims()), + platform::MKLDNNGetDataType(), + out->format()}); // In case of 3 dims, we generate a format that is based on number // of output dims and the layout of input format (nchw or nhwc). } else if (dim_num == 3) { @@ -185,9 +181,6 @@ class FCPrimitiveFactory { } else { out->set_format(in_format); } - out->set_mem_desc({phi::vectorize(out->dims()), - platform::MKLDNNGetDataType(), - out->format()}); } void UpdateDataPointers(const ExecutionContext& ctx, @@ -209,7 +202,7 @@ class FCPrimitiveFactory { const Tensor* bias, LoDTensor* output, const ExecutionContext& ctx) { - auto src_desc = CreateMemDescriptor(input, input->format()); + auto src_desc = CreateMemDescriptor(input, MKLDNNMemoryFormat::any); auto weight_dims = Get2DWeightDimsForDNNL(weights); auto weights_desc = CreateMemDescriptor(weight_dims, MKLDNNMemoryFormat::any); @@ -236,7 +229,8 @@ class FCPrimitiveFactory { auto input_dims = phi::vectorize(input->dims()); std::vector new_input_dims = { input_dims[0] * input_dims[1], input_dims[2], 1}; - auto src_desc = CreateMemDescriptor(new_input_dims, input->format()); + auto src_desc = + CreateMemDescriptor(new_input_dims, MKLDNNMemoryFormat::any); auto weight_dims = Get3DWeightDimsForDNNL(weights); auto weights_desc = @@ -267,7 +261,7 @@ class FCPrimitiveFactory { const Tensor* bias, LoDTensor* output, const ExecutionContext& ctx) { - auto src_desc = CreateMemDescriptor(input, input->format()); + auto src_desc = CreateMemDescriptor(input, MKLDNNMemoryFormat::any); // Since MKL-DNN doesn't support 4D column-major data formats in // inner_product primitive, transpose the weights to be in // row-major format diff --git a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc deleted file mode 100644 index b8ca40a0309e6..0000000000000 --- a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
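// In the fc_mkldnn hunks above, the source memory descriptors are now built
// with MKLDNNMemoryFormat::any instead of the input tensor's current format,
// so the inner_product primitive is free to choose the layout it prefers and
// the caller reorders to whatever the primitive descriptor reports. A minimal
// standalone sketch of that idea using raw oneDNN types and a made-up 2-D
// shape; the real code goes through the CreateMemDescriptor helper instead.
#include "dnnl.hpp"

dnnl::memory::desc AnyFormatSrcDescSketch() {
  const dnnl::memory::dims dims = {8, 16};  // illustrative shape only
  // format_tag::any defers the physical layout decision to the primitive.
  return dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
                            dnnl::memory::format_tag::any);
}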
*/ - -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; - -template -class LogSoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, - platform::Place cpu_place, - const Tensor* x, - const int axis) - : platform::MKLDNNHandlerNoCachingT( - mkldnn_engine, cpu_place) { - this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_inference, x->mem_desc(), axis); - } -}; - -template -class LogSoftmaxMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - axis = axis >= 0 ? axis : x->dims().size() + axis; - - LogSoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), x, axis); - - auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); - - auto logsoftmax_p = handler.AcquireForwardPrimitive(); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - logsoftmax_p->execute( - astream, - {{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}); - astream.wait(); - - out->set_mem_desc(dst_memory_p->get_desc()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL(log_softmax, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::LogSoftmaxMKLDNNKernel, - ops::LogSoftmaxMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 9ab09c3f3cecc..912b1be813a58 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" using dnnl::memory; using dnnl::primitive; @@ -453,6 +454,8 @@ class MatMulMKLDNNHandler matmul_attrs.set_output_scales(0, {scale_out}); } + paddle::platform::AppendActivation(ctx, post_operations); + matmul_attrs.set_post_ops(post_operations); return matmul_attrs; } diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 813ebb2c81ce9..ec341c30773e8 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -21,13 +21,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 058be90cd82ac..ea56b84c90889 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -198,8 +198,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { x_dims = x->dims(); auto axes = ctx.Attr("axis"); out_dims = phi::make_ddim( - FlattenKernel::GetOutputShape( - axes, x_dims)); + FlattenKernel::GetOutputShape(axes, x_dims)); } protected: diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h index d2aa1cfc6bbf7..c41864ee26f55 100644 --- a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h +++ b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h @@ -46,7 +46,7 @@ class SoftplusMKLDNNHandler 1.0f, dnnl::algorithm::eltwise_linear, 1.0f / beta, 0.0f); } - AppendFusedActivationIfExists(ctx, &post_ops); + platform::AppendActivation(ctx, post_ops); dnnl::primitive_attr attrs; attrs.set_post_ops(post_ops); @@ -62,42 +62,8 @@ class SoftplusMKLDNNHandler return this->AcquireMemoryFromPrimitive( this->fwd_pd_->src1_desc(), platform::to_void_cast(beta)); } - - private: - void AppendFusedActivationIfExists(const framework::ExecutionContext& ctx, - dnnl::post_ops* post_ops) { - const auto& fused_activation_type = - algo_map.find(ctx.Attr("fuse_activation_type")); - - if (fused_activation_type != algo_map.end()) { - auto scale_out = - ctx.Attr("fuse_activation_scale"); // for future int8 support - post_ops->append_eltwise(scale_out, - fused_activation_type->second, - ctx.Attr("fuse_activation_alpha"), - ctx.Attr("fuse_activation_beta")); - } - } - - static const std::unordered_map algo_map; }; -template -const std::unordered_map - SoftplusMKLDNNHandler::algo_map = { - {"relu", dnnl::algorithm::eltwise_relu}, - {"tanh", dnnl::algorithm::eltwise_tanh}, - {"leaky_relu", dnnl::algorithm::eltwise_relu}, - {"swish", dnnl::algorithm::eltwise_swish}, - {"hardswish", dnnl::algorithm::eltwise_hardswish}, - {"sqrt", dnnl::algorithm::eltwise_sqrt}, - {"abs", dnnl::algorithm::eltwise_abs}, - {"clip", dnnl::algorithm::eltwise_clip}, - {"gelu", dnnl::algorithm::eltwise_gelu_erf}, - {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, - {"relu6", dnnl::algorithm::eltwise_bounded_relu}, - {"sigmoid", dnnl::algorithm::eltwise_logistic}}; - template void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) { const auto& dev_ctx = diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 
a92d9ec2f2b4b..f71785e72cd4d 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -31,19 +31,11 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { -using paddle::platform::CPUDeviceContext; using paddle::platform::MKLDNNDeviceContext; +using phi::CPUContext; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index c0619145ad5ab..95a365f459f18 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -259,15 +259,16 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() { MLUCnnlActivationDesc::MLUCnnlActivationDesc( const cnnlActivationMode_t act_mode, const float ceof) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_)); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor_v4( - active_desc_, - act_mode, - CNNL_ACTIVATION_HIGH_PRECISION, - CNNL_NOT_PROPAGATE_NAN, - ceof, - 1.0f /*sliced_dim*/, - 1.67326319217681884765625 /*selu_alpha*/, - 1.05070102214813232421875 /*selu_lambda*/)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetActivationDescriptor_v5(active_desc_, + act_mode, + CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, + ceof, + 1.0f /*sliced_dim*/, + 1.67326319217681884765625 /*selu_alpha*/, + 1.05070102214813232421875 /*selu_lambda*/, + false /*is_elu_mode*/)); } MLUCnnlActivationDesc::MLUCnnlActivationDesc( @@ -278,14 +279,15 @@ MLUCnnlActivationDesc::MLUCnnlActivationDesc( const float selu_lambda) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_)); PADDLE_ENFORCE_MLU_SUCCESS( - cnnlSetActivationDescriptor_v4(active_desc_, + cnnlSetActivationDescriptor_v5(active_desc_, act_mode, CNNL_ACTIVATION_HIGH_PRECISION, CNNL_NOT_PROPAGATE_NAN, ceof, sliced_dim, selu_alpha, - selu_lambda)); + selu_lambda, + false /*is_elu_mode*/)); } const cnnlActivationDescriptor_t MLUCnnlActivationDesc::get() const { @@ -2350,6 +2352,36 @@ MLURNNDesc::~MLURNNDesc() { workspace_size)); } +/* static */ void MLUCnnl::Pow(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPowWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlPow(handle, + prefer, + input1_desc, + input1, + input2_desc, + input2, + workspace_ptr, + workspace_size, + output_desc, + output)); +} + /* static */ void MLUCnnl::PowR(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input1_desc, @@ -2597,6 +2629,19 @@ MLURNNDesc::~MLURNNDesc() { cnnlSign(handle, input_desc, input, output_desc, output)); } +/* static */ void MLUCnnl::IndexSelect(const ExecutionContext& ctx, + const int dim, + cnnlTensorDescriptor_t input_desc, + const void* input, + const 
cnnlTensorDescriptor_t index_desc, + const void* index, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlIndexSelect( + handle, dim, input_desc, input, index_desc, index, output_desc, output)); +} + /* static */ void MLUCnnl::IsFinite(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, @@ -4229,21 +4274,12 @@ MLURNNDesc::~MLURNNDesc() { /* static */ void MLUCnnl::NumTrue(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, - Tensor index, - uint32_t* num_true) { + const cnnlTensorDescriptor_t num_true_desc, + void* num_true) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - size_t workspace_size = 0; PADDLE_ENFORCE_MLU_SUCCESS( - cnnlGetNumTrueWorkspaceSize(handle, x_desc, &workspace_size)); - - auto& dev_ctx = GetDevCtxFromCTX(ctx); - index = ctx.AllocateTmpTensor( - {static_cast(workspace_size)}, dev_ctx); - void* index_ptr = index.mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_MLU_SUCCESS(cnnlNumTrue( - handle, x_desc, x, static_cast(index_ptr), num_true)); + cnnlNumTrue_v2(handle, x_desc, x, num_true_desc, num_true)); } /* static */ void MLUCnnl::Where(const ExecutionContext& ctx, @@ -4603,6 +4639,88 @@ MLURNNDesc::~MLURNNDesc() { reservespace_size)); } +/* static */ void MLUCnnl::RNNBackward(const ExecutionContext& ctx, + const cnnlRNNDescriptor_t rnn_desc, + cnnlWgradMode_t add_grad, + const int dev_seq_lengths[], + const void* weight_param_ptr, + void* dweight_param_ptr, + size_t weightspace_size, + const cnnlSeqDataDescriptor_t x_desc, + const void* x, + void* dx, + const cnnlSeqDataDescriptor_t y_desc, + const void* y, + const void* dy, + const cnnlTensorDescriptor_t hx_desc, + const void* hx, + const void* dhy, + void* dhx, + const cnnlTensorDescriptor_t cx_desc, + const void* cx, + const void* dcy, + void* dcx, + void* reservespace_ptr, + size_t reservespace_size) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_NOT_NULL( + rnn_desc, + paddle::platform::errors::Fatal( + "MLU RNNBackward failed. rnn_desc initializing failed.")); + PADDLE_ENFORCE_NOT_NULL( + x_desc, + paddle::platform::errors::Fatal( + "MLU RNNBackward failed. 
x_desc initializing failed.")); + auto& dev_ctx = GetDevCtxFromCTX(ctx); + size_t workspace_size; + Tensor workspace; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNTempSizes( + handle, rnn_desc, x_desc, &workspace_size, &reservespace_size)); + workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRNNBackwardData(handle, + rnn_desc, + dev_seq_lengths, + y_desc, + y, + dy, + x_desc, + dx, + hx_desc, + hx, + dhy, + dhx, + cx_desc, + cx, + dcy, + dcx, + weight_param_ptr, + weightspace_size, + workspace_ptr, + workspace_size, + reservespace_ptr, + reservespace_size)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRNNBackwardWeights(handle, + rnn_desc, + add_grad, + dev_seq_lengths, + x_desc, + x, + hx_desc, + hx, + y_desc, + y, + dweight_param_ptr, + weightspace_size, + workspace_ptr, + workspace_size, + reservespace_ptr, + reservespace_size)); +} + /* static */ void MLUCnnl::Mask(const ExecutionContext& ctx, cnnlMaskedOp_t masked_mode, const cnnlTensorDescriptor_t input_desc, @@ -4800,5 +4918,180 @@ MLURNNDesc::~MLURNNDesc() { grads_image)); } +/* static */ void MLUCnnl::SyncBatchNormStats( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const float eps, + const cnnlTensorDescriptor_t mean_desc, + void* mean, + const cnnlTensorDescriptor_t invstd_desc, + void* invstd) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSyncBatchNormStats( + handle, x_desc, x, eps, mean_desc, mean, invstd_desc, invstd)); +} + +/* static */ void MLUCnnl::SyncBatchNormGatherStatsWithCounts( + const ExecutionContext& ctx, + float momentum, + float eps, + const cnnlTensorDescriptor_t mean_all_desc, + const void* mean_all, + const cnnlTensorDescriptor_t invstd_all_desc, + const void* invstd_all, + const cnnlTensorDescriptor_t moving_mean_desc, + void* moving_mean, + const cnnlTensorDescriptor_t moving_var_desc, + void* moving_var, + const cnnlTensorDescriptor_t count_all_desc, + const void* count_all, + const cnnlTensorDescriptor_t mean_desc, + void* mean, + const cnnlTensorDescriptor_t invstd_desc, + void* invstd) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSyncBatchNormGatherStatsWithCounts(handle, + mean_all_desc, + mean_all, + invstd_all_desc, + invstd_all, + moving_mean_desc, + moving_mean, + moving_var_desc, + moving_var, + momentum, + eps, + count_all_desc, + count_all, + mean_desc, + mean, + invstd_desc, + invstd)); +} + +/* static */ void MLUCnnl::SyncBatchNormElemt( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t mean_desc, + const void* mean, + const cnnlTensorDescriptor_t invstd_desc, + const void* invstd, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t bias_desc, + const void* bias, + const cnnlTensorDescriptor_t y_desc, + void* y) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSyncBatchNormElemt(handle, + x_desc, + x, + mean_desc, + mean, + invstd_desc, + invstd, + weight_desc, + weight, + bias_desc, + bias, + y_desc, + y)); +} + +/* static */ void MLUCnnl::SyncBatchnormBackwardReduce( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t desc_dz, + const void* dz, + const cnnlTensorDescriptor_t desc_x, + const void* x, + const cnnlTensorDescriptor_t desc_mean, + const void* mean, + const cnnlTensorDescriptor_t desc_invstd, + 
const void* invstd, + const cnnlTensorDescriptor_t desc_dweight, + void* dweight, + const cnnlTensorDescriptor_t desc_dbias, + void* dbias, + const cnnlTensorDescriptor_t desc_sum_dy, + void* sum_dy, + const cnnlTensorDescriptor_t desc_sum_dy_xmu, + void* sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSyncBatchnormBackwardReduce(handle, + desc_dz, + dz, + desc_x, + x, + desc_mean, + mean, + desc_invstd, + invstd, + desc_dweight, + dweight, + desc_dbias, + dbias, + desc_sum_dy, + sum_dy, + desc_sum_dy_xmu, + sum_dy_xmu, + needs_input_grad0, + needs_input_grad1, + needs_input_grad2)); +} + +/* static */ void MLUCnnl::SyncBatchNormBackwardElemt( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t mean_desc, + const void* mean, + const cnnlTensorDescriptor_t invstd_desc, + const void* invstd, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t sum_dy_desc, + const void* sum_dy, + const cnnlTensorDescriptor_t sum_dy_xmu_desc, + const void* sum_dy_xmu, + const cnnlTensorDescriptor_t count_desc, + const void* count, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSyncBatchNormBackwardElemtV2(handle, + diff_y_desc, + diff_y, + x_desc, + x, + mean_desc, + mean, + invstd_desc, + invstd, + weight_desc, + weight, + sum_dy_desc, + sum_dy, + sum_dy_xmu_desc, + sum_dy_xmu, + count_desc, + count, + diff_x_desc, + diff_x)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 9031040ec5598..72446f56a18dc 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1276,6 +1276,15 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); + static void Pow(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output); + static void PowR(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input1_desc, @@ -1391,6 +1400,15 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); + static void IndexSelect(const ExecutionContext& ctx, + const int dim, + cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t index_desc, + const void* index, + const cnnlTensorDescriptor_t output_desc, + void* output); + static void IsFinite(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, @@ -1685,8 +1703,8 @@ class MLUCnnl { static void NumTrue(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, - Tensor index, - uint32_t* num_true); + const cnnlTensorDescriptor_t num_true_desc, + void* num_true); static void Where(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, @@ -1915,6 +1933,30 @@ class MLUCnnl { void* cy, void* reservespace_ptr); + static void RNNBackward(const ExecutionContext& ctx, + const cnnlRNNDescriptor_t rnn_desc, + cnnlWgradMode_t add_grad, + const int dev_seq_lengths[], + const 
void* weight_param_ptr, + void* dweight_param_ptr, + size_t weightspace_size, + const cnnlSeqDataDescriptor_t x_desc, + const void* x, + void* dx, + const cnnlSeqDataDescriptor_t y_desc, + const void* y, + const void* dy, + const cnnlTensorDescriptor_t hx_desc, + const void* hx, + const void* dhy, + void* dhx, + const cnnlTensorDescriptor_t cx_desc, + const void* cx, + const void* dcy, + void* dcx, + void* reservespace_ptr, + size_t reservespace_size); + static void Mask(const ExecutionContext& ctx, cnnlMaskedOp_t masked_mode, const cnnlTensorDescriptor_t input_desc, @@ -1997,8 +2039,152 @@ class MLUCnnl { const void* boxes, const cnnlTensorDescriptor_t grads_image_desc, void* grads_image); + + static void SyncBatchNormStats(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const float eps, + const cnnlTensorDescriptor_t mean_desc, + void* mean, + const cnnlTensorDescriptor_t invstd_desc, + void* invstd); + + static void SyncBatchNormGatherStatsWithCounts( + const ExecutionContext& ctx, + float momentum, + float eps, + const cnnlTensorDescriptor_t mean_all_desc, + const void* mean_all, + const cnnlTensorDescriptor_t invstd_all_desc, + const void* invstd_all, + const cnnlTensorDescriptor_t moving_mean_desc, + void* moving_mean, + const cnnlTensorDescriptor_t moving_var_desc, + void* moving_var, + const cnnlTensorDescriptor_t count_all_desc, + const void* count_all, + const cnnlTensorDescriptor_t mean_desc, + void* mean, + const cnnlTensorDescriptor_t invstd_desc, + void* invstd); + + static void SyncBatchNormElemt(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t mean_desc, + const void* mean, + const cnnlTensorDescriptor_t invstd_desc, + const void* invstd, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t bias_desc, + const void* bias, + const cnnlTensorDescriptor_t y_desc, + void* y); + + static void SyncBatchnormBackwardReduce( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t desc_dz, + const void* dz, + const cnnlTensorDescriptor_t desc_x, + const void* x, + const cnnlTensorDescriptor_t desc_mean, + const void* mean, + const cnnlTensorDescriptor_t desc_invstd, + const void* invstd, + const cnnlTensorDescriptor_t desc_dweight, + void* dweight, + const cnnlTensorDescriptor_t desc_dbias, + void* dbias, + const cnnlTensorDescriptor_t desc_sum_dy, + void* sum_dy, + const cnnlTensorDescriptor_t desc_sum_dy_xmu, + void* sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2); + + static void SyncBatchNormBackwardElemt( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t mean_desc, + const void* mean, + const cnnlTensorDescriptor_t invstd_desc, + const void* invstd, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t sum_dy_desc, + const void* sum_dy, + const cnnlTensorDescriptor_t sum_dy_xmu_desc, + const void* sum_dy_xmu, + const cnnlTensorDescriptor_t count_desc, + const void* count, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x); }; +const std::map, std::vector>> + TransPermMap = { + // trans_mode, (forward_perm, backward_perm) + {"3D_NCHW2NHWC", {{0, 2, 1}, {0, 2, 1}}}, + {"4D_NCHW2NHWC", {{0, 2, 3, 1}, {0, 3, 1, 2}}}, + {"5D_NCHWD2NDHWC", {{0, 4, 2, 3, 1}, {0, 4, 2, 3, 1}}}, + 
{"5D_NHWDC2NDHWC", {{0, 3, 1, 2, 4}, {0, 2, 3, 4, 1}}}}; + +inline void SetMLUTransposePerm(const framework::DDim& dims, + const DataLayout& data_layout, + std::vector* forward_perm, + std::vector* backward_perm, + std::vector* out_shape) { + const int dim_size = dims.size(); + PADDLE_ENFORCE_EQ((dim_size >= 3) && (dim_size <= 5), + true, + platform::errors::InvalidArgument( + "MLUTransposePerm func only support (dim_size >= 3) && " + "(dim_size <= 5), but now dim_size is %d.", + dim_size)); + + PADDLE_ENFORCE_EQ( + (data_layout == DataLayout::kNCHW) || (data_layout == DataLayout::kNHWC), + true, + platform::errors::InvalidArgument( + "MLUTransposePerm func only support DataLayout: kNCHW or kNHWC, but " + "now data_layout is %s.", + data_layout)); + + // case 1: NCHW of Paddle != NHWC of MLU when dims==3,4 + // case 2: NHWDC and NCHWD of Paddle != NDHWC of MLU when dims==5 + std::string map_key = ""; + if (data_layout == DataLayout::kNCHW) { + switch (dim_size) { + case 3: + map_key = "3D_NCHW2NHWC"; + break; + case 4: + map_key = "4D_NCHW2NHWC"; + break; + case 5: + map_key = "5D_NCHWD2NDHWC"; + break; + } + } else if (data_layout == DataLayout::kNHWC && dim_size == 5) { + map_key = "5D_NHWDC2NDHWC"; + } + assert(map_key != ""); + forward_perm->assign(TransPermMap.at(map_key).first.begin(), + TransPermMap.at(map_key).first.end()); + backward_perm->assign(TransPermMap.at(map_key).second.begin(), + TransPermMap.at(map_key).second.end()); + + auto in_dims = phi::vectorize(dims); + for (size_t i = 0; i < in_dims.size(); i++) { + out_shape->push_back(in_dims[forward_perm->at(i)]); + } +} + template inline void TransposeFromMLUTensor(const ExecutionContext& ctx, const std::vector perm, diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index f3fcab3ca5490..17f323d0bcba8 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -176,8 +176,7 @@ REGISTER_OPERATOR( ops::ModifiedHuberLossGradOpMaker); REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp); -REGISTER_OP_CPU_KERNEL( - modified_huber_loss, - ops::ModifiedHuberLossKernel); +REGISTER_OP_CPU_KERNEL(modified_huber_loss, + ops::ModifiedHuberLossKernel); REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc index 706af96d1a6c4..727a7c0f6e52c 100644 --- a/paddle/fluid/operators/mul_op_xpu.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -49,50 +49,23 @@ class MulXPUKernel : public framework::OpKernel { *y, context.template Attr("y_num_col_dims")) : *y; z->mutable_data(context.GetPlace()); - auto z_dim = z->dims(); - if (z_dim.size() != 2) { - z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } + + const XPUType* x_ptr = reinterpret_cast(x_matrix.data()); + const XPUType* y_ptr = reinterpret_cast(y_matrix.data()); + XPUType* out_ptr = reinterpret_cast(z->data()); + bool trans_a = false; bool trans_b = false; - int m = x_matrix.dims()[0]; - int k = x_matrix.dims()[1]; - int k1 = y_matrix.dims()[0]; - int n = y_matrix.dims()[1]; - PADDLE_ENFORCE_EQ( - k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); - T alpha = static_cast(1.0); - T beta = static_cast(0.0); - const T* data_a = x_matrix.data(); - const T* data_b = y_matrix.data(); - T* data_c = z->data(); - auto& dev_ctx = context.template device_context(); - - int ret = xpu_fc_wrapper( - dev_ctx.x_context(), 
- reinterpret_cast(data_a), - reinterpret_cast(data_b), - reinterpret_cast(data_c), - m, - n, - k, - trans_a, - trans_b, - nullptr, - nullptr, - nullptr, - k, - n, - n, - alpha, - beta, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); - - if (z_dim.size() != 2) { - z->Resize(z_dim); - } + auto x_dims = x_matrix.dims(); + auto y_dims = y_matrix.dims(); + + XpuFcInfo fc_info; + GetFCInfo(x_dims, y_dims, trans_a, trans_b, &fc_info); + auto& dev_ctx = + context.template device_context(); + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + MatMulXPUFunction(xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, 1.0f); } }; @@ -125,98 +98,51 @@ class MulGradXPUKernel : public framework::OpKernel { dy->set_lod(y->lod()); } auto& dev_ctx = ctx.template device_context(); + + XpuFcInfo info_forward; + GetFCInfo(x_matrix.dims(), y_matrix.dims(), false, false, &info_forward); + + const XPUType* dout_ptr = reinterpret_cast(dout->data()); + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + + xpu::Context* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + // begin calculate + const XPUType* a_1 = reinterpret_cast(NULL); + const XPUType* b_1 = reinterpret_cast(NULL); + const XPUType* a_2 = reinterpret_cast(NULL); + const XPUType* b_2 = reinterpret_cast(NULL); + XPUType* c_1 = + (dx == NULL) + ? reinterpret_cast(NULL) + : reinterpret_cast(dx->mutable_data(ctx.GetPlace())); + XPUType* c_2 = + (dy == NULL) + ? reinterpret_cast(NULL) + : reinterpret_cast(dy->mutable_data(ctx.GetPlace())); + XpuFcInfo info_dx; + XpuFcInfo info_dy; + std::tuple + fc_info = MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + false, + false, + x_ptr, + y_ptr, + dout_ptr); + std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info; if (dx) { - dx->mutable_data(ctx.GetPlace()); - Tensor dx_matrix = dx->dims().size() > 2 - ? framework::ReshapeToMatrix(*dx, x_num_col_dims) - : *dx; - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - // blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - bool trans_a = false; - bool trans_b = true; - int m = dout_mat.dims()[0]; - int k = dout_mat.dims()[1]; - int n = y_matrix.dims()[0]; - int k1 = y_matrix.dims()[1]; - PADDLE_ENFORCE_EQ( - k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); - int lda = (!trans_a) ? k : m; - int ldb = (!trans_b) ? n : k; - int ldc = n; - T alpha = static_cast(1.0); - T beta = static_cast(0.0); - const T* data_a = dout->data(); - const T* data_b = y_matrix.data(); - T* data_c = dx_matrix.data(); - - int ret = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(data_a), - reinterpret_cast(data_b), - reinterpret_cast(data_c), - m, - n, - k, - trans_a, - trans_b, - nullptr, - nullptr, - nullptr, - lda, - ldb, - ldc, - alpha, - beta, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); + MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - Tensor dy_matrix = dy->dims().size() > 2 - ? framework::ReshapeToMatrix(*dy, y_num_col_dims) - : *dy; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - // blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - bool trans_a = true; - bool trans_b = false; - int k = x_matrix.dims()[0]; - int m = x_matrix.dims()[1]; - int k1 = dout_mat.dims()[0]; - int n = dout_mat.dims()[1]; - PADDLE_ENFORCE_EQ( - k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); - int lda = (!trans_a) ? k : m; - int ldb = (!trans_b) ? n : k; - int ldc = n; - T alpha = static_cast(1.0); - T beta = static_cast(0.0); - const T* data_a = x_matrix.data(); - const T* data_b = dout->data(); - T* data_c = dy_matrix.data(); - - int ret = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(data_a), - reinterpret_cast(data_b), - reinterpret_cast(data_c), - m, - n, - k, - trans_a, - trans_b, - nullptr, - nullptr, - nullptr, - lda, - ldb, - ldc, - alpha, - beta, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); + MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_dy, 1.0f); } } }; diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index e43112f423692..76737f2bc35a7 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -100,7 +100,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor, diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index e749a267c970b..59842249adcdd 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -133,7 +133,6 @@ REGISTER_OPERATOR( ops::OneHotOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - one_hot, - ops::OneHotKernel, - ops::OneHotKernel); +REGISTER_OP_CPU_KERNEL(one_hot, + ops::OneHotKernel, + ops::OneHotKernel); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 8b9569db1a63c..90ce98c4dc316 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -131,6 +131,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, ops::DecayedAdagradOpMaker); -REGISTER_OP_CPU_KERNEL( - decayed_adagrad, - ops::DecayedAdagradOpKernel); +REGISTER_OP_CPU_KERNEL(decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc index f283cbd21ef9e..09847ff216f5a 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc @@ -74,6 +74,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dgc_momentum, ops::DGCMomentumOp, ops::DGCMomentumOpMaker); -REGISTER_OP_CPU_KERNEL( - dgc_momentum, - ops::DGCMomentumKernel); +REGISTER_OP_CPU_KERNEL(dgc_momentum, + ops::DGCMomentumKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index 95b45934ea6d2..e32cf36251742 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -120,4 +120,4 @@ REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb_init, REGISTER_OP_CPU_KERNEL( 
distributed_fused_lamb_init, - ops::DistributedFusedLambInitOpKernel); + ops::DistributedFusedLambInitOpKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 224e2a4de3f74..b85eb16a39cf2 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -167,4 +167,4 @@ REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb, REGISTER_OP_CPU_KERNEL( distributed_fused_lamb, - ops::DistributedFusedLambOpKernel); + ops::DistributedFusedLambOpKernel); diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index 023c3f27cf29e..ad1262a7d2d55 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -132,7 +132,6 @@ CCS16 - Deep Learning with Differential Privacy. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(dpsgd, ops::DpsgdOp, ops::DpsgdOpMaker); -REGISTER_OP_CPU_KERNEL( - dpsgd, - ops::DpsgdOpKernel, - ops::DpsgdOpKernel); +REGISTER_OP_CPU_KERNEL(dpsgd, + ops::DpsgdOpKernel, + ops::DpsgdOpKernel); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index edafacf508dcb..50060b1636943 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -157,5 +157,4 @@ The paper that proposed Follow The Regularized Leader (FTRL): namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); -REGISTER_OP_CPU_KERNEL( - ftrl, ops::FTRLOpKernel); +REGISTER_OP_CPU_KERNEL(ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc index e2df17fd720ad..8434da2bb0e76 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cc +++ b/paddle/fluid/operators/optimizers/lamb_op.cc @@ -247,10 +247,9 @@ learning rate, $\lambda$ the weight decay rate. 
namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(lamb, ops::LambOp, ops::LambOpMaker); -REGISTER_OP_CPU_KERNEL( - lamb, - ops::LambOpKernel, - ops::LambOpKernel); +REGISTER_OP_CPU_KERNEL(lamb, + ops::LambOpKernel, + ops::LambOpKernel); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(lamb).AddCheckpoint( diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h index df4d7b9a0438b..459900b14f61d 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -33,6 +33,7 @@ class LarsMomentumOpKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); T epsilon = ctx.Attr("epsilon"); + T rescale_grad = ctx.Attr("rescale_grad"); int op_num = param.size(); for (int i = 0; i < op_num; ++i) { @@ -46,6 +47,7 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto p = framework::EigenVector::Flatten(*(param[i])); auto v = framework::EigenVector::Flatten(*(velocity[i])); auto g = framework::EigenVector::Flatten(*(grad[i])); + auto rescale_g = rescale_grad * g; framework::Tensor p_norm_t, g_norm_t; p_norm_t.Resize({1}); @@ -55,14 +57,14 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto ep_norm = framework::EigenScalar::From(p_norm_t); auto eg_norm = framework::EigenScalar::From(g_norm_t); ep_norm = p.square().sum().sqrt(); - eg_norm = g.square().sum().sqrt(); + eg_norm = rescale_g.square().sum().sqrt(); T local_lr = lr[0]; if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { local_lr = lr[0] * lars_coeff * ep_norm(0) / (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); } - v_out = v * mu + local_lr * (g + lars_weight_decay * p); + v_out = v * mu + local_lr * (rescale_g + lars_weight_decay * p); p_out = p - v_out; } } diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cc b/paddle/fluid/operators/optimizers/merged_adam_op.cc index 042905ddfe489..f49fc72d01030 100644 --- a/paddle/fluid/operators/optimizers/merged_adam_op.cc +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cc @@ -10,7 +10,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/optimizers/merged_adam_op.h" +#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,8 +25,6 @@ class MergedAdamOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override {} - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto param_dtype = @@ -128,14 +130,15 @@ param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsil } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(merged_adam, - ops::MergedAdamOp, - ops::MergedAdamOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(merged_adamw, - ops::MergedAdamOp, - ops::MergedAdamOpMaker); - -REGISTER_OP_CPU_KERNEL( + +DECLARE_INFER_SHAPE_FUNCTOR(merged_adam, + MergedAdamInferMetaFunctor, + PD_INFER_META(phi::MergedAdamInferMeta)); + +REGISTER_OPERATOR( merged_adam, - ops::MergedAdamOpKernel, - ops::MergedAdamOpKernel); + ops::MergedAdamOp, + ops::MergedAdamOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + MergedAdamInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cu b/paddle/fluid/operators/optimizers/merged_adam_op.cu deleted file mode 100644 index 578c9864fa42d..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_adam_op.cu +++ /dev/null @@ -1,230 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/optimizers/merged_adam_op.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" - -namespace paddle { -namespace operators { - -template -__global__ void AdamKernelREG(MT beta1, - MT beta2, - MT epsilon, - MT beta1_pow_, - MT beta2_pow_, - const MT* moment1, - MT* moment1_out, - const MT* moment2, - MT* moment2_out, - const MT* lr_, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - int ndim) { - MT lr = *lr_; - MT beta1_pow = beta1_pow_; - MT beta2_pow = beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? 
master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void AdamKernelMEM(MT beta1, - MT beta2, - MT epsilon, - const MT* beta1_pow_, - const MT* beta2_pow_, - const MT* moment1, - MT* moment1_out, - const MT* moment2, - MT* moment2_out, - const MT* lr_, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - int ndim) { - MT lr = *lr_; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void UpdateBetaPow(T beta1, - T beta2, - const T* beta1_pow_, - const T* beta2_pow_, - T* beta1_pow_out, - T* beta2_pow_out) { - *beta1_pow_out = beta1 * beta1_pow_[0]; - *beta2_pow_out = beta2 * beta2_pow_[0]; -} - -template -class MergedAdamOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using MPDType = typename details::MPTypeTrait::Type; - - auto param = ctx.MultiInput("Param"); - auto grad = ctx.MultiInput("Grad"); - auto lr = ctx.MultiInput("LearningRate"); - auto mom1 = ctx.MultiInput("Moment1"); - auto mom2 = ctx.MultiInput("Moment2"); - auto beta1_pow = ctx.MultiInput("Beta1Pow"); - auto beta2_pow = ctx.MultiInput("Beta2Pow"); - - auto param_out = ctx.MultiOutput("ParamOut"); - auto mom1_out = ctx.MultiOutput("Moment1Out"); - auto mom2_out = ctx.MultiOutput("Moment2Out"); - auto beta1_pow_out = ctx.MultiOutput("Beta1PowOut"); - auto beta2_pow_out = ctx.MultiOutput("Beta2PowOut"); - - MPDType beta1 = static_cast(ctx.Attr("beta1")); - MPDType beta2 = static_cast(ctx.Attr("beta2")); - MPDType epsilon = static_cast(ctx.Attr("epsilon")); - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - const bool multi_precision = ctx.Attr("multi_precision"); - auto master_param = ctx.MultiInput("MasterParam"); - auto master_param_out = - ctx.MultiOutput("MasterParamOut"); - - auto& dev_ctx = ctx.template device_context(); - - size_t param_num = param.size(); - for (size_t idx = 0; idx < param_num; idx++) { - const MPDType* master_in_data = - multi_precision ? master_param[idx]->data() : nullptr; - MPDType* master_out_data = - multi_precision - ? 
master_param_out[idx]->mutable_data(ctx.GetPlace()) - : nullptr; - - // update param and moment - int threads = 512; - int blocks = (param[idx]->numel() + threads - 1) / threads; - - if (beta1_pow[idx]->place() == platform::CPUPlace() && - beta2_pow[idx]->place() == platform::CPUPlace()) { - // Compute with betapow in REG - AdamKernelREG<<>>( - beta1, - beta2, - epsilon, - *beta1_pow[idx]->data(), - *beta2_pow[idx]->data(), - mom1[idx]->data(), - mom1_out[idx]->mutable_data(ctx.GetPlace()), - mom2[idx]->data(), - mom2_out[idx]->mutable_data(ctx.GetPlace()), - lr[idx]->data(), - grad[idx]->data(), - param[idx]->data(), - param_out[idx]->mutable_data(ctx.GetPlace()), - master_in_data, - master_out_data, - param[idx]->numel()); - if (!use_global_beta_pow) { - // Cpu update - beta1_pow_out[idx]->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow[idx]->data()[0]; - beta2_pow_out[idx]->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow[idx]->data()[0]; - } - } else { - AdamKernelMEM<<>>( - beta1, - beta2, - epsilon, - beta1_pow[idx]->data(), - beta2_pow[idx]->data(), - mom1[idx]->data(), - mom1_out[idx]->mutable_data(ctx.GetPlace()), - mom2[idx]->data(), - mom2_out[idx]->mutable_data(ctx.GetPlace()), - lr[idx]->data(), - grad[idx]->data(), - param[idx]->data(), - param_out[idx]->mutable_data(ctx.GetPlace()), - master_in_data, - master_out_data, - param[idx]->numel()); - if (!use_global_beta_pow) { - // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, - beta2, - beta1_pow[idx]->data(), - beta2_pow[idx]->data(), - beta1_pow_out[idx]->mutable_data(ctx.GetPlace()), - beta2_pow_out[idx]->mutable_data(ctx.GetPlace())); - } - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(merged_adam, - ops::MergedAdamOpCUDAKernel, - ops::MergedAdamOpCUDAKernel, - ops::MergedAdamOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.h b/paddle/fluid/operators/optimizers/merged_adam_op.h deleted file mode 100644 index 3b7c8ab0286c3..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_adam_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/kernels/funcs/adam_functors.h" - -namespace paddle { -namespace operators { - -namespace scatter = paddle::operators::math::scatter; - -template -class MergedAdamOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto param = ctx.MultiInput("Param"); - size_t n = param.size(); - auto grad = ctx.MultiInput("Grad"); - PADDLE_ENFORCE_EQ(n, - grad.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to " - "Input(Param), but got the size of Input(Grad) " - "is %d, the size of Input(Param) is %d.", - grad.size(), - n)); - auto lr = ctx.MultiInput("LearningRate"); - PADDLE_ENFORCE_EQ( - n, - lr.size(), - platform::errors::InvalidArgument( - "The size of Input(LearningRate) must be equal to " - "Input(Param), but got the size of Input(LearningRate) " - "is %d, the size of Input(Param) is %d.", - lr.size(), - n)); - auto mom1 = ctx.MultiInput("Moment1"); - PADDLE_ENFORCE_EQ(n, - mom1.size(), - platform::errors::InvalidArgument( - "The size of Input(Moment1) must be equal to " - "Input(Param), but got the size of Input(Moment1) " - "is %d, the size of Input(Param) is %d.", - mom1.size(), - n)); - auto mom2 = ctx.MultiInput("Moment2"); - PADDLE_ENFORCE_EQ(n, - mom2.size(), - platform::errors::InvalidArgument( - "The size of Input(Moment2) must be equal to " - "Input(Param), but got the size of Input(Moment2) " - "is %d, the size of Input(Param) is %d.", - mom2.size(), - n)); - auto beta1_pow = ctx.MultiInput("Beta1Pow"); - PADDLE_ENFORCE_EQ(n, - beta1_pow.size(), - platform::errors::InvalidArgument( - "The size of Input(Beta1Pow) must be equal to " - "Input(Param), but got the size of Input(Beta1Pow) " - "is %d, the size of Input(Param) is %d.", - beta1_pow.size(), - n)); - auto beta2_pow = ctx.MultiInput("Beta2Pow"); - PADDLE_ENFORCE_EQ(n, - beta2_pow.size(), - platform::errors::InvalidArgument( - "The size of Input(Beta2Pow) must be equal to " - "Input(Param), but got the size of Input(Beta2Pow) " - "is %d, the size of Input(Param) is %d.", - beta2_pow.size(), - n)); - - auto param_out = ctx.MultiOutput("ParamOut"); - auto mom1_out = ctx.MultiOutput("Moment1Out"); - auto mom2_out = ctx.MultiOutput("Moment2Out"); - auto beta1_pow_out = ctx.MultiOutput("Beta1PowOut"); - auto beta2_pow_out = ctx.MultiOutput("Beta2PowOut"); - - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - T epsilon = static_cast(ctx.Attr("epsilon")); - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - size_t param_num = param.size(); - for (size_t idx = 0; idx < param_num; idx++) { - phi::funcs::AdamFunctor functor( - beta1, - beta2, - epsilon, - beta1_pow[idx]->data(), - beta2_pow[idx]->data(), - mom1[idx]->data(), - mom1_out[idx]->mutable_data(ctx.GetPlace()), - mom2[idx]->data(), - mom2_out[idx]->mutable_data(ctx.GetPlace()), - lr[idx]->data(), - grad[idx]->data(), - param[idx]->data(), - param_out[idx]->mutable_data(ctx.GetPlace())); - functor(param[idx]->numel()); - if (!use_global_beta_pow) { - beta1_pow_out[idx]->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow[idx]->data()[0]; - beta2_pow_out[idx]->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow[idx]->data()[0]; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc index b640e47e6e638..85b2f818fe137 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -22,8 +25,6 @@ class MergedMomentumOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override {} - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto param_dtype = @@ -100,11 +101,11 @@ class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(merged_momentum, + MergedMomentumInferShapeFunctor, + PD_INFER_META(phi::MergedMomentumInferMeta)); + REGISTER_OP_WITHOUT_GRADIENT(merged_momentum, ops::MergedMomentumOp, - ops::MergedMomentumOpMaker); - -REGISTER_OP_CPU_KERNEL( - merged_momentum, - ops::MergedMomentumOpKernel, - ops::MergedMomentumOpKernel); + ops::MergedMomentumOpMaker, + MergedMomentumInferShapeFunctor); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.h b/paddle/fluid/operators/optimizers/merged_momentum_op.h deleted file mode 100644 index 77c8f3dbd3555..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.h +++ /dev/null @@ -1,370 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -using MultiPrecisionType = typename details::MPTypeTrait::Type; - -template -struct MergedMomentumMasterParams { - MT *PADDLE_RESTRICT master_params[kParamNum]; - - HOSTDEVICE MT *MasterParam(size_t idx) const { return master_params[idx]; } - HOSTDEVICE void SetMasterParam(size_t idx, MT *p) { master_params[idx] = p; } -}; - -template -struct MergedMomentumMasterParams { - HOSTDEVICE constexpr MT *MasterParam(size_t) const { return nullptr; } - HOSTDEVICE constexpr void SetMasterParam(size_t, MT *) {} -}; - -template -struct MergedMomentumKernelParam - : public MergedMomentumMasterParams { - static constexpr auto N = kParamNum; - size_t sizes[N]; - T *PADDLE_RESTRICT params[N]; - const T *PADDLE_RESTRICT grads[N]; - MT *PADDLE_RESTRICT velocitys[N]; - const MultiPrecisionType *PADDLE_RESTRICT lr; - MT mu; - MT rescale_grad; - uint32_t param_num; - - HOSTDEVICE void operator()(size_t i) const { - const MT lr_val = static_cast(*lr); - for (uint32_t idx = 0; idx < param_num; ++idx) { - auto size = sizes[idx]; - if (i >= size) continue; - - auto param_p = params[idx]; - auto grad_p = grads[idx]; - auto velocity_p = velocitys[idx]; - auto master_param_p = this->MasterParam(idx); - - const MT param = - master_param_p ? master_param_p[i] : static_cast(param_p[i]); - const MT grad = static_cast(grad_p[i]) * rescale_grad; - const MT velocity = velocity_p[i]; - const MT velocity_out = velocity * mu + grad; - const MT param_out = param - lr_val * velocity_out; - velocity_p[i] = velocity_out; - param_p[i] = static_cast(param_out); - if (master_param_p) { - master_param_p[i] = param_out; - } - } - } -}; - -template -class MergedMomentumOpKernel : public framework::OpKernel { - using MPType = typename operators::details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const bool multi_precision = ctx.Attr("multi_precision"); - if (multi_precision) { - InnerCompute(ctx, multi_precision); - } else { - InnerCompute(ctx, multi_precision); - } - } - - private: - template - void InnerCompute(const framework::ExecutionContext &ctx, - const bool multi_precision) const { - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); - size_t n = params.size(); - PADDLE_ENFORCE_EQ(n, - params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(ParamOut) must be equal to " - "Input(Param), but got the size of Output(ParamOut) " - "is %d, the size of Input(Param) is %d.", - params_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(params[i], - params_out[i], - platform::errors::InvalidArgument( - "The size of Input(Param) and Output(ParamOut) " - "must be the same Tensors.")); - } - - auto grads = ctx.MultiInput("Grad"); - PADDLE_ENFORCE_EQ( - n, - grads.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to Input(Param), but got " - "the size of Input(Grad) is %d, the size of Input(Param) is %d.", - grads.size(), - n)); - - auto velocitys = ctx.MultiInput("Velocity"); - PADDLE_ENFORCE_EQ(n, - velocitys.size(), - 
platform::errors::InvalidArgument( - "The size of Input(Velocity) must be equal to " - "Input(Param), but got the size of Input(Velocity) " - "is %d, the size of Input(Param) is %d.", - velocitys.size(), - n)); - - auto velocitys_out = ctx.MultiOutput("VelocityOut"); - PADDLE_ENFORCE_EQ( - n, - velocitys_out.size(), - platform::errors::InvalidArgument( - "The size of Output(VelocityOut) must be " - "equal to Input(Param), but got the size of Output(VelocityOut) is " - "%d, the size of Input(Param) is %d.", - velocitys_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(velocitys[i], - velocitys_out[i], - platform::errors::InvalidArgument( - "Input(Velocity) and Output(VelocityOut) must be " - "the same Tensors.")); - } - - auto master_params = ctx.MultiInput("MasterParam"); - auto master_params_out = - ctx.MultiOutput("MasterParamOut"); - if (multi_precision) { - PADDLE_ENFORCE_EQ( - n, - master_params.size(), - platform::errors::InvalidArgument( - "The size of Input(MasterParam) must be " - "equal to Input(Param), but got the size of Input(MasterParam) " - "is %d, the size of Input(Param) is %d.", - master_params.size(), - n)); - PADDLE_ENFORCE_EQ( - n, - master_params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(MasterParamOut) must be equal to " - "Input(MasterParam), but got the size of Output(MasterParamOut) " - "is %d, the size of Input(Param) is %d.", - master_params_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(master_params[i], - master_params_out[i], - platform::errors::InvalidArgument( - "Input(MasterParam) and Output(MasterParamOut) " - "must be the same Tensors.")); - PADDLE_ENFORCE_NOT_NULL(master_params[i], - platform::errors::InvalidArgument( - "Input(MasterParam) must be provided when " - "multi_precision=True.")); - } - } else { - master_params.clear(); - master_params_out.clear(); - } - - auto mu = ctx.Attr("mu"); - auto rescale_grad = ctx.Attr("rescale_grad"); - auto lrs = ctx.MultiInput("LearningRate"); - if (lrs.size() != 1) { - PADDLE_ENFORCE_EQ( - n, - lrs.size(), - platform::errors::InvalidArgument( - "If the size of Input(LearningRate) is not 1, the size of " - "Input(LearningRate) must be " - "equal to Input(Param), but got the size of Input(LearningRate) " - "is %d, the size of Input(Param) is %d.", - lrs.size(), - n)); - } - auto use_nesterov = ctx.Attr("use_nesterov"); - auto regularization_methods = - ctx.Attr>("regularization_method"); - auto regularization_coeffs = - ctx.Attr>("regularization_coeff"); - if (regularization_methods.size() != 0) { - PADDLE_ENFORCE_EQ( - n, - regularization_methods.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_method) must be equal " - "to Input(Param), but got the size of " - "Attr(regularization_method) is %d, the size of Input(Param) is " - "%d.", - regularization_methods.size(), - n)); - PADDLE_ENFORCE_EQ( - n, - regularization_coeffs.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_coeff) must be equal " - "to Input(Param), but got the size of Attr(regularization_coeff) " - "is %d, the size of Input(Param) is %d.", - regularization_coeffs.size(), - n)); - } - - VLOG(5) << "use_nesterov: " << use_nesterov - << ", regularization_methods.size(): " - << regularization_methods.size() - << ", regularization_coeffs.size(): " - << regularization_coeffs.size(); - - auto &dev_ctx = ctx.template device_context(); - - if (lrs.size() == 1 && use_nesterov == false && - regularization_methods.size() 
== 0) { -#define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ - MergedMomentumKernelParam kernel_params; \ - constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ - size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ - kernel_params.mu = static_cast(mu); \ - kernel_params.rescale_grad = static_cast(rescale_grad); \ - kernel_params.lr = lrs[0]->data(); \ - for (size_t i = 0; i < kernel_num; ++i) { \ - size_t start = i * kMaxMergedNum; \ - size_t end = std::min((i + 1) * kMaxMergedNum, n); \ - kernel_params.param_num = static_cast(end - start); \ - size_t max_size = 0; \ - for (size_t j = 0; j < kernel_params.param_num; ++j) { \ - auto size = static_cast(params_out[j + start]->numel()); \ - max_size = std::max(max_size, size); \ - kernel_params.sizes[j] = size; \ - kernel_params.params[j] = params_out[j + start]->data(); \ - kernel_params.grads[j] = grads[j + start]->data(); \ - kernel_params.velocitys[j] = velocitys_out[j + start]->data(); \ - kernel_params.SetMasterParam( \ - j, \ - kMultiPrecision ? master_params_out[j + start]->data() \ - : nullptr); \ - } \ - platform::ForRange for_range(dev_ctx, max_size); \ - for_range(kernel_params); \ - VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ - << kernel_params.param_num; \ - } - if (multi_precision) { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); - } else { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); - } -#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL - } else { - for (size_t idx = 0; idx < n; idx++) { - phi::RegularizationType regularization_flag = - regularization_methods.size() > 0 && - regularization_methods[idx] == "l2_decay" - ? phi::RegularizationType::kL2DECAY - : phi::RegularizationType::kNONE; - - MT regularization_coeff = static_cast(0.0); - if (regularization_coeffs.size() != 0) { - regularization_coeff = static_cast(regularization_coeffs[idx]); - } - auto lr_temp = lrs.size() > 1 ? lrs[idx] : lrs[0]; - - const MT *master_in_data = - multi_precision ? master_params[idx]->data() : nullptr; - MT *master_out_data = - multi_precision ? 
master_params_out[idx]->data() : nullptr; - if (platform::is_cpu_place(ctx.GetPlace())) { - phi::CPUDenseMomentumFunctor functor; - functor(params[idx], - grads[idx], - velocitys[idx], - lr_temp, - static_cast(mu), - use_nesterov, - regularization_flag, - regularization_coeff, - params_out[idx], - velocitys_out[idx]); - VLOG(10) << "Launch MergedMomentum cpu kernel."; - } else if (platform::is_gpu_place(ctx.GetPlace())) { - platform::ForRange for_range( - static_cast(ctx.device_context()), - params[idx]->numel()); -#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type) \ - phi::DenseMomentumFunctor functor( \ - params[idx]->data(), \ - grads[idx]->data(), \ - velocitys[idx]->data(), \ - lr_temp->data(), \ - master_in_data, \ - static_cast(mu), \ - static_cast(rescale_grad), \ - params[idx]->numel(), \ - regularization_coeff, \ - params_out[idx]->data(), \ - velocitys_out[idx]->data(), \ - master_out_data); \ - for_range(functor); - if (use_nesterov) { - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( - phi::UseNesterov, phi::RegularizationType::kL2DECAY); - VLOG(10) - << "Launch MergedMomentum gpu kernel use_nesterov kL2DECAY."; - } else { - PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( - phi::UseNesterov, phi::RegularizationType::kNONE); - VLOG(10) - << "Launch MergedMomentum gpu kernel use_nesterov kNONE."; - } - } else { - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( - phi::NoNesterov, phi::RegularizationType::kL2DECAY); - VLOG(10) - << "Launch MergedMomentum gpu kernel no_nesterov kL2DECAY."; - } else { - PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( - phi::NoNesterov, phi::RegularizationType::kNONE); - VLOG(10) << "Launch MergedMomentum gpu kernel no_nesterov kNONE."; - } - } - } - } - VLOG(10) - << "Launch MergedMomentum kernel with multi_lr and regularization."; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index 32af057ecd417..90faf8f389a89 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -12,8 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" -#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc index ff131138e8a6f..38479d6dba22e 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc @@ -12,8 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc index 7486c0c2b8cbe..e332972f7576a 100644 --- a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc +++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc @@ -23,7 +23,7 @@ namespace paddle { namespace operators { template -class SGDOneDNNKernel : public SGDOpKernel { +class SGDOneDNNKernel : public SGDOpKernel { protected: void dense_param_and_grad_kernel( const framework::ExecutionContext &ctx) const override { diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index 5eeeb7353072e..f576827f9cadf 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -85,5 +85,5 @@ REGISTER_OP_WITHOUT_GRADIENT(pow2_decay_with_linear_warmup, ops::Pow2DecayWithLinearWarmupOpMaker); REGISTER_OP_CPU_KERNEL( pow2_decay_with_linear_warmup, - ops::Pow2DecayWithLinearWarmupOpKernel, - ops::Pow2DecayWithLinearWarmupOpKernel); + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h index 60274f6b667da..d3d2e48fdcd6c 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/macros.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index a5424f5cda5f0..072e39dd91cc0 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -134,6 +134,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); -REGISTER_OP_CPU_KERNEL( - proximal_adagrad, - ops::ProximalAdagradOpKernel); +REGISTER_OP_CPU_KERNEL(proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index dc7e9c90af59f..50676863678c1 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -107,6 +107,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); -REGISTER_OP_CPU_KERNEL( - proximal_gd, - ops::ProximalGDOpKernel); +REGISTER_OP_CPU_KERNEL(proximal_gd, + ops::ProximalGDOpKernel); diff --git 
a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index eb987151472e2..6addb7c2febd8 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -1,141 +1,145 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_XPU - -#include - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/device_wrapper.h" - -namespace paddle { -namespace operators { - -static inline float GetAttrFromTensor(const framework::Tensor* tensor) { - const float* tensor_data = tensor->data(); - framework::Tensor cpu_tensor; - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place())) { - paddle::framework::TensorCopySync( - *tensor, platform::CPUPlace(), &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - return tensor_data[0]; -} - -using framework::OpKernelType; -using framework::Tensor; - -template -class RmspropOpXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using paddle::framework::LoDTensor; - - // check Param & Grad tensor type - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type,Expected Var(%s)'s " - "type is LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), - true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type,Expected Var(%s)'s " - "type is LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - // inputs - auto& param = GET_DATA_SAFELY( - ctx.Input("Param"), "Input", "Param", "Rmsprop"); - auto& meanSquare = GET_DATA_SAFELY( - ctx.Input("MeanSquare"), "Input", "MeanSquare", "Rmsprop"); - auto& grad = GET_DATA_SAFELY( - ctx.Input("Grad"), "Input", "Grad", "Rmsprop"); - auto& mom = GET_DATA_SAFELY( - ctx.Input("Moment"), "Input", "Moment", "Rmsprop"); - - auto* learning_rate = ctx.Input("LearningRate"); - PADDLE_ENFORCE_EQ(learning_rate->dims().size(), - 1, - platform::errors::InvalidArgument( - "learining rate should have dimension = 1." 
- " But received learning rate dim [%s] ", - learning_rate->dims().size())); - T lr = static_cast(GetAttrFromTensor(learning_rate)); - - // constants - T epsilon = static_cast(ctx.Attr("epsilon")); - T decay = static_cast(ctx.Attr("decay")); - T momentum = static_cast(ctx.Attr("momentum")); - - // outputs - auto& param_out = GET_DATA_SAFELY( - ctx.Output("ParamOut"), "Output", "ParamOut", "Rmsprop"); - auto& mom_out = GET_DATA_SAFELY( - ctx.Output("MomentOut"), "Output", "MomentOut", "Rmsprop"); - auto& mom_sqrt_out = GET_DATA_SAFELY(ctx.Output("MeanSquareOut"), - "Output", - "MeanSquareOut", - "Rmsprop"); - auto& dev_ctx = ctx.template device_context(); - - ///// rmsprop优化算法 - /// - /// ms_out[i] = rho * ms[i] + (1 - rho) * (g[i] * g[i]); - /// - /// mom_out[i] = momentum * mom[i] + lr * - /// (g[i] / ((float)sqrt(ms_out[i] + epsilon))); - /// - /// p_out[i] = p[i] - mom_out[i]; - /// DLL_EXPORT int rmsprop(Context* ctx, const float* p, - /// const float* ms, const float* g, const float* mom, - /// float epsilon, float rho, float momentum, float lr, - /// float *ms_out, float *mom_out, float *p_out, int n) - int r = xpu::rmsprop(dev_ctx.x_context(), - grad.template data(), - param.template data(), - meanSquare.template data(), - mom.template data(), - param_out.template mutable_data(ctx.GetPlace()), - mom_sqrt_out.template mutable_data(ctx.GetPlace()), - mom_out.template mutable_data(ctx.GetPlace()), - epsilon, - decay, - momentum, - lr, - param.numel()); - - PADDLE_ENFORCE_XDNN_SUCCESS(r, "rmsprop"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - rmsprop, - ops::RmspropOpXPUKernel); -#endif +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include + +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +static inline float GetAttrFromTensor(const framework::Tensor* tensor) { + const float* tensor_data = tensor->data(); + framework::Tensor cpu_tensor; + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place())) { + paddle::framework::TensorCopySync( + *tensor, platform::CPUPlace(), &cpu_tensor); + tensor_data = cpu_tensor.data(); + } + return tensor_data[0]; +} + +using framework::OpKernelType; +using framework::Tensor; + +template +class RmspropOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using paddle::framework::LoDTensor; + + // check Param & Grad tensor type + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), + true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), + true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); + + // inputs + auto& param = GET_DATA_SAFELY( + ctx.Input("Param"), "Input", "Param", "Rmsprop"); + auto& meanSquare = GET_DATA_SAFELY( + ctx.Input("MeanSquare"), "Input", "MeanSquare", "Rmsprop"); + auto& grad = GET_DATA_SAFELY( + ctx.Input("Grad"), "Input", "Grad", "Rmsprop"); + auto& mom = GET_DATA_SAFELY( + ctx.Input("Moment"), "Input", "Moment", "Rmsprop"); + + auto* learning_rate = ctx.Input("LearningRate"); + PADDLE_ENFORCE_EQ(learning_rate->dims().size(), + 1, + platform::errors::InvalidArgument( + "learining rate should have dimension = 1." + " But received learning rate dim [%s] ", + learning_rate->dims().size())); + T lr = static_cast(GetAttrFromTensor(learning_rate)); + + // constants + T epsilon = static_cast(ctx.Attr("epsilon")); + T decay = static_cast(ctx.Attr("decay")); + T momentum = static_cast(ctx.Attr("momentum")); + + bool centered = ctx.Attr("centered"); + PADDLE_ENFORCE_EQ(centered, + false, + platform::errors::Unimplemented( + "centered=True is not supported in the xpu kernel of " + "rmsprop. use XPU_BLACK_LIST to disable this op.")); + /* + TODO(houj04): when XDNN api supports 'center', add input of + mean_grad_input and output of mean_grad_output. 
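The update that the xpu::rmsprop call below is expected to perform is the classic non-centered RMSProp rule, as documented in the comment block of the removed version of this file. A minimal CPU sketch of that math only (not the XDNN implementation, and ignoring the unsupported centered variant):

#include <cmath>

// ms  = rho * ms + (1 - rho) * g * g
// mom = momentum * mom + lr * g / sqrt(ms + epsilon)
// p   = p - mom
void RmspropUpdate(float* p, float* ms, float* mom, const float* g, int n,
                   float epsilon, float rho, float momentum, float lr) {
  for (int i = 0; i < n; ++i) {
    ms[i] = rho * ms[i] + (1.0f - rho) * g[i] * g[i];
    mom[i] = momentum * mom[i] + lr * g[i] / std::sqrt(ms[i] + epsilon);
    p[i] -= mom[i];
  }
}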
auto *mean_grad_input = + ctx.Input("MeanGrad"); auto *mean_grad_output = + ctx.Output("MeanGradOut"); + */ + + // outputs + auto& param_out = GET_DATA_SAFELY( + ctx.Output("ParamOut"), "Output", "ParamOut", "Rmsprop"); + auto& mom_out = GET_DATA_SAFELY( + ctx.Output("MomentOut"), "Output", "MomentOut", "Rmsprop"); + auto& mom_sqrt_out = GET_DATA_SAFELY(ctx.Output("MeanSquareOut"), + "Output", + "MeanSquareOut", + "Rmsprop"); + auto& dev_ctx = ctx.template device_context(); + + // int rmsprop(Context* ctx, const T* g, const T* p, const float* ms, const + // float* mom, T* p_out, float* ms_out, float* mom_out, float epsilon, float + // rho, float momentum, float lr, int n); + int r = xpu::rmsprop(dev_ctx.x_context(), + grad.template data(), + param.template data(), + meanSquare.template data(), + mom.template data(), + param_out.template mutable_data(ctx.GetPlace()), + mom_sqrt_out.template mutable_data(ctx.GetPlace()), + mom_out.template mutable_data(ctx.GetPlace()), + epsilon, + decay, + momentum, + lr, + param.numel()); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "rmsprop"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + rmsprop, + ops::RmspropOpXPUKernel); +#endif diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index cb87850f43c5e..02d8bcbd279dc 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -159,8 +159,7 @@ class SGDOpKernel : public framework::OpKernel { }; template -class SGDOpKernel - : public framework::OpKernel { +class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *param_var = ctx.InputVar("Param"); diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc index 48f211f9c5ace..a92bbbc838a8a 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc @@ -119,7 +119,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::SparseMomentumOpInferVarType); -REGISTER_OP_CPU_KERNEL( - sparse_momentum, - ops::SparseMomentumOpKernel, - ops::SparseMomentumOpKernel); +REGISTER_OP_CPU_KERNEL(sparse_momentum, + ops::SparseMomentumOpKernel, + ops::SparseMomentumOpKernel); diff --git a/paddle/fluid/operators/overlap_add_op.cc b/paddle/fluid/operators/overlap_add_op.cc index 18d88f1069185..108c2df4cd2e1 100644 --- a/paddle/fluid/operators/overlap_add_op.cc +++ b/paddle/fluid/operators/overlap_add_op.cc @@ -186,22 +186,20 @@ REGISTER_OPERATOR(overlap_add_grad, ops::OverlapAddOpGrad); REGISTER_OP_CPU_KERNEL( overlap_add, - ops::OverlapAddKernel, - ops::OverlapAddKernel, - ops::OverlapAddKernel, - ops::OverlapAddKernel, - ops::OverlapAddKernel>, - ops::OverlapAddKernel>); + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel>, + ops::OverlapAddKernel>); REGISTER_OP_CPU_KERNEL( overlap_add_grad, - ops::OverlapAddGradKernel, - ops::OverlapAddGradKernel, - ops::OverlapAddGradKernel, - ops::OverlapAddGradKernel, - ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel>, - ops::OverlapAddGradKernel>); diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc index 
c512870792073..766ecaee0d6c9 100644 --- a/paddle/fluid/operators/p_norm_op.cc +++ b/paddle/fluid/operators/p_norm_op.cc @@ -112,7 +112,7 @@ class PnormOpGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(p_norm, PNormInferShapeFunctor, diff --git a/paddle/fluid/operators/p_norm_op_xpu.cc b/paddle/fluid/operators/p_norm_op_xpu.cc new file mode 100644 index 0000000000000..b37a65e794d08 --- /dev/null +++ b/paddle/fluid/operators/p_norm_op_xpu.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" + +namespace paddle { +namespace operators { + +inline void GetDims( + const phi::DDim& dim, int axis, int* m, int* t, int* n, bool asvector) { + *m = 1; + *n = 1; + *t = dim[axis]; + if (asvector) { + *t = product(dim); + } else { + for (int i = 0; i < axis; ++i) { + (*m) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*n) *= dim[i]; + } + } +} + +using Tensor = framework::Tensor; +template +class P_NormXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + float porder = ctx.Attr("porder"); + int axis = ctx.Attr("axis"); + bool asvector = ctx.Attr("asvector"); + + auto& dev_ctx = ctx.template device_context(); + auto xdim = in->dims(); + if (axis < 0) axis = xdim.size() + axis; + std::vector r_dim; + std::vector x_dim; + std::vector y_dim; + int m = 1; + int n = 1; + int t = 1; + GetDims(xdim, axis, &m, &t, &n, asvector); + x_dim.push_back(m); + x_dim.push_back(t); + x_dim.push_back(n); + + r_dim.push_back(1); + + y_dim.push_back(m); + y_dim.push_back(n); + + int r = 0; + + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* tmp_x = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_x); + r = xpu::abs(dev_ctx.x_context(), + reinterpret_cast(in->data()), + tmp_x, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs"); + if (porder == INFINITY) { + r = xpu::reduce_max(dev_ctx.x_context(), + tmp_x, + reinterpret_cast(out->data()), + x_dim, + r_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_max"); + } else if (porder == -INFINITY) { + r = xpu::reduce_min(dev_ctx.x_context(), + tmp_x, + reinterpret_cast(out->data()), + x_dim, + r_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_min"); + } else if (porder == 0) { + XPUType* zeros = RAII_GUARD.alloc_l3_or_gm(1); + PADDLE_ENFORCE_XDNN_NOT_NULL(zeros); + r = 
xpu::constant(dev_ctx.x_context(), zeros, 1, 0.0f); + std::vector zeros_dim(1, 1); + + bool* tmp2_x = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(tmp2_x); + + r = xpu::broadcast_not_equal( + dev_ctx.x_context(), tmp_x, zeros, tmp2_x, x_dim, zeros_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_not_equal"); + + XPUType* x_mid = tmp_x; + + r = xpu::cast( + dev_ctx.x_context(), tmp2_x, x_mid, m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + + r = xpu::reduce_sum(dev_ctx.x_context(), + x_mid, + reinterpret_cast(out->data()), + x_dim, + r_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + } else { + Tensor porder_tensor; + framework::DDim pdim = phi::make_ddim({1}); + porder_tensor.mutable_data(pdim, in->place()); + r = xpu::constant( + dev_ctx.x_context(), porder_tensor.data(), 1, porder); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + std::vector p_dim(1, 1); + + XPUType* tmp2_x = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(tmp2_x); + r = xpu::broadcast_pow( + dev_ctx.x_context(), + reinterpret_cast(tmp_x), + reinterpret_cast(porder_tensor.data()), + tmp2_x, + x_dim, + p_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); + + XPUType* tmp_y = RAII_GUARD.alloc_l3_or_gm(m * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_y); + + r = xpu::reduce_sum(dev_ctx.x_context(), + reinterpret_cast(tmp2_x), + tmp_y, + x_dim, + r_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + r = xpu::constant( + dev_ctx.x_context(), porder_tensor.data(), 1, 1.0f / porder); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + + r = xpu::broadcast_pow( + dev_ctx.x_context(), + reinterpret_cast(tmp_y), + reinterpret_cast(porder_tensor.data()), + reinterpret_cast(out->data()), + y_dim, + p_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); + dev_ctx.Wait(); + } + } +}; + +template +class P_NormGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Out"); + auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + auto xdim = x->dims(); + float porder = ctx.Attr("porder"); + bool asvector = ctx.Attr("asvector"); + int axis = ctx.Attr("axis"); + axis = axis < 0 ? 
xdim.size() + axis : axis; + + auto& dev_ctx = ctx.template device_context(); + + int m, t, n; + GetDims(xdim, axis, &m, &t, &n, asvector); + + std::vector r_dim; + std::vector x_dim; + std::vector y_dim; + + x_dim.push_back(m); + x_dim.push_back(t); + x_dim.push_back(n); + + y_dim.push_back(m); + y_dim.push_back(1); + y_dim.push_back(n); + + int r = 0; + if (porder == 0) { + r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(dx->data()), + m * t * n, + static_cast(0)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + } else if (porder == INFINITY || porder == -INFINITY) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* x_abs = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_abs); + r = xpu::abs(dev_ctx.x_context(), + reinterpret_cast(x->data()), + x_abs, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs"); + + bool* dx_t = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(dx_t); + + XPUType* dx_mid = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(dx_mid); + + r = xpu::broadcast_equal( + dev_ctx.x_context(), + reinterpret_cast(x_abs), + reinterpret_cast(y->data()), + dx_t, + x_dim, + y_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_equal"); + + r = xpu::cast( + dev_ctx.x_context(), dx_t, dx_mid, m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + + XPUType* x_sign = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_sign); + r = xpu::sign(dev_ctx.x_context(), + reinterpret_cast(x->data()), + x_sign, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign"); + + XPUType* dx_pre_dy = x_abs; + r = xpu::mul(dev_ctx.x_context(), + reinterpret_cast(dx_mid), + reinterpret_cast(x_sign), + dx_pre_dy, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul"); + + r = xpu::broadcast_mul(dev_ctx.x_context(), + dx_pre_dy, + reinterpret_cast(dy->data()), + reinterpret_cast(dx->data()), + x_dim, + y_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); + + } else { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* x_abs = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_abs); + r = xpu::abs(dev_ctx.x_context(), + reinterpret_cast(x->data()), + x_abs, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs"); + + Tensor porder_tensor; + framework::DDim pdim = phi::make_ddim({1}); + porder_tensor.mutable_data(pdim, x->place()); + r = xpu::constant( + dev_ctx.x_context(), porder_tensor.data(), 1, porder - 1.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + std::vector p_dim(1, 1); + + XPUType* x_pow = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_pow); + r = xpu::broadcast_pow( + dev_ctx.x_context(), + reinterpret_cast(x_abs), + reinterpret_cast(porder_tensor.data()), + x_pow, + x_dim, + p_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); + + XPUType* y_pow = RAII_GUARD.alloc_l3_or_gm(m * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(y_pow); + r = xpu::broadcast_pow( + dev_ctx.x_context(), + reinterpret_cast(y->data()), + reinterpret_cast(porder_tensor.data()), + y_pow, + y_dim, + p_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); + dev_ctx.Wait(); + + XPUType* dx_t = x_abs; + + r = xpu::broadcast_div( + dev_ctx.x_context(), x_pow, y_pow, dx_t, x_dim, y_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_div"); + + XPUType* x_sign = x_pow; + r = xpu::sign(dev_ctx.x_context(), + reinterpret_cast(x->data()), + x_sign, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign"); + + XPUType* dx_mid = RAII_GUARD.alloc_l3_or_gm(m * t * n); + 
PADDLE_ENFORCE_XDNN_NOT_NULL(dx_mid); + + r = xpu::broadcast_mul(dev_ctx.x_context(), + reinterpret_cast(x_sign), + reinterpret_cast(dy->data()), + dx_mid, + x_dim, + y_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); + + r = xpu::broadcast_mul(dev_ctx.x_context(), + reinterpret_cast(dx_t), + reinterpret_cast(dx_mid), + reinterpret_cast(dx->data()), + x_dim, + x_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + p_norm, ops::P_NormXPUKernel); +REGISTER_OP_XPU_KERNEL( + p_norm_grad, + ops::P_NormGradXPUKernel); + +#endif diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 72073ed3067c3..c2dfb8e61e5eb 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -536,8 +536,8 @@ class Pad2dGradCPUKernel : public framework::OpKernel { auto d_out_dims = d_out->dims(); const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), + phi::funcs::SetConstant set_zero; + set_zero(context.template device_context(), d_in, static_cast(0)); const int pad_top = pads[0]; diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 571ead1710a92..e523c93f5d10b 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -247,19 +247,17 @@ REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOpGradMaker); REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); -REGISTER_OP_CPU_KERNEL( - pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); +REGISTER_OP_CPU_KERNEL(pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); REGISTER_OP_CPU_KERNEL( pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); REGISTER_OP_CUDA_KERNEL( pad_constant_like, diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index 98bbede0323a9..e9b54632ddc01 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -211,12 +211,11 @@ REGISTER_OPERATOR(partial_concat, REGISTER_OPERATOR(partial_concat_grad, ops::PartialConcatGradOp); -REGISTER_OP_CPU_KERNEL( - partial_concat, - ops::PartialConcatKernel, - ops::PartialConcatKernel, - ops::PartialConcatKernel, - ops::PartialConcatKernel); +REGISTER_OP_CPU_KERNEL(partial_concat, + ops::PartialConcatKernel, + ops::PartialConcatKernel, + ops::PartialConcatKernel, + ops::PartialConcatKernel); REGISTER_OP_CPU_KERNEL(partial_concat_grad, ops::PartialConcatGradientOpKernel, diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index f99617fdc634f..affe06f20956a 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -111,8 +111,8 @@ class PartialConcatGradientOpKernel : public framework::OpKernel { auto all_length = grad_batch_len * batch_size; // 
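To keep the intent of the new p_norm XPU kernels clear: the input is viewed as an (m, t, n) block with the reduction taken over the middle axis, the forward result is the vector p-norm with special cases for p = 0 and p = +/-infinity, and for finite non-zero p the backward pass uses dx = dy * sign(x) * |x|^(p-1) / y^(p-1). A minimal CPU sketch of the forward reduction, with the layout and names assumed as just described:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// out[i, k] reduces x[i, :, k]; porder == 0 counts non-zeros,
// +/-INFINITY take max/min of |x|, otherwise (sum |x|^p)^(1/p).
std::vector<float> PNorm(const std::vector<float>& x, int m, int t, int n,
                         float porder) {
  std::vector<float> out(static_cast<std::size_t>(m) * n, 0.0f);
  for (int i = 0; i < m; ++i) {
    for (int k = 0; k < n; ++k) {
      float acc = (porder == -INFINITY)
                      ? std::numeric_limits<float>::infinity()
                      : 0.0f;
      for (int j = 0; j < t; ++j) {
        float a = std::fabs(x[(static_cast<std::size_t>(i) * t + j) * n + k]);
        if (porder == INFINITY) {
          acc = std::max(acc, a);
        } else if (porder == -INFINITY) {
          acc = std::min(acc, a);
        } else if (porder == 0.0f) {
          acc += (a != 0.0f) ? 1.0f : 0.0f;
        } else {
          acc += std::pow(a, porder);
        }
      }
      bool finite_p =
          porder != 0.0f && porder != INFINITY && porder != -INFINITY;
      out[static_cast<std::size_t>(i) * n + k] =
          finite_p ? std::pow(acc, 1.0f / porder) : acc;
    }
  }
  return out;
}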
initialize - auto& place = *ctx.template device_context() - .eigen_device(); + auto& place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 3b69efb8e7489..4d4c1e54cff27 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -210,12 +210,11 @@ REGISTER_OPERATOR(partial_sum, REGISTER_OPERATOR(partial_sum_grad, ops::PartialSumGradOp); -REGISTER_OP_CPU_KERNEL( - partial_sum, - ops::PartialSumKernel, - ops::PartialSumKernel, - ops::PartialSumKernel, - ops::PartialSumKernel); +REGISTER_OP_CPU_KERNEL(partial_sum, + ops::PartialSumKernel, + ops::PartialSumKernel, + ops::PartialSumKernel, + ops::PartialSumKernel); REGISTER_OP_CPU_KERNEL(partial_sum_grad, ops::PartialSumGradientOpKernel, diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index b45c4cb9b65c7..58ac0671dde10 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -79,8 +79,8 @@ class PartialSumGradientOpKernel : public framework::OpKernel { } // initialize - auto& place = *ctx.template device_context() - .eigen_device(); + auto& place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index e40fe2025e281..4f9d1343c8395 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -122,9 +122,9 @@ class MLUPoolOpKernel : public framework::OpKernel { handle, pool_mode, out_w, out_h, &extra_input_size); if (extra_input_size > 0) { - paddle::platform::CPUDeviceContext cpu_ctx; + phi::CPUContext cpu_ctx; framework::Tensor extra_host_tensor = - ctx.AllocateTmpTensor( + ctx.AllocateTmpTensor( {static_cast(extra_input_size)}, cpu_ctx); cnnlInitPoolingExtraInput(handle, pool_desc.get(), diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index d8d814a6ba78a..7208b195b4600 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/phi/kernels/funcs/pooling.h" #ifdef PADDLE_WITH_XPU namespace paddle { @@ -51,6 +52,9 @@ class PoolXPUKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); + bool ceil_mode = context.Attr("ceil_mode"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); PADDLE_ENFORCE_EQ( ksize.size(), 2, @@ -70,10 +74,27 @@ class PoolXPUKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } + const int n = in_x->dims()[0]; const int c = in_x->dims()[1]; const int in_h = in_x->dims()[2]; const int in_w = in_x->dims()[3]; + + framework::DDim data_dims; + + data_dims = phi::slice_ddim(in_x->dims(), 2, in_x->dims().size()); + phi::funcs::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + ksize); + if (ceil_mode) { + paddings[1] += (strides[0] - 1); + paddings[3] += (strides[1] - 1); + } + auto input = reinterpret_cast(in_x->data()); out->mutable_data(context.GetPlace()); auto output = reinterpret_cast(out->data()); @@ -135,6 +156,9 @@ class PoolGradXPUKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); + bool ceil_mode = context.Attr("ceil_mode"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); const int* index_data = nullptr; PADDLE_ENFORCE_EQ( ksize.size(), @@ -163,6 +187,22 @@ class PoolGradXPUKernel : public framework::OpKernel { const int c = in_x->dims()[1]; const int in_h = in_x->dims()[2]; const int in_w = in_x->dims()[3]; + + framework::DDim data_dims; + + data_dims = phi::slice_ddim(in_x->dims(), 2, in_x->dims().size()); + phi::funcs::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + ksize); + if (ceil_mode) { + paddings[1] += (strides[0] - 1); + paddings[3] += (strides[1] - 1); + } + auto input = reinterpret_cast(in_x->data()); auto output = reinterpret_cast(out->data()); auto output_grad = reinterpret_cast(out_grad->data()); diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index c9e45fe51cf14..cf8f17d5f747c 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -200,15 +200,13 @@ REGISTER_OPERATOR(prroi_pool, ops::PRROIPoolGradMaker, ops::PRROIPoolGradMaker); REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - prroi_pool, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - prroi_pool_grad, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel); +REGISTER_OP_CPU_KERNEL(prroi_pool, + ops::CPUPRROIPoolOpKernel, + ops::CPUPRROIPoolOpKernel, + ops::CPUPRROIPoolOpKernel, + ops::CPUPRROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL(prroi_pool_grad, + ops::CPUPRROIPoolGradOpKernel, + ops::CPUPRROIPoolGradOpKernel, + ops::CPUPRROIPoolGradOpKernel, + ops::CPUPRROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cc b/paddle/fluid/operators/prune_gate_by_capacity_op.cc index 91223ff0d4813..14494f426d2d0 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cc +++ 
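One note on the ceil_mode handling added to the XPU pooling kernels above: after UpdatePadding resolves the padding_algorithm ("SAME"/"VALID"/explicit), rounding the output size up is folded into an extra (stride - 1) of trailing padding, so the usual floor-division formula can still be used. A small self-contained sketch of that equivalence (names are illustrative):

#include <cassert>

// Pooling output length along one spatial dim, floor division:
//   out = (in + pad_before + pad_after - ksize) / stride + 1
// ceil_mode rounds the division up, which is the same as adding
// (stride - 1) to the trailing padding and then flooring.
int PoolOutSize(int in, int ksize, int stride, int pad_before, int pad_after,
                bool ceil_mode) {
  int numer = in + pad_before + pad_after - ksize;
  if (ceil_mode) numer += stride - 1;
  return numer / stride + 1;
}

int main() {
  // 7-wide input, 3-wide window, stride 2, no padding: both modes give 3.
  assert(PoolOutSize(7, 3, 2, 0, 0, false) == 3);
  assert(PoolOutSize(7, 3, 2, 0, 0, true) == 3);
  // 8-wide input: floor keeps 3 windows, ceil adds a partial 4th one.
  assert(PoolOutSize(8, 3, 2, 0, 0, false) == 3);
  assert(PoolOutSize(8, 3, 2, 0, 0, true) == 4);
  return 0;
}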
b/paddle/fluid/operators/prune_gate_by_capacity_op.cc @@ -128,6 +128,5 @@ REGISTER_OP_WITHOUT_GRADIENT(prune_gate_by_capacity, REGISTER_OP_CPU_KERNEL( prune_gate_by_capacity, - ops::PruneGateByCapacityCPUKernel, - ops::PruneGateByCapacityCPUKernel); + ops::PruneGateByCapacityCPUKernel, + ops::PruneGateByCapacityCPUKernel); diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index 6f0bb0a39d473..dbdf58637580d 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -152,5 +152,4 @@ REGISTER_OPERATOR(distributed_lookup_table, REGISTER_OP_CPU_KERNEL( distributed_lookup_table, - ops::DistributedLookupTableKernel); + ops::DistributedLookupTableKernel); diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc index 32326531dd779..a2bf63da10bd2 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc @@ -135,6 +135,5 @@ REGISTER_OPERATOR(distributed_push_sparse, REGISTER_OP_CPU_KERNEL( distributed_push_sparse, - ops::DistributedPushSparseKernel, - ops::DistributedPushSparseKernel); + ops::DistributedPushSparseKernel, + ops::DistributedPushSparseKernel); diff --git a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc index 9f28bd27f10af..0d0897b0af011 100644 --- a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc +++ b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc @@ -134,7 +134,7 @@ void PressTestSendRecv( int64_t data_size = vars_len; VLOG(0) << "float num: " << data_size; float* data_ptr = new float[data_size]; - file.read((char*)data_ptr, 9437184); + file.read(static_cast(data_ptr), 9437184); VLOG(0) << "send data is: " << data_ptr[0] << ", " << data_ptr[1]; std::vector var_names{"34"}; int loopCnt = 10000; @@ -169,7 +169,7 @@ void PressTestSendRecv( delete[] values; std::ofstream recv("/recv_20_34", std::ios::out | std::ios::binary); - recv.write((char*)values, data_size); + recv.write(static_cast(values, data_size)); recv.close(); t.join(); } @@ -177,7 +177,7 @@ void PressTestSendRecv( void TestScopeSendRecv( std::shared_ptr heter_client_ptr_) { platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); framework::Executor exe(place); std::shared_ptr send_scope_ptr = std::make_shared(); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index 1bf0cd598d438..b7267d0c6bc55 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -150,7 +150,7 @@ void RunHeterServerOp(std::string endpoint) { framework::Scope scope; platform::CPUPlace place; framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); LOG(INFO) << "before GetHeterListenAndServProgram"; GetHeterListenAndServProgram(&program, endpoint); @@ -211,7 +211,7 @@ TEST(HETER_LISTEN_AND_SERV, CPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); // create var on local scope int64_t rows_numel = 10; diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc 
b/paddle/fluid/operators/pscore/heter_server_test.cc index c9cd445c98a14..a0332e857cc4a 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -162,7 +162,7 @@ void StartSendAndRecvServer(std::string endpoint) { framework::Scope scope; platform::CPUPlace place; framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); LOG(INFO) << "before AppendSendAndRecvBlock"; auto block = AppendSendAndRecvBlock(&program); std::string in_var_name("x"); @@ -254,7 +254,7 @@ TEST(SENDANDRECV, CPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); // create var on local scope int64_t rows_numel = 10; diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 71f7cf6a91be5..73eb3f1509223 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -104,12 +104,11 @@ REGISTER_OP_CUDA_KERNEL( ops::SendAndRecvKernel, ops::SendAndRecvKernel, ops::SendAndRecvKernel); -REGISTER_OP_CPU_KERNEL( - send_and_recv, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel); +REGISTER_OP_CPU_KERNEL(send_and_recv, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel); REGISTER_OP_VERSION(send_and_recv) .AddCheckpoint( diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc index 11ef5cc99e842..61ef001930a04 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -138,7 +138,7 @@ void StartSendAndRecvServer(std::string endpoint) { framework::Scope scope; platform::CPUPlace place; framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); LOG(INFO) << "before AppendSendAndRecvBlock"; auto block = AppendSendAndRecvBlock(&program); std::string in_var_name("x"); @@ -227,7 +227,7 @@ TEST(SENDANDRECV, CPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); framework::Executor exe(place); // create var on local scope diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index e443439dafe83..8d0d2d3090c17 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -163,7 +163,7 @@ void StartSendAndRecvServer(std::string endpoint) { framework::Scope scope; platform::CPUPlace place; framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); LOG(INFO) << "before AppendSendAndRecvBlock"; auto block = AppendSendAndRecvBlock(&program); std::string in_var_name("x"); diff --git a/paddle/fluid/operators/pscore/switch_server_test.cc b/paddle/fluid/operators/pscore/switch_server_test.cc index 4af4d4b89275d..a5e6fff4804af 100644 --- a/paddle/fluid/operators/pscore/switch_server_test.cc +++ b/paddle/fluid/operators/pscore/switch_server_test.cc @@ -55,7 +55,7 @@ void StartSwitchInterServer( int main(int argc, char* argv[]) { platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); 
framework::Executor exe(place); framework::ProgramDesc program; diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 512179ba56526..9255a5f164bc4 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -207,23 +207,19 @@ REGISTER_OPERATOR(py_layer, REGISTER_OP_CPU_KERNEL( py_layer, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel>, - ops::PyLayerOpKernel>); + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel>, + ops::PyLayerOpKernel>); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( py_layer, diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index fbda6a13d6592..0dd74f9324fa3 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -586,10 +586,8 @@ REGISTER_OPERATOR(pyramid_hash, ops::PyramidHashGradOpMaker); REGISTER_OPERATOR(pyramid_hash_grad, ops::PyramidHashOpGrad); -REGISTER_OP_CPU_KERNEL( - pyramid_hash, - ops::CPUPyramidHashOPKernel, - ops::CPUPyramidHashOPKernel); -REGISTER_OP_CPU_KERNEL( - pyramid_hash_grad, - ops::CPUPyramidHashOPGradKernel); +REGISTER_OP_CPU_KERNEL(pyramid_hash, + ops::CPUPyramidHashOPKernel, + ops::CPUPyramidHashOPKernel); +REGISTER_OP_CPU_KERNEL(pyramid_hash_grad, + ops::CPUPyramidHashOPGradKernel); diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 4074d0dfc63af..90ace1ba773d1 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -124,7 +124,6 @@ REGISTER_OPERATOR(qr, REGISTER_OPERATOR(qr_grad, ops::QrGradOp); -REGISTER_OP_CPU_KERNEL( - qr_grad, - ops::QrGradKernel, - ops::QrGradKernel); +REGISTER_OP_CPU_KERNEL(qr_grad, + ops::QrGradKernel, + ops::QrGradKernel); diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index 4580acbe3fc83..65be8acaa5525 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -25,8 +25,8 @@ namespace paddle { namespace operators { template -struct ChannelDequantizeFunctorV2 { - void operator()(const platform::CPUDeviceContext &dev_ctx, +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::CPUContext &dev_ctx, const framework::Tensor *in, const framework::Tensor *scale, T max_range, @@ -72,8 +72,8 @@ struct ChannelDequantizeFunctorV2 { } }; -template struct ChannelDequantizeFunctorV2; -template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; class QuantizeLinearOp : public framework::OperatorWithKernel { public: @@ -176,7 +176,7 @@ In above three formulas, the range value of c is as follow: } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( quantize_linear, diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index d7c30142ee778..b86cd9538acea 100644 --- 
a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -98,7 +98,7 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); template -using Kernel = ops::RandomCropKernel; +using Kernel = ops::RandomCropKernel; REGISTER_OP_CPU_KERNEL(random_crop, Kernel, Kernel, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index 78841dae77fb6..aee430b50579d 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -30,7 +30,7 @@ template struct Random; template <> -struct Random { +struct Random { using Engine = std::minstd_rand; template @@ -218,7 +218,7 @@ class RandomCropKernel : public framework::OpKernel { for_range(functor); - Random::Engine engine(seed); + Random::Engine engine(seed); engine.discard(functor.prod_batchsize_dims_ * (functor.rank_ - functor.num_batchsize_dims_)); *ctx.Output("SeedOut")->mutable_data( diff --git a/paddle/fluid/operators/randperm_op_mlu.cc b/paddle/fluid/operators/randperm_op_mlu.cc index 0d4fbf2d12f7c..a3ebf8f5c00fc 100644 --- a/paddle/fluid/operators/randperm_op_mlu.cc +++ b/paddle/fluid/operators/randperm_op_mlu.cc @@ -15,9 +15,32 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/randperm_op.h" +namespace paddle { +namespace operators { + +template +class RandpermMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int n = ctx.Attr("n"); + unsigned int seed = static_cast(ctx.Attr("seed")); + framework::Variable* out_var = ctx.OutputVar("Out"); + framework::Tensor* out_tensor = + framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); + + framework::Tensor tmp_tensor; + tmp_tensor.Resize(phi::make_ddim({n})); + T* tmp_data = tmp_tensor.mutable_data(platform::CPUPlace()); + random_permate(tmp_data, n, seed); + framework::TensorCopySync(tmp_tensor, ctx.GetPlace(), out_tensor); + } +}; + +} // namespace operators +} // namespace paddle + template -using kernel = - paddle::operators::RandpermKernel; +using kernel = paddle::operators::RandpermMLUKernel; REGISTER_OP_MLU_KERNEL( randperm, kernel, kernel, kernel, kernel); diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 4c97b9bf5bd1c..716fc58d4187b 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -194,10 +194,9 @@ REGISTER_OPERATOR(rank_attention_grad, ops::RankAttentionGradOp, ops::RankAttentionGradOpNoNeedBufferVarsInference); -REGISTER_OP_CPU_KERNEL( - rank_attention, - ops::RankAttentionKernel, - ops::RankAttentionKernel); +REGISTER_OP_CPU_KERNEL(rank_attention, + ops::RankAttentionKernel, + ops::RankAttentionKernel); REGISTER_OP_VERSION(rank_attention) .AddCheckpoint( diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 05f2fb7067e51..edf82d00950ae 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -24,9 +24,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { @@ -243,11 +240,9 @@ REGISTER_OPERATOR(rank_loss, ops::RankLossGradMaker, ops::RankLossGradMaker); REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp); -REGISTER_OP_CPU_KERNEL( - rank_loss, ops::RankLossKernel); 
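The new RandpermMLUKernel above builds the permutation on the host and only copies the finished result to the device. Assuming random_permate fills the buffer with a seeded shuffle of 0..n-1 (its exact engine is not reproduced here), the host-side step is equivalent to something like:

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// Fill 0..n-1 and shuffle with a seeded engine; the kernel then copies the
// result into the MLU tensor with TensorCopySync.
std::vector<int64_t> RandPermCpu(int n, unsigned int seed) {
  std::vector<int64_t> perm(n);
  std::iota(perm.begin(), perm.end(), 0);
  std::mt19937 engine(seed);  // how a zero/unset seed is treated is the
                              // caller's concern and is assumed, not shown
  std::shuffle(perm.begin(), perm.end(), engine);
  return perm;
}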
-REGISTER_OP_CPU_KERNEL( - rank_loss_grad, - ops::RankLossGradKernel); +REGISTER_OP_CPU_KERNEL(rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL(rank_loss_grad, + ops::RankLossGradKernel); REGISTER_OP_CUDA_KERNEL( rank_loss, diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index f3913a62b29d1..a36d51e42f5c8 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -19,6 +19,9 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/backends/device_manager.h" + namespace paddle { namespace operators { namespace reader { @@ -105,11 +108,30 @@ BufferedReader::BufferedReader( } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place_)) { + auto stream = ((platform::CustomDeviceContext + *)(platform::DeviceContextPool::Instance().Get(place_))) + ->stream(); + custom_device_compute_stream_ = + std::make_shared(place_, stream); + + custom_device_events_.resize(buffer_size); + for (auto &event : custom_device_events_) { + event = std::make_shared(); + event->Init(place_); + } + custom_device_stream_ = std::make_shared(); + custom_device_stream_->Init(place_); + } +#endif + cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); mlu_buffer_.resize(buffer_size); xpu_buffer_.resize(buffer_size); + custom_device_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -410,6 +432,58 @@ void BufferedReader::ReadAsync(size_t i) { platform::XPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place_)) { + TensorVec &custom_device = custom_device_buffer_[i]; + if (custom_device.empty()) { + custom_device.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ(custom_device.size(), + cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on CustomDevice and CPU " + "devices are not matched. 
" + "The number on CustomDevice is %d, on CPU is %d", + custom_device.size(), + cpu.size())); + } + + std::vector custom_device_ptrs; + custom_device_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + custom_device[i].Resize(cpu[i].dims()); + custom_device[i].set_layout(cpu[i].layout()); + custom_device_ptrs.emplace_back( + custom_device[i].mutable_data(place_, cpu[i].type())); + } + + phi::DeviceManager::SetDevice(place_); + phi::DeviceManager::GetDeviceWithPlace(place_)->RecordEvent( + custom_device_events_[i].get(), custom_device_compute_stream_.get()); + phi::DeviceManager::GetDeviceWithPlace(place_)->StreamWaitEvent( + custom_device_stream_.get(), custom_device_events_[i].get()); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto custom_device_ptr = custom_device_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_custom_place(cpu_place))) { + memory::Copy(place_, custom_device_ptr, cpu_place, cpu_ptr, size); + custom_device_stream_->Synchronize(); + } else { + memory::Copy(place_, custom_device_ptr, cpu_place, cpu_ptr, size); + } + custom_device[i].set_lod(cpu[i].lod()); + } + custom_device_stream_->Synchronize(); + } +#endif return i; })); } @@ -449,6 +523,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(mlu_buffer_[i]); } else if (platform::is_xpu_place(place_)) { *out = std::move(xpu_buffer_[i]); + } else if (platform::is_custom_place(place_)) { + *out = std::move(custom_device_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 94c2fb12486bc..06aaf4c12057d 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -37,7 +37,10 @@ #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/backends/event.h" +#include "paddle/phi/backends/stream.h" +#endif namespace paddle { namespace operators { namespace reader { @@ -82,6 +85,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector npu_buffer_; std::vector mlu_buffer_; std::vector xpu_buffer_; + std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -106,6 +110,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::shared_ptr custom_device_compute_stream_; + std::shared_ptr custom_device_stream_; + std::vector> custom_device_events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index c7b0e8ced59e7..7fba45fa53923 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -27,9 +27,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc 
b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 6947ca5b71a93..f0de94666357e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -27,9 +27,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc index fce8c51f003d3..36776cebfcd46 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc @@ -17,26 +17,14 @@ REGISTER_REDUCE_OP(reduce_amax); REGISTER_OP_CPU_KERNEL( reduce_amax, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL(reduce_amax_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_amax_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops:: + ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc index a6c4cb5510529..bb99ca9b17e7e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc @@ -17,26 +17,14 @@ REGISTER_REDUCE_OP(reduce_amin); REGISTER_OP_CPU_KERNEL( reduce_amin, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL(reduce_amin_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_amin_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops:: + ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index 85e262add2e74..6634ccaaa0121 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ec3cf1908c5b5..e9bc3905a22ee 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -545,6 +545,38 @@ class ReduceOp : public framework::OperatorWithKernel { } } + // oneDNN's reduction kernel is optimized only for reducing throughout the + // most outer dims, so in case of another type of reduction, it would be + // better to fallback to native implementation + static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) { + // native reduce kernels don't support bf16 + // so oneDNN kernel is enforced in that case + if (ctx.Input("X")->dtype() == + experimental::DataType::BFLOAT16) + return true; + + auto reduce_dims = ctx.Attr>("dim"); + const bool reduce_all = ctx.Attr("reduce_all"); + int ndims = ctx.Input("X")->dims().size(); + + if 
(reduce_all) { + return true; + } + + for (size_t i = 0; i < reduce_dims.size(); ++i) { + if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i]; + } + sort(reduce_dims.begin(), reduce_dims.end()); + for (size_t i = 0; i < reduce_dims.size(); ++i) { + if (reduce_dims[reduce_dims.size() - i - 1] != + static_cast(ndims - i - 1)) { + return false; + } + } + + return true; + } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. @@ -554,7 +586,8 @@ class ReduceOp : public framework::OperatorWithKernel { return framework::OpKernelType(input_data_type, ctx.GetPlace()); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + if (this->CanMKLDNNBeUsed(ctx, input_data_type) && + HasOptimizedOneDNNKernel(ctx)) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 1c88c4cb70842..578954663c7f5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -25,9 +25,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index ca24cc9c634a3..d072dcfa5eb94 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -27,9 +27,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc index 2ed43cf8f0ea4..f3bec9489fdb0 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cc +++ b/paddle/fluid/operators/repeat_interleave_op.cc @@ -172,16 +172,14 @@ REGISTER_OPERATOR(repeat_interleave, REGISTER_OPERATOR(repeat_interleave_grad, ops::RepeatInterleaveGradOp, ops::RepeatInterleaveGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - repeat_interleave, - ops::RepeatInterleaveKernel, - ops::RepeatInterleaveKernel, - ops::RepeatInterleaveKernel, - ops::RepeatInterleaveKernel); +REGISTER_OP_CPU_KERNEL(repeat_interleave, + ops::RepeatInterleaveKernel, + ops::RepeatInterleaveKernel, + ops::RepeatInterleaveKernel, + ops::RepeatInterleaveKernel); REGISTER_OP_CPU_KERNEL( repeat_interleave_grad, - ops::RepeatInterleaveGradKernel, - ops::RepeatInterleaveGradKernel, - ops::RepeatInterleaveGradKernel, - ops::RepeatInterleaveGradKernel); + ops::RepeatInterleaveGradKernel, + ops::RepeatInterleaveGradKernel, + ops::RepeatInterleaveGradKernel, + ops::RepeatInterleaveGradKernel); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ec9c1198996c1..b665cce096207 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -420,7 +420,7 @@ class ReshapeKernel { pt_scalar_shape = phi::IntArray(shape_attr); } if (platform::is_cpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeKernel(static_cast(dev_ctx), *in, pt_scalar_shape, @@ -455,7 +455,7 @@ class 
ReshapeGradKernel { d_x->mutable_data(ctx.GetPlace(), d_out->type()); if (platform::is_cpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } @@ -485,7 +485,7 @@ class ReshapeDoubleGradKernel { dd_out->mutable_data(ctx.GetPlace(), dd_x->type()); if (platform::is_cpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } diff --git a/paddle/fluid/operators/rnn_op_mlu.cc b/paddle/fluid/operators/rnn_op_mlu.cc index 653c50c83b83e..fe567333b6d40 100644 --- a/paddle/fluid/operators/rnn_op_mlu.cc +++ b/paddle/fluid/operators/rnn_op_mlu.cc @@ -28,7 +28,7 @@ void reset_parameter_vector( const std::vector& raw_params_vec, const int& num_layers, const bool& is_bidirec, - std::vector>>* params_vec) { + std::vector>>* params_vec) { // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers @@ -47,7 +47,8 @@ void reset_parameter_vector( } using remove_cv_t = typename std::remove_cv::type; params_vec->at(i)[j] = std::make_pair( - raw_params_vec[tensor_idx]->template data(), + const_cast( + raw_params_vec[tensor_idx]->template data()), raw_params_vec[tensor_idx]->numel() * sizeof(T)); } } @@ -66,7 +67,6 @@ class RNNMLUKernel : public framework::OpKernel { // Output auto state = ctx.MultiOutput("State"); auto* output = ctx.Output("Out"); - // auto* dropout_mask = ctx.Output("DropoutState"); auto* reserve_data = ctx.Output("Reserve"); // Attributes const int& num_layers = ctx.Attr("num_layers"); @@ -79,14 +79,6 @@ class RNNMLUKernel : public framework::OpKernel { sequence_length = ctx.Input("SequenceLength"); } - // if (dropout_mask->IsInitialized()) { - // if (dropout_mask->numel() != output->numel()) dropout_mask->clear(); - // } - // dropout_mask->mutable_data(output->dims(), ctx.GetPlace()); - // auto& dev_ctx = ctx.template device_context(); - // phi::funcs::SetConstant ones; - // ones(dev_ctx, dropout_mask, static_cast(1)); - auto init_h = pre_state[0]; // -> hx auto init_c = pre_state[1]; // -> cx auto last_h = state[0]; @@ -143,7 +135,7 @@ class RNNMLUKernel : public framework::OpKernel { init_c->dims()[0])); // weightlist - std::vector>> parameter_lists; + std::vector>> parameter_lists; parameter_lists.resize(num_layers); reset_parameter_vector( weight_list, num_layers, is_bidirec, ¶meter_lists); @@ -363,9 +355,390 @@ class RNNMLUKernel : public framework::OpKernel { } }; +template +class RNNMLUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto stream = ctx.template device_context().stream(); + // get the tensor pointer for the input + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto* output = ctx.Input("Out"); + auto* reserve_data = ctx.Input("Reserve"); + const int& num_layers = ctx.Attr("num_layers"); + const bool& is_bidirec = ctx.Attr("is_bidirec"); + const int& hidden_size = ctx.Attr("hidden_size"); + const std::string& mode = ctx.Attr("mode"); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + const Tensor* sequence_length = nullptr; + if (has_seq_length) { + 
sequence_length = ctx.Input("SequenceLength"); + } + + PADDLE_ENFORCE_EQ( + mode, + "LSTM", + platform::errors::InvalidArgument( + "XPU only support LSTM mode now, current mode is %s", mode)); + + auto init_h = pre_state[0]; // -> hx + auto init_c = pre_state[1]; // -> cx + + auto output_grad = ctx.Input(framework::GradVarName("Out")); + auto state_grad = ctx.MultiInput(framework::GradVarName("State")); + auto last_h_grad = state_grad[0]; // -> dhy + auto last_c_grad = state_grad[1]; // -> dcy + + // get the tensor pointer for the output + auto* input_grad = ctx.Output(framework::GradVarName("Input")); + auto weight_grad_list = ctx.MultiOutput( + framework::GradVarName("WeightList")); + auto pre_state_grad = + ctx.MultiOutput(framework::GradVarName("PreState")); + Tensor* init_h_grad = nullptr; + Tensor* init_c_grad = nullptr; + if (pre_state_grad.size() > 0) { // has gradient + init_h_grad = pre_state_grad[0]; // -> dhx + init_c_grad = pre_state_grad[1]; // -> dcx + } + + // check shape + const int in_out_dim_num = input->dims().size(); + const int& seq_len = input->dims()[0]; + const int& batch_size = input->dims()[1]; + const int& input_dim = input->dims()[2]; + const int& direction_num = is_bidirec ? 2 : 1; + int in_dim_arr[in_out_dim_num] = {seq_len, batch_size, input_dim}; + int out_dim_arr[in_out_dim_num] = { + seq_len, batch_size, direction_num * hidden_size}; + int proj_size = hidden_size; + PADDLE_ENFORCE_EQ( + num_layers, + 1, + platform::errors::InvalidArgument( + "MLU only support 1 num_layers, current num_layers is %s", + num_layers)); + PADDLE_ENFORCE_EQ( + init_h->dims()[0], + num_layers * direction_num, + platform::errors::InvalidArgument("The num_layers of in RNN layer must" + " be the same as first dim of init" + "hidden, but received num_layers:%d," + " dim:%d", + num_layers, + init_h->dims()[0])); + PADDLE_ENFORCE_EQ( + init_c->dims()[0], + num_layers * direction_num, + platform::errors::InvalidArgument( + "The num_layers of in RNN layer must" + " be the same as first dim of cell state hidden, but received" + " num_layers:%d, dim:%d", + num_layers, + init_c->dims()[0])); + + std::vector>> parameter_lists; + parameter_lists.resize(num_layers); + reset_parameter_vector( + weight_list, num_layers, is_bidirec, ¶meter_lists); + + for (unsigned int i = 0; i < weight_grad_list.size(); ++i) { + weight_grad_list[i]->mutable_data(ctx.GetPlace()); + } + std::vector>> parameter_lists_grad; + parameter_lists_grad.resize(num_layers); + reset_parameter_vector( + weight_grad_list, num_layers, is_bidirec, ¶meter_lists_grad); + + // allocate the memory and initization the input_grad + input_grad->mutable_data(input->dims(), ctx.GetPlace()); + FillMLUTensorWithHostValue(ctx, static_cast(0.0), input_grad); + + Tensor a, b; + Tensor* dynamic_grad_pre_h = &a; + Tensor* dynamic_grad_pre_c = &b; + if (init_h_grad) { + init_h_grad->mutable_data(last_h_grad->dims(), ctx.GetPlace()); + FillMLUTensorWithHostValue(ctx, static_cast(0.0), init_h_grad); + } else { + dynamic_grad_pre_h->Resize(last_h_grad->dims()); + dynamic_grad_pre_h->mutable_data(ctx.GetPlace()); + FillMLUTensorWithHostValue(ctx, static_cast(0.0), dynamic_grad_pre_h); + init_h_grad = dynamic_grad_pre_h; + } + if (init_c_grad) { + init_c_grad->mutable_data(last_c_grad->dims(), ctx.GetPlace()); + } else { + dynamic_grad_pre_c->Resize(last_h_grad->dims()); + dynamic_grad_pre_c->mutable_data(ctx.GetPlace()); + init_c_grad = dynamic_grad_pre_c; + } + + std::vector seq_len_vec(batch_size, seq_len); + if (has_seq_length) { + seq_len_vec = 
operators::GetDataFromTensor(sequence_length); + } + cnnlDirectionMode_t direction = + is_bidirec ? CNNL_RNN_BIDIRECTIONAL : CNNL_RNN_UNIDIRECTIONAL; + + MLUSeqDataDesc input_seq_data_desc(CNNL_SEQDATA_TNC, + ToCnnlDataType(input->dtype()), + in_out_dim_num, + in_dim_arr, + static_cast(seq_len_vec.size()), + seq_len_vec.data(), + nullptr); + MLUSeqDataDesc out_seq_data_desc(CNNL_SEQDATA_TNC, + ToCnnlDataType(input->dtype()), + in_out_dim_num, + out_dim_arr, + static_cast(seq_len_vec.size()), + seq_len_vec.data(), + nullptr); + MLUCnnlTensorDesc hx_desc(*init_h); + MLUCnnlTensorDesc cx_desc(*init_c); + MLURNNDesc rnn_desc(CNNL_LSTM, + CNNL_RNN_DOUBLE_BIAS, + direction, + CNNL_RNN_LINEAR_INPUT, + ToCnnlDataType(input->dtype()), + ToCnnlDataType(input->dtype()), + input_dim, + hidden_size, + /*projection*/ proj_size, + num_layers, + nullptr, + CNNL_RNN_PADDED_IO_DISABLED); + rnn_desc.SetRNNMaskMode(CNNL_LSTM_MASK_ENABLED); + + // copy weight + size_t weightspace_size; + framework::Tensor weightspace, dweightspace; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize( + GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size)); + + weightspace = ctx.AllocateTmpTensor( + {static_cast(weightspace_size)}, dev_ctx); + dweightspace = ctx.AllocateTmpTensor( + {static_cast(weightspace_size)}, dev_ctx); + void* weightspace_ptr = weightspace.mutable_data(ctx.GetPlace()); + auto w_x = parameter_lists[0][0]; + auto w_h = parameter_lists[0][1]; + auto b_x = parameter_lists[0][2]; + auto b_h = parameter_lists[0][3]; + auto actual_total_w_size = + w_x.second + w_h.second + b_x.second + b_h.second; + + void* w_x_ptr = weightspace_ptr; + void* w_h_ptr = static_cast(weightspace_ptr) + w_x.second; + void* b_x_ptr = + static_cast(weightspace_ptr) + w_x.second + w_h.second; + void* b_h_ptr = static_cast(weightspace_ptr) + w_x.second + + w_h.second + b_x.second; + + memory::Copy(weightspace.place(), + w_x_ptr, + weightspace.place(), + w_x.first, + w_x.second, + stream); + memory::Copy(weightspace.place(), + w_h_ptr, + weightspace.place(), + w_h.first, + w_h.second, + stream); + memory::Copy(weightspace.place(), + b_x_ptr, + weightspace.place(), + b_x.first, + b_x.second, + stream); + memory::Copy(weightspace.place(), + b_h_ptr, + weightspace.place(), + b_h.first, + b_h.second, + stream); + + if (is_bidirec) { + auto bw_x = parameter_lists[0][4]; + auto bw_h = parameter_lists[0][5]; + auto bb_x = parameter_lists[0][6]; + auto bb_h = parameter_lists[0][7]; + void* bw_x_ptr = + static_cast(weightspace_ptr) + actual_total_w_size; + void* bw_h_ptr = static_cast(weightspace_ptr) + + actual_total_w_size + bw_x.second; + void* bb_x_ptr = static_cast(weightspace_ptr) + + actual_total_w_size + bw_x.second + bw_h.second; + void* bb_h_ptr = static_cast(weightspace_ptr) + + actual_total_w_size + bw_x.second + bw_h.second + + bb_x.second; + actual_total_w_size += + bw_x.second + bw_h.second + bb_x.second + bb_h.second; + + memory::Copy(weightspace.place(), + bw_x_ptr, + weightspace.place(), + bw_x.first, + bw_x.second, + stream); + memory::Copy(weightspace.place(), + bw_h_ptr, + weightspace.place(), + bw_h.first, + bw_h.second, + stream); + memory::Copy(weightspace.place(), + bb_x_ptr, + weightspace.place(), + bb_x.first, + bb_x.second, + stream); + memory::Copy(weightspace.place(), + bb_h_ptr, + weightspace.place(), + bb_h.first, + bb_h.second, + stream); + } + dev_ctx.Wait(); + + PADDLE_ENFORCE_EQ(weightspace_size, + actual_total_w_size, + platform::errors::InvalidArgument( + "The weightsize doesn't match" + " 
weightspace_size:%d, actual_total_w_size:%d", + weightspace_size, + actual_total_w_size)); + + MLUCnnl::RNNBackward(ctx, + rnn_desc.get(), + CNNL_WGRAD_MODE_SET, + seq_len_vec.data(), + GetBasePtr(&weightspace), + GetBasePtr(&dweightspace), + weightspace.numel() * sizeof(T), + input_seq_data_desc.get(), + GetBasePtr(input), + GetBasePtr(input_grad), + out_seq_data_desc.get(), + GetBasePtr(output), + GetBasePtr(output_grad), + hx_desc.get(), + GetBasePtr(init_h), + GetBasePtr(last_h_grad), + GetBasePtr(init_h_grad), + cx_desc.get(), + GetBasePtr(init_c), + GetBasePtr(last_c_grad), + GetBasePtr(init_c_grad), + const_cast(GetBasePtr(reserve_data)), + reserve_data->numel() * sizeof(T)); + + void* dweightspace_ptr = dweightspace.mutable_data(ctx.GetPlace()); + auto dw_x = parameter_lists_grad[0][0]; + auto dw_h = parameter_lists_grad[0][1]; + auto db_x = parameter_lists_grad[0][2]; + auto db_h = parameter_lists_grad[0][3]; + auto dactual_total_w_size = + dw_x.second + dw_h.second + db_x.second + db_h.second; + + void* dw_x_ptr = dweightspace_ptr; + void* dw_h_ptr = static_cast(dweightspace_ptr) + dw_x.second; + void* db_x_ptr = + static_cast(dweightspace_ptr) + dw_x.second + dw_h.second; + void* db_h_ptr = static_cast(dweightspace_ptr) + dw_x.second + + dw_h.second + db_x.second; + + memory::Copy(weightspace.place(), + dw_x.first, + weightspace.place(), + dw_x_ptr, + dw_x.second, + stream); + memory::Copy(weightspace.place(), + dw_h.first, + weightspace.place(), + dw_h_ptr, + dw_h.second, + stream); + memory::Copy(weightspace.place(), + db_x.first, + weightspace.place(), + db_x_ptr, + db_x.second, + stream); + memory::Copy(weightspace.place(), + db_h.first, + weightspace.place(), + db_h_ptr, + db_h.second, + stream); + + if (is_bidirec) { + auto dbw_x = parameter_lists_grad[0][4]; + auto dbw_h = parameter_lists_grad[0][5]; + auto dbb_x = parameter_lists_grad[0][6]; + auto dbb_h = parameter_lists_grad[0][7]; + void* dbw_x_ptr = + static_cast(dweightspace_ptr) + dactual_total_w_size; + void* dbw_h_ptr = static_cast(dweightspace_ptr) + + dactual_total_w_size + dbw_x.second; + void* dbb_x_ptr = static_cast(dweightspace_ptr) + + dactual_total_w_size + dbw_x.second + dbw_h.second; + void* dbb_h_ptr = static_cast(dweightspace_ptr) + + dactual_total_w_size + dbw_x.second + dbw_h.second + + dbb_x.second; + dactual_total_w_size += + dbw_x.second + dbw_h.second + dbb_x.second + dbb_h.second; + + memory::Copy(weightspace.place(), + dbw_x.first, + weightspace.place(), + dbw_x_ptr, + dbw_x.second, + stream); + memory::Copy(weightspace.place(), + dbw_h.first, + weightspace.place(), + dbw_h_ptr, + dbw_h.second, + stream); + memory::Copy(weightspace.place(), + dbb_x.first, + weightspace.place(), + dbb_x_ptr, + dbb_x.second, + stream); + memory::Copy(weightspace.place(), + dbb_h.first, + weightspace.place(), + dbb_h_ptr, + dbb_h.second, + stream); + } + dev_ctx.Wait(); + + PADDLE_ENFORCE_EQ(weightspace_size, + dactual_total_w_size, + platform::errors::InvalidArgument( + "The weightsize doesn't match" + " weightspace_size:%d, dactual_total_w_size:%d", + weightspace_size, + dactual_total_w_size)); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_MLU_KERNEL( rnn, ops::RNNMLUKernel); +REGISTER_OP_MLU_KERNEL( + rnn_grad, ops::RNNMLUGradKernel); diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 1cf72e320ffad..fc39d174c90ae 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc 
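
The RNN MLU gradient kernel above packs each layer's weight and bias blocks (w_x, w_h, b_x, b_h, plus the backward-direction blocks when is_bidirec is true) into one contiguous weight space at accumulating byte offsets, checks that the packed size matches the queried weight-space size, and later unpacks the gradients with the same offset arithmetic. The following is a minimal, standalone sketch of that packing step only; PackBlocks and the toy buffers are illustrative names and not part of the Paddle or CNNL API.

#include <cassert>
#include <cstddef>
#include <cstring>
#include <utility>
#include <vector>

// Pack (pointer, byte-size) blocks back to back into one buffer and return
// the total number of bytes written, mirroring how w_x/w_h/b_x/b_h are
// copied into the weight space at accumulating offsets.
std::size_t PackBlocks(
    const std::vector<std::pair<const void*, std::size_t>>& blocks,
    std::vector<unsigned char>* space) {
  std::size_t offset = 0;
  for (const auto& blk : blocks) {
    std::memcpy(space->data() + offset, blk.first, blk.second);
    offset += blk.second;
  }
  return offset;
}

int main() {
  std::vector<float> w_x(8, 1.f), w_h(4, 2.f), b_x(2, 3.f), b_h(2, 4.f);
  std::vector<std::pair<const void*, std::size_t>> blocks = {
      {w_x.data(), w_x.size() * sizeof(float)},
      {w_h.data(), w_h.size() * sizeof(float)},
      {b_x.data(), b_x.size() * sizeof(float)},
      {b_h.data(), b_h.size() * sizeof(float)}};
  std::size_t total = 0;
  for (const auto& blk : blocks) total += blk.second;
  std::vector<unsigned char> space(total);
  // The packed size must equal the queried weight-space size, which is what
  // the PADDLE_ENFORCE_EQ on weightspace_size verifies in the kernel above.
  assert(PackBlocks(blocks, &space) == total);
  return 0;
}
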
@@ -140,8 +140,7 @@ the design document }; template -class RowConvKernel - : public framework::OpKernel { +class RowConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -216,8 +215,7 @@ class RowConvKernel }; template -class RowConvGradKernel - : public framework::OpKernel { +class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -353,8 +351,6 @@ REGISTER_OPERATOR(row_conv, ops::RowConvGradOpMaker, ops::RowConvGradOpMaker); REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp); -REGISTER_OP_CPU_KERNEL( - row_conv, ops::RowConvKernel); -REGISTER_OP_CPU_KERNEL( - row_conv_grad, - ops::RowConvGradKernel); +REGISTER_OP_CPU_KERNEL(row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL(row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 99ad2328b77cd..fd400d2913670 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -232,9 +232,7 @@ REGISTER_OPERATOR(run_program, REGISTER_OPERATOR(run_program_grad, ops::RunProgramGradOp); /* see [Why use single type kernel] */ -REGISTER_OP_CPU_KERNEL( - run_program, - ops::RunProgramOpKernel) -REGISTER_OP_CPU_KERNEL( - run_program_grad, - ops::RunProgramGradOpKernel) +REGISTER_OP_CPU_KERNEL(run_program, + ops::RunProgramOpKernel) +REGISTER_OP_CPU_KERNEL(run_program_grad, + ops::RunProgramGradOpKernel) diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index d4e862f26cd6d..d6affde0ce022 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -244,8 +244,7 @@ class SampleLogitsKernel : public framework::OpKernel { context.Attr("remove_accidental_hits"); // device contexts - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); // UNDERSTAND: allocate memories for temporaries sampled_logits->mutable_data(samples_dim, context.GetPlace()); @@ -278,8 +277,7 @@ class SampleLogitsKernel : public framework::OpKernel { probabilities->mutable_data(samples_dim, context.GetPlace()); // UNDERSTAND: sampling const auto seed = context.Attr("seed"); - auto sampler_with_prob = - math::SampleWithProb(); + auto sampler_with_prob = math::SampleWithProb(); sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed), num_samples, @@ -315,9 +313,8 @@ class SampleLogitsGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("SampledLogits")); logits_grad->mutable_data(context.GetPlace()); - auto& dev_ctx = - context.template device_context(); - phi::funcs::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 385b092c4bcc0..6b5c2367bb9ad 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -104,9 +104,8 @@ REGISTER_OPERATOR(save_combine, REGISTER_OP_CPU_KERNEL( save_combine, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel); + ops::SaveCombineOpKernel, + 
ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 0ff381bdbab3f..f269c4aa32dea 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -90,14 +90,12 @@ REGISTER_OPERATOR(save, REGISTER_OP_CPU_KERNEL( save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc index 7acaad8ddaad6..363c3e98a6dfc 100644 --- a/paddle/fluid/operators/scale_op_mlu.cc +++ b/paddle/fluid/operators/scale_op_mlu.cc @@ -21,7 +21,7 @@ namespace operators { template class ScaleMLUKernel : public framework::OpKernel { public: - virtual void Compute(const framework::ExecutionContext& ctx) const { + void Compute(const framework::ExecutionContext& ctx) const { auto& dev_ctx = GetDevCtxFromCTX(ctx); auto* in_var = ctx.InputVar("X"); auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 93f2d60e5f232..1249e3e807ec7 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -42,7 +42,7 @@ TEST(scatter, ScatterUpdate) { } auto* cpu_place = new paddle::platform::CPUPlace(); - paddle::platform::CPUDeviceContext ctx(*cpu_place); + phi::CPUContext ctx(*cpu_place); phi::funcs::ScatterAssign(ctx, src, index, &output); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index 32aa7442f5199..07cd48604b8aa 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -63,7 +63,7 @@ void call_gemm(const framework::ExecutionContext& ctx, T* C) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); } diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 1364d4c1d2ae9..527884ec9c9b6 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -72,8 +72,7 @@ REGISTER_OPERATOR( ops::SeedOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - seed, ops::CPUSeedKernel); +REGISTER_OP_CPU_KERNEL(seed, ops::CPUSeedKernel); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(seed).AddCheckpoint( diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 449bd694ceb46..9b1d7a27e58e4 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -32,8 +32,8 @@ class GPUSeedKernel : public framework::OpKernel { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(platform::CPUPlace()); out->mutable_data(platform::CPUPlace()); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), + phi::funcs::SetConstant functor; + functor(reinterpret_cast(dev_ctx), out, static_cast(seed)); } else { diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index 9b7bd3fd6c6ab..117fc4ebe0c36 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -141,19 +141,17 @@ REGISTER_OPERATOR(sequence_concat, op::SeqConcatOpMaker, op::SeqConcatGradOpMaker, op::SeqConcatGradOpMaker); -REGISTER_OP_CPU_KERNEL( - sequence_concat, - op::SeqConcatKernel, - op::SeqConcatKernel, - op::SeqConcatKernel, - op::SeqConcatKernel); +REGISTER_OP_CPU_KERNEL(sequence_concat, + op::SeqConcatKernel, + op::SeqConcatKernel, + op::SeqConcatKernel, + op::SeqConcatKernel); REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp, op::SeqConcatGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_concat_grad, - op::SeqConcatGradKernel, - op::SeqConcatGradKernel, - op::SeqConcatGradKernel, - op::SeqConcatGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_concat_grad, + op::SeqConcatGradKernel, + op::SeqConcatGradKernel, + op::SeqConcatGradKernel, + op::SeqConcatGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index 8d9302fa43b7a..4943e0e2ea09b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -17,7 +17,6 @@ #include #include -#include "boost/optional.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index dced2038eb680..f1350ce334b41 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -268,11 +268,9 @@ REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp, ops::SequenceConvGradNoNeedBufferVarsInference); -REGISTER_OP_CPU_KERNEL( - sequence_conv, - ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_CPU_KERNEL( - sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); 
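
Most hunks in this range follow one mechanical pattern: kernels, functors (SetConstant, GetBlas, SampleWithProb), and CPU kernel registrations that were instantiated on the removed fluid CPU device-context type are re-instantiated on phi::CPUContext, as the scatter_test change shows explicitly. The toy sketch below only illustrates why this is a pure type substitution for code templated on the context type; DummyCPUContext and FillConstant are made-up names, not Paddle types.

#include <vector>

// Stand-in for a CPU device context; only its type identity matters for
// template instantiation, just as in the kernel registrations above.
struct DummyCPUContext {};

// A functor templated on the context type, in the style of
// phi::funcs::SetConstant<Context, T>.
template <typename Context, typename T>
struct FillConstant {
  void operator()(const Context& /*ctx*/, std::vector<T>* data, T value) const {
    for (auto& v : *data) v = value;
  }
};

int main() {
  DummyCPUContext ctx;
  std::vector<float> buf(4);
  // Migrating to a new context type only changes the template argument at
  // instantiation and registration sites; the functor body is untouched.
  FillConstant<DummyCPUContext, float>()(ctx, &buf, 1.0f);
  return 0;
}
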
+REGISTER_OP_CPU_KERNEL(sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CPU_KERNEL(sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index a06ed8b02d110..de55f1ab52a35 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -88,7 +88,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp, ops::SequenceEnumerateOpMaker); -REGISTER_OP_CPU_KERNEL( - sequence_enumerate, - ops::SequenceEnumerateKernel, - ops::SequenceEnumerateKernel); +REGISTER_OP_CPU_KERNEL(sequence_enumerate, + ops::SequenceEnumerateKernel, + ops::SequenceEnumerateKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc index e763635b7f419..c64b568e533d0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc @@ -97,7 +97,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp, ops::SequenceEraseOpMaker); -REGISTER_OP_CPU_KERNEL( - sequence_erase, - ops::SequenceEraseKernel, - ops::SequenceEraseKernel); +REGISTER_OP_CPU_KERNEL(sequence_erase, + ops::SequenceEraseKernel, + ops::SequenceEraseKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc index 4135f046c21e2..5c3731fc90253 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc @@ -208,16 +208,14 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad, ops::SequenceExpandAsGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_expand_as, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel); +REGISTER_OP_CPU_KERNEL(sequence_expand_as, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel); REGISTER_OP_CPU_KERNEL( sequence_expand_as_grad, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel); + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h index da9ad3574db2f..02d2b87874d05 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h @@ -43,9 +43,9 @@ struct SequenceExpandAsGradFunctor { }; template -struct SequenceExpandAsFunctor { +struct SequenceExpandAsFunctor { void operator()( - const platform::CPUDeviceContext &context, + const phi::CPUContext &context, const framework::LoDTensor &x, const framework::Vector &ref_lod, /*expand referenced lod*/ framework::LoDTensor *out) { @@ -121,9 +121,9 @@ class SequenceExpandAsKernel : public framework::OpKernel { * * */ template -struct SequenceExpandAsGradFunctor { +struct SequenceExpandAsGradFunctor { void operator()( - const 
platform::CPUDeviceContext &context, + const phi::CPUContext &context, const framework::LoDTensor &dout, const framework::Vector &ref_lod, /*expand referenced lod*/ framework::LoDTensor *dx) { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index e9e7912fe5036..a2fb088975e39 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -281,15 +281,13 @@ REGISTER_OPERATOR(sequence_expand, REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad, ops::SequenceExpandGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_expand, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel); -REGISTER_OP_CPU_KERNEL( - sequence_expand_grad, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_expand, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel); +REGISTER_OP_CPU_KERNEL(sequence_expand_grad, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h index b6cd8c3b9079a..158aa0e4fe190 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h @@ -49,9 +49,9 @@ struct SequenceExpandGradFunctor { }; template -struct SequenceExpandFunctor { +struct SequenceExpandFunctor { void operator()( - const platform::CPUDeviceContext& context, + const phi::CPUContext& context, const LoDTensor& x, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ @@ -161,9 +161,9 @@ class SequenceExpandKernel : public framework::OpKernel { * * */ template -struct SequenceExpandGradFunctor { +struct SequenceExpandGradFunctor { void operator()( - const platform::CPUDeviceContext& context, + const phi::CPUContext& context, const LoDTensor& dout, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ @@ -181,7 +181,7 @@ struct SequenceExpandGradFunctor { int dout_end = dout_offset + repeat_num * x_seq_len; auto dout_sub = dout.Slice(dout_offset, dout_end); dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); - phi::funcs::ColwiseSum col_sum; + phi::funcs::ColwiseSum col_sum; col_sum(context, dout_sub, &dx_sub); dout_offset += repeat_num * x_seq_len; } diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index 6b20338f95eb7..2ed9c44f5928c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -103,11 +103,7 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL( sequence_mask, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel); + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc 
b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index dc04a6cce7abd..ad4876970c532 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -282,15 +282,13 @@ REGISTER_OPERATOR(sequence_pad, REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp, ops::SequencePadGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_pad, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel); -REGISTER_OP_CPU_KERNEL( - sequence_pad_grad, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 327fdfda5e28f..6c146a699af8b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -193,12 +193,10 @@ REGISTER_OPERATOR(sequence_pool, REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp, ops::SequencePoolGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_pool, - ops::SequencePoolKernel, - ops::SequencePoolKernel); - -REGISTER_OP_CPU_KERNEL( - sequence_pool_grad, - ops::SequencePoolGradKernel, - ops::SequencePoolGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_pool, + ops::SequencePoolKernel, + ops::SequencePoolKernel); + +REGISTER_OP_CPU_KERNEL(sequence_pool_grad, + ops::SequencePoolGradKernel, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc index 5266650f2279d..6925267f1a981 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc @@ -144,15 +144,13 @@ REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeGradOpMaker, ops::SequenceReshapeGradOpMaker); REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp); -REGISTER_OP_CPU_KERNEL( - sequence_reshape, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel); -REGISTER_OP_CPU_KERNEL( - sequence_reshape_grad, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); +REGISTER_OP_CPU_KERNEL(sequence_reshape_grad, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc index f17c2baca9896..d1e8409653a5e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc @@ -22,10 +22,9 @@ REGISTER_OPERATOR(sequence_reverse, ops::SequenceReverseGradOpMaker, ops::SequenceReverseGradOpMaker); 
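
The SequenceExpandGradFunctor specialization touched above (in sequence_expand_op.h) computes, for each input sub-sequence repeated repeat_num times in the forward pass, the column-wise sum of the corresponding rows of dout via phi::funcs::ColwiseSum. A small standalone illustration of that reduction follows; the flat std::vector layout is only a stand-in for the LoDTensor slices used by the real functor.

#include <cassert>
#include <cstddef>
#include <vector>

// Sum repeat_num rows of width `width` column by column, which is the
// gradient of repeating one row repeat_num times in the forward pass.
std::vector<float> ColwiseSum(const std::vector<float>& dout,
                              std::size_t repeat_num, std::size_t width) {
  std::vector<float> dx(width, 0.f);
  for (std::size_t r = 0; r < repeat_num; ++r) {
    for (std::size_t c = 0; c < width; ++c) {
      dx[c] += dout[r * width + c];
    }
  }
  return dx;
}

int main() {
  // Two repeats of a row of width 3: the gradient is their element-wise sum.
  std::vector<float> dout = {1.f, 2.f, 3.f, 10.f, 20.f, 30.f};
  std::vector<float> dx = ColwiseSum(dout, /*repeat_num=*/2, /*width=*/3);
  assert(dx[0] == 11.f && dx[1] == 22.f && dx[2] == 33.f);
  return 0;
}
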
-REGISTER_OP_CPU_KERNEL( - sequence_reverse, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_reverse, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index a7578e25f93f4..9375cea85c78f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -159,16 +159,14 @@ REGISTER_OPERATOR(sequence_slice, REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp, ops::SequenceSliceGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_slice, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_slice, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel); REGISTER_OP_CPU_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 863d2e01d73e5..bb0ad26b51bb4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -185,11 +185,9 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp, ops::SequenceSoftmaxGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_softmax, - ops::SequenceSoftmaxKernel, - ops::SequenceSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel, - ops::SequenceSoftmaxGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_softmax, + ops::SequenceSoftmaxKernel, + ops::SequenceSoftmaxKernel); +REGISTER_OP_CPU_KERNEL(sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h index 3ebf955fe259a..0d3d3b695af4b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h @@ -41,8 +41,8 @@ struct SequenceSoftmaxGradFunctor { }; template -struct SequenceSoftmaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct SequenceSoftmaxFunctor { + void operator()(const phi::CPUContext &ctx, const LoDTensor &x, const framework::Vector &ref_lod, /*referenced lod*/ LoDTensor *out) { @@ -63,8 +63,8 @@ struct SequenceSoftmaxFunctor { }; template -struct SequenceSoftmaxGradFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct SequenceSoftmaxGradFunctor { + void operator()(const phi::CPUContext &ctx, const LoDTensor &dout, const LoDTensor &out, const framework::Vector &ref_lod, /*referenced lod*/ diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc index 
0c312cfb1cf83..b19dfe40ed95e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -139,9 +139,7 @@ REGISTER_OPERATOR(sequence_topk_avg_pooling_grad, ops::SequenceTopkAvgPoolingGradOp); REGISTER_OP_CPU_KERNEL( sequence_topk_avg_pooling, - ops::SequenceTopkAvgPoolingKernel); + ops::SequenceTopkAvgPoolingKernel); REGISTER_OP_CPU_KERNEL( sequence_topk_avg_pooling_grad, - ops::SequenceTopkAvgPoolingGradKernel); + ops::SequenceTopkAvgPoolingGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h index 04115c69a9a7d..1c1168e449eb7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -202,9 +202,8 @@ class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { auto pos_data = pos_input->data(); auto dout_data = d_out->data(); - auto& dev_ctx = - context.template device_context(); - phi::funcs::SetConstant zero; + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant zero; zero(dev_ctx, d_in, static_cast(0.0)); auto din_data = d_in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 4b90d64d26fe3..613dc8bfbc9b1 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -194,16 +194,14 @@ REGISTER_OPERATOR(sequence_unpad, REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp, ops::SequenceUnpadGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_unpad, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); REGISTER_OP_CPU_KERNEL( sequence_unpad_grad, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel); + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index da2cf4c0dbe14..074642e1b0241 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -31,9 +31,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 74896d8499672..4a3668b114059 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -137,13 +137,11 @@ REGISTER_OPERATOR(shuffle_channel, REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); -REGISTER_OP_CPU_KERNEL( - shuffle_channel, - ops::ShuffleChannelOpKernel, - ops::ShuffleChannelOpKernel); +REGISTER_OP_CPU_KERNEL(shuffle_channel, + ops::ShuffleChannelOpKernel, + ops::ShuffleChannelOpKernel); REGISTER_OP_CPU_KERNEL( shuffle_channel_grad, - ops::ShuffleChannelGradOpKernel, - ops::ShuffleChannelGradOpKernel); + 
ops::ShuffleChannelGradOpKernel, + ops::ShuffleChannelGradOpKernel); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 4b6bcae7635b8..4e81226188304 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -466,31 +466,25 @@ REGISTER_OPERATOR(slice_grad, REGISTER_OP_CPU_KERNEL( slice, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel>, - ops::SliceKernel>, - ops::SliceKernel); + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel>, + ops::SliceKernel>, + ops::SliceKernel); REGISTER_OP_CPU_KERNEL( slice_grad, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel>, - ops::SliceGradKernel>, - ops::SliceGradKernel); + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel>, + ops::SliceGradKernel>, + ops::SliceGradKernel); REGISTER_OP_CUDA_KERNEL( slice, diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc index eb391fd3fb73c..f8bebe331d8be 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -225,9 +225,7 @@ REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossGradMaker, ops::SmoothL1LossGradMaker); REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp); -REGISTER_OP_CPU_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossKernel); -REGISTER_OP_CPU_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); +REGISTER_OP_CPU_KERNEL(smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CPU_KERNEL(smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/solve_op.cc b/paddle/fluid/operators/solve_op.cc index 77a45684aca0e..daa020e4a0d74 100644 --- a/paddle/fluid/operators/solve_op.cc +++ b/paddle/fluid/operators/solve_op.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/solve_op.h" - #include #include #include #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" namespace paddle { @@ -220,12 +220,3 @@ REGISTER_OPERATOR(solve, ops::SolveOpGradMaker); REGISTER_OPERATOR(solve_grad, ops::SolveGradOp); - -REGISTER_OP_CPU_KERNEL( - solve, - ops::SolveKernel, - ops::SolveKernel); -REGISTER_OP_CPU_KERNEL( - solve_grad, - ops::SolveGradKernel, - ops::SolveGradKernel); diff --git a/paddle/fluid/operators/solve_op.cu b/paddle/fluid/operators/solve_op.cu deleted file mode 100644 index a1e56fab5702b..0000000000000 --- a/paddle/fluid/operators/solve_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/solve_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(solve, - ops::SolveKernel, - ops::SolveKernel); - -REGISTER_OP_CUDA_KERNEL(solve_grad, - ops::SolveGradKernel, - ops::SolveGradKernel); diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h deleted file mode 100644 index b97b8d01ccd37..0000000000000 --- a/paddle/fluid/operators/solve_op.h +++ /dev/null @@ -1,722 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/matrix_solve.h" -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" -#include "paddle/fluid/operators/squeeze_op.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using framework::To32BitIndex; - -constexpr int kMULMKLDNNINT8 = 1; - -template -void ReduceSumForSolve(const Tensor* input, - Tensor* output, - const std::vector& reduce_dims, - bool keep_dim, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), - *input, - output, - kps::IdentityFunctor(), - reduce_dims, - stream); -#else - ReduceKernelFunctor( - input, output, reduce_dims, keep_dim, false, ctx) - .template apply(); -#endif -} - -// check the input other is vector_case or not -static inline bool is_vector_rhs(const Tensor& input, const Tensor& other) { - auto x_dim = input.dims(); - auto y_dim = other.dims(); - auto x_dim_size = x_dim.size(); - auto y_dim_size = y_dim.size(); - std::vector x_dims_vec = phi::vectorize(x_dim); - std::vector y_dims_vec = phi::vectorize(y_dim); - - std::vector::const_iterator f = x_dims_vec.begin(); - std::vector::const_iterator l = x_dims_vec.end() - 1; - std::vector x_dims_vec_cut(f, l); // input.shape[:-1] - - std::vector expected_batched_rhs_shape(x_dims_vec_cut); - bool vector_case = - y_dim_size == 1 || (x_dim_size - 1 == y_dim_size && - y_dims_vec == (expected_batched_rhs_shape)); - - return vector_case; -} - -// unsqueeze operation helper -static framework::DDim GetOutputShapeUnsqueeze( - const std::vector unsqz_dims, const framework::DDim& in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size 
= in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validity Check: rank range. - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output " - "tensor's rank should be less than 6.")); - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; - // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE( - cur, - 0, - platform::errors::InvalidArgument("The insert dimension value should " - "not be less than 0")); - PADDLE_ENFORCE_LE(cur, - cur_output_size, - platform::errors::InvalidArgument( - "The insert dimension value shoule not be larger " - "than the dimension size of input tensor")); - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - output_shape[cur] = 1; - // Add the output size. - cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return phi::make_ddim(output_shape); -} - -// operation like squeeze(-1) -static void to_squeeze(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out) { - auto x_dims = in.dims(); - std::vector sqz_dims = {-1}; - auto out_dims = GetOutputShape(sqz_dims, x_dims, true); - out->mutable_data(context.GetPlace(), in.type()); - framework::TensorCopy( - in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); -} - -// vector_case, need to operate like unsqueeze(-1) -static void to_unsqueeze(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out) { - auto x_dims = in.dims(); - std::vector unsqz_dims = {-1}; - framework::DDim out_dims = out->dims(); - out_dims = GetOutputShapeUnsqueeze(unsqz_dims, x_dims); - framework::TensorCopy( - in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); -} - -// Prepared for the broadcast operation -static std::vector get_broadcast_batch_portion( - std::vector x, std::vector y) { - size_t size_x = x.size(); - size_t size_y = y.size(); - size_t size = std::max(size_x, size_y); - std::vector batchPortion(size); - - ptrdiff_t i = (ptrdiff_t)size - 1; - for (; i >= 0; --i) { - ptrdiff_t offset = size - i - 1; - ptrdiff_t dim_x = size_x - offset - 1; - ptrdiff_t dim_y = size_y - offset - 1; - int64_t x_size = (dim_x >= 0) ? x[dim_x] : 1; - int64_t y_size = (dim_y >= 0) ? y[dim_y] : 1; - - PADDLE_ENFORCE_EQ( - (x_size == y_size || x_size == 1 || y_size == 1), - true, - platform::errors::PreconditionNotMet( - "The size of tensor x (%d) must match the size of tensor y " - "(%d) at non-singleton dimension %d.", - x_size, - y_size, - i)); - - batchPortion[i] = x_size != 1 ? x_size : y_size; - } - return batchPortion; -} - -// broadcast the batch dimensions of tensor x and tensor y. 
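
get_broadcast_batch_portion above applies the usual right-aligned broadcasting rule to the batch dimensions of X and Y before the solve: sizes are compared from the trailing end, a size of 1 broadcasts against any size, and anything else must match exactly. Below is a self-contained sketch of that rule without the PADDLE_ENFORCE error reporting; it is an illustration, not the deleted helper itself.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Right-aligned broadcast of two dimension vectors: each pair of sizes must
// be equal or one of them must be 1; the result takes the non-1 size.
std::vector<int64_t> BroadcastBatchPortion(const std::vector<int64_t>& x,
                                           const std::vector<int64_t>& y) {
  std::size_t n = std::max(x.size(), y.size());
  std::vector<int64_t> out(n, 1);
  for (std::size_t off = 0; off < n; ++off) {
    int64_t xs = off < x.size() ? x[x.size() - 1 - off] : 1;
    int64_t ys = off < y.size() ? y[y.size() - 1 - off] : 1;
    if (xs != ys && xs != 1 && ys != 1)
      throw std::invalid_argument("incompatible batch dimensions");
    out[n - 1 - off] = xs != 1 ? xs : ys;
  }
  return out;
}

int main() {
  // Batch dims {2, 1, 4} and {3, 4} broadcast to {2, 3, 4}.
  auto b = BroadcastBatchPortion({2, 1, 4}, {3, 4});
  assert((b == std::vector<int64_t>{2, 3, 4}));
  return 0;
}
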
-static inline std::tuple, std::vector> -get_broadcast_dims(const Tensor& x, const Tensor& y) { - std::vector x_dims_vec = phi::vectorize(x.dims()); - std::vector y_dims_vec = phi::vectorize(y.dims()); - - std::vector::const_iterator f1 = x_dims_vec.begin(); - std::vector::const_iterator l1 = x_dims_vec.end() - 2; - std::vector x_dims_vec_cut(f1, l1); - - std::vector::const_iterator f2 = y_dims_vec.begin(); - std::vector::const_iterator l2 = y_dims_vec.end() - 2; - std::vector y_dims_vec_cut(f2, l2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector x_expand_size({expand_batch_portion}); - x_expand_size.insert(x_expand_size.end(), - {x_dims_vec[static_cast(x_dims_vec.size()) - 2], - x_dims_vec[static_cast(x_dims_vec.size()) - 1]}); - - std::vector y_expand_size({expand_batch_portion}); - y_expand_size.insert(y_expand_size.end(), - {y_dims_vec[static_cast(y_dims_vec.size()) - 2], - y_dims_vec[static_cast(y_dims_vec.size()) - 1]}); - - return std::make_tuple(x_expand_size, y_expand_size); -} - -template -void expand_impl(const DeviceContext& context, - const Tensor& in, - Tensor* out, - const std::vector& expand_shape) { - auto vec_in_dims = phi::vectorize(in.dims()); - auto diff = expand_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE( - expand_shape[i], - 0, - platform::errors::InvalidArgument("The expanded size cannot be zero.")); - if (i < diff) { - PADDLE_ENFORCE_GT( - expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand operation.", - expand_shape[i])); - repeat_times[i] = expand_shape[i]; - } else if (expand_shape[i] > 0) { - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - expand_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand operation.", - vec_in_dims[i], - expand_shape[i])); - repeat_times[i] = 1; - } else { - repeat_times[i] = expand_shape[i]; - } - } else { - PADDLE_ENFORCE_EQ( - expand_shape[i], - -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_v2 op, " - "only -1 is supported, but the value received is %d.", - expand_shape[i])); - repeat_times[i] = 1; - } - } - - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out->Resize(out_dims); - out->mutable_data(context.GetPlace()); - auto x = EigenTensor::From(in, new_in_dims); - auto y = EigenTensor::From(*out, out_dims); - auto& place = *context.eigen_device(); - // use 32-bit index to speed up - bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); - if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( - place, To32BitIndex(y), To32BitIndex(x), bcast_dims); - } else { - EigenBroadcast, T, Rank>::Eval( - place, y, x, bcast_dims); - } -} - -template -void TensorExpand(const DeviceContext& context, - const Tensor& in, - Tensor* out, - const std::vector& expand_shape) { - // necessary check before expand operation - PADDLE_ENFORCE_GE(expand_shape.size(), - 
in.dims().size(), - platform::errors::InvalidArgument( - "The size of 'expand_shape' (%d) should >= the input " - "Tensor's rank (%d).", - expand_shape.size(), - in.dims().size())); - PADDLE_ENFORCE_LE(expand_shape.size(), - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The size of 'expand_shape' (%d) should be <= %d", - expand_shape.size(), - MAX_RANK_SUPPORTED)); - switch (expand_shape.size()) { - case 1: - expand_impl<1, T, DeviceContext>(context, in, out, expand_shape); - break; - case 2: - expand_impl<2, T, DeviceContext>(context, in, out, expand_shape); - break; - case 3: - expand_impl<3, T, DeviceContext>(context, in, out, expand_shape); - break; - case 4: - expand_impl<4, T, DeviceContext>(context, in, out, expand_shape); - break; - case 5: - expand_impl<5, T, DeviceContext>(context, in, out, expand_shape); - break; - case 6: - expand_impl<6, T, DeviceContext>(context, in, out, expand_shape); - break; - } -} - -template -static void linalg_solve(const framework::ExecutionContext& context, - const framework::Tensor* x, - const framework::Tensor* y, - framework::Tensor* out) { - out->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.template device_context(); - math::MatrixSolveFunctor mat_solve; - - // input y can be vector or matrix - // but need to be unsqueezed if y is a vector - bool is_vector = false; - is_vector = is_vector_rhs(*x, *y); - - Tensor tmp_y; - if (is_vector) { - tmp_y.mutable_data(context.GetPlace(), y->dtype()); - to_unsqueeze(context, *y, &tmp_y); - } else { - tmp_y.Resize(y->dims()); - tmp_y.mutable_data(context.GetPlace(), y->dtype()); - framework::TensorCopy( - *y, - context.GetPlace(), - context.template device_context(), - &tmp_y); - } - - Tensor tmp_x; - tmp_x.Resize(x->dims()); - tmp_x.mutable_data(context.GetPlace(), x->dtype()); - framework::TensorCopy( - *x, - context.GetPlace(), - context.template device_context(), - &tmp_x); - - std::vector x_broadcast_dims; - std::vector y_broadcast_dims; - std::tie(x_broadcast_dims, y_broadcast_dims) = - get_broadcast_dims(tmp_x, tmp_y); - - Tensor tmp_x_bc; - TensorExpand(dev_ctx, tmp_x, &tmp_x_bc, x_broadcast_dims); - - Tensor tmp_y_bc; - TensorExpand(dev_ctx, tmp_y, &tmp_y_bc, y_broadcast_dims); - - auto x_dim = x->dims(); - auto y_dim = y->dims(); - auto x_dim_size = x_dim.size(); - auto y_dim_size = y_dim.size(); - - if (is_vector) { // vector case - out->Resize(tmp_y_bc.dims()); // out.unsqueeze(-1) - mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); - - Tensor out_tmp; - out_tmp.Resize(out->dims()); - out_tmp = *out; - to_squeeze(context, out_tmp, out); // out.squeeze(-1) - } else { - PADDLE_ENFORCE_EQ( - x_dim[x_dim_size - 1], - y_dim[y_dim_size - 2], - platform::errors::InvalidArgument( - "Matrix X1 with dimension greater than 2 and any matrix Y1," - "the matrix X1's width must be equal with matrix Y1's " - "height. 
But received X's shape = [%s], X1's shape = [%s], X1's " - "width = %s; Y's shape = [%s], Y1's shape = [%s], Y1's height = " - "%s.", - x_dim, - x_dim, - x_dim[x_dim_size - 1], - y_dim, - y_dim, - y_dim[y_dim_size - 2])); - mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); - } -} - -// for TransposeNormal -static std::vector getNewAxis(const int b_rank) { - std::vector axis_1 = {0}; - std::vector axis_2 = {1, 0}; - std::vector axis_3 = {0, 2, 1}; - std::vector axis_4 = {0, 1, 3, 2}; - std::vector axis_5 = {0, 1, 2, 4, 3}; - std::vector axis_6 = {0, 1, 2, 3, 5, 4}; - std::vector axis_7 = {0, 1, 2, 3, 4, 6, 5}; - std::vector axis_8 = {0, 1, 2, 3, 4, 5, 7, 6}; - std::vector axis_9 = {0, 1, 2, 3, 4, 5, 6, 8, 7}; - switch (b_rank) { - case 1: - return axis_1; - break; - case 2: - return axis_2; - break; - case 3: - return axis_3; - break; - case 4: - return axis_4; - break; - case 5: - return axis_5; - break; - case 6: - return axis_6; - break; - case 7: - return axis_7; - break; - case 8: - return axis_8; - break; - default: - return axis_9; - } -} - -// for Resize -static std::vector getNewDimsVec(const DDim& b_dims) { - std::vector b_dims_vec = phi::vectorize(b_dims); - int size = b_dims_vec.size(); - if (size >= 2) { - // swap the last 2 elements in b_dims_vec - int64_t temp = b_dims_vec[size - 1]; - b_dims_vec[size - 1] = b_dims_vec[size - 2]; - b_dims_vec[size - 2] = temp; - return b_dims_vec; - } - PADDLE_ENFORCE_NE( - b_dims_vec.empty(), - true, - platform::errors::PreconditionNotMet( - "The size of tensor b must not be %d after getting new dims", 0)); - // if b_dims_vec.size() == 1, just retun original vec - return b_dims_vec; -} - -template -class SolveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - const auto* y = context.Input("Y"); - Tensor* out = context.Output("Out"); - linalg_solve(context, x, y, out); - } -}; - -template -class SolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - // reuse the linalg.solve forward output - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool is_vector = false; - is_vector = is_vector_rhs(*input, *y); - - Tensor tmp_y; - if (is_vector) { - tmp_y.mutable_data(ctx.GetPlace(), y->dtype()); - to_unsqueeze(ctx, *y, &tmp_y); - } else { - tmp_y.Resize(y->dims()); - tmp_y.mutable_data(ctx.GetPlace(), y->dtype()); - framework::TensorCopy( - *y, - ctx.GetPlace(), - ctx.template device_context(), - &tmp_y); - } - - Tensor tmp_x; - tmp_x.Resize(input->dims()); - tmp_x.mutable_data(ctx.GetPlace(), input->dtype()); - framework::TensorCopy( - *input, - ctx.GetPlace(), - ctx.template device_context(), - &tmp_x); - - std::vector x_broadcast_dims; - std::vector y_broadcast_dims; - std::tie(x_broadcast_dims, y_broadcast_dims) = - get_broadcast_dims(tmp_x, tmp_y); - - // tmp_dx - Tensor tmp_dx; - tmp_dx.Resize(phi::make_ddim(x_broadcast_dims)); - tmp_dx.mutable_data(ctx.GetPlace()); - - // tmp_dy - Tensor tmp_dy; - tmp_dy.Resize(phi::make_ddim(y_broadcast_dims)); - tmp_dy.mutable_data(ctx.GetPlace()); - - Tensor tmp_input(input->dtype()); - const auto& new_dims_vec = getNewDimsVec(input->dims()); - tmp_input.Resize(phi::make_ddim(new_dims_vec)); - 
tmp_input.mutable_data(ctx.GetPlace()); - phi::funcs::TransposeNormal trans; - std::vector new_axis = getNewAxis(input->dims().size()); - auto& dev_ctx = ctx.template device_context(); - trans(dev_ctx, *input, &tmp_input, new_axis); - - if (dy) { - dy->mutable_data(ctx.GetPlace()); - // reuse linalg_solve forward logics to get tmp_dy - linalg_solve(ctx, &tmp_input, dout, &tmp_dy); - } - - if (dx) { - dx->mutable_data(ctx.GetPlace()); - // to get dx - auto blas = phi::funcs::GetBlas(ctx); - if (input->dims().size() == 2 && y->dims().size() == 2) { - auto mat_dim_a1 = - phi::funcs::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(out->dims(), 0, true); - blas.MatMul(tmp_dy, mat_dim_a1, *out, mat_dim_b1, T(-1), &tmp_dx, T(0)); - } else if (is_vector_rhs(*input, *y)) { - Tensor tmp_dy_; - tmp_dy_.mutable_data(ctx.GetPlace(), y->dtype()); - to_unsqueeze(ctx, tmp_dy, &tmp_dy_); - - Tensor tmp_out_; - tmp_out_.mutable_data(ctx.GetPlace(), out->dtype()); - to_unsqueeze(ctx, *out, &tmp_out_); - - auto mat_dim_a1 = - phi::funcs::CreateMatrixDescriptor(tmp_dy_.dims(), 0, false); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(tmp_out_.dims(), 0, true); - blas.MatMul( - tmp_dy_, mat_dim_a1, tmp_out_, mat_dim_b1, T(-1), &tmp_dx, T(0)); - } else { - auto mat_dim_a1 = - phi::funcs::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(out->dims(), 0, true); - blas.MatMul(tmp_dy, mat_dim_a1, *out, mat_dim_b1, T(-1), &tmp_dx, T(0)); - } - } - - if (y->dims() != tmp_dy.dims()) { - Tensor dy_help; - dy_help.Resize(tmp_dy.dims()); - dy_help.mutable_data(ctx.GetPlace(), tmp_dy.dtype()); - framework::TensorCopy( - tmp_dy, - ctx.GetPlace(), - ctx.template device_context(), - &dy_help); - - // get dims - std::vector x_dims = vectorize(input->dims()); - std::vector y_dims = vectorize(y->dims()); - std::vector dout_dims = vectorize(dout->dims()); - - if (is_vector_rhs(*input, *y)) { - dout_dims.push_back(1); - } - - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - const std::vector dy_help_dims = vectorize(dy_help.dims()); - std::vector dy_broadcast_dims(ndim); - - std::fill(dy_broadcast_dims.data(), - dy_broadcast_dims.data() + ndim - y_ndim, - 1); - std::copy(y_dims.data(), - y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // reduce sum to get grad by ReduceSum - if (dy) { - if (dy_reduce_dims.empty()) { - *dy = std::move(dy_help); - } else { - bool keep_dim = true; - if (dy_help.dims().size() != dy->dims().size()) { - keep_dim = false; - } - ReduceSumForSolve( - &dy_help, dy, dy_reduce_dims, keep_dim, ctx); - } - dy->Resize(y->dims()); - } - } else { - framework::TensorCopy( - tmp_dy, - ctx.GetPlace(), - ctx.template device_context(), - dy); - } - - if (input->dims() != tmp_dx.dims()) { - Tensor dx_help; - dx_help.Resize(tmp_dx.dims()); - dx_help.mutable_data(ctx.GetPlace(), tmp_dx.dtype()); - framework::TensorCopy( - tmp_dx, - ctx.GetPlace(), - ctx.template device_context(), - &dx_help); - - // get dims - std::vector x_dims = vectorize(input->dims()); - std::vector y_dims = vectorize(y->dims()); - - int x_ndim = x_dims.size(); - int ndim = x_broadcast_dims.size(); - - const std::vector dx_help_dims = vectorize(dx_help.dims()); - std::vector dx_broadcast_dims(ndim); - - 
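// The std::fill/std::copy below left-pad X's original shape with 1s up to the
// broadcast rank (dx_broadcast_dims). Every batch axis where the broadcast
// gradient has size > 1 but X originally had size 1 is then collected in
// dx_reduce_dims and summed away, so dX is reduced back to X's shape --
// mirroring the dY handling above.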
std::fill(dx_broadcast_dims.data(), - dx_broadcast_dims.data() + ndim - x_ndim, - 1); - std::copy(x_dims.data(), - x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - - std::vector dx_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - } - // reduce sum to get grad by ReduceSum - if (dx) { - dx->mutable_data(ctx.GetPlace()); - if (dx_reduce_dims.empty()) { - *dx = std::move(dx_help); - } else { - bool keep_dim = true; - if (dx_help.dims().size() != dx->dims().size()) { - keep_dim = false; - } - ReduceSumForSolve( - &dx_help, dx, dx_reduce_dims, keep_dim, ctx); - } - dx->Resize(input->dims()); - } - } else { - framework::TensorCopy( - tmp_dx, - ctx.GetPlace(), - ctx.template device_context(), - dx); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index fb428594226a9..dce7539fe72b8 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -226,15 +226,13 @@ REGISTER_OPERATOR(space_to_depth, REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp, ops::SpaceToDepthGradOpNoBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - space_to_depth, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel); -REGISTER_OP_CPU_KERNEL( - space_to_depth_grad, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel); +REGISTER_OP_CPU_KERNEL(space_to_depth, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel); +REGISTER_OP_CPU_KERNEL(space_to_depth_grad, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel); diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h index d66cc503307e6..f69573e18927e 100644 --- a/paddle/fluid/operators/spectral_helper.h +++ b/paddle/fluid/operators/spectral_helper.h @@ -210,7 +210,7 @@ void exec_fft(const DeviceContext& ctx, transposed_input.Resize(transposed_input_shape); const auto place = ctx.GetPlace(); transposed_input.mutable_data(place); - TransCompute( + TransCompute( ndim, ctx, *x, &transposed_input, dim_permute); // make an collapsed input: collapse batch axes for input @@ -310,39 +310,39 @@ void exec_fft(const DeviceContext& ctx, for (int i = 0; i < ndim; i++) { reverse_dim_permute[dim_permute[i]] = i; } - TransCompute( + TransCompute( ndim, ctx, transposed_output, out, reverse_dim_permute); } template -struct FFTC2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, FFTNormMode normalization, bool forward) { - exec_fft( + exec_fft( ctx, x, out, axes, normalization, forward); } }; template -struct FFTR2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, FFTNormMode normalization, bool forward) { - exec_fft( + exec_fft( ctx, x, out, axes, normalization, forward); } }; template -struct FFTC2RFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, 
const Tensor* x, Tensor* out, const std::vector& axes, @@ -353,14 +353,14 @@ struct FFTC2RFunctor { Tensor temp; temp.mutable_data(x->dims(), ctx.GetPlace()); - FFTC2CFunctor c2c_functor; + FFTC2CFunctor c2c_functor; c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); const std::vector new_axes{axes.back()}; - exec_fft( + exec_fft( ctx, &temp, out, new_axes, normalization, forward); } else { - exec_fft( + exec_fft( ctx, x, out, axes, normalization, forward); } } @@ -383,8 +383,8 @@ T compute_factor(int64_t size, FFTNormMode normalization) { } template -struct FFTC2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, @@ -426,8 +426,8 @@ struct FFTC2CFunctor { }; template -struct FFTR2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, @@ -483,8 +483,8 @@ struct FFTR2CFunctor { }; template -struct FFTC2RFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 1af812c336b5a..a6addb2e6f46d 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -256,11 +256,9 @@ REGISTER_OPERATOR(spectral_norm, ops::SpectralNormGradOpMaker, ops::SpectralNormGradOpMaker); REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad); -REGISTER_OP_CPU_KERNEL( - spectral_norm, - ops::SpectralNormKernel, - ops::SpectralNormKernel); -REGISTER_OP_CPU_KERNEL( - spectral_norm_grad, - ops::SpectralNormGradKernel, - ops::SpectralNormGradKernel); +REGISTER_OP_CPU_KERNEL(spectral_norm, + ops::SpectralNormKernel, + ops::SpectralNormKernel); +REGISTER_OP_CPU_KERNEL(spectral_norm_grad, + ops::SpectralNormGradKernel, + ops::SpectralNormGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index 3f00333b98089..91e3880dff004 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -351,45 +351,39 @@ REGISTER_OPERATOR(fft_c2c, ops::FFTC2COpMaker, ops::FFTC2CGradOpMaker, ops::FFTC2CGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); +REGISTER_OP_CPU_KERNEL(fft_c2c, + ops::FFTC2CKernel, + ops::FFTC2CKernel); REGISTER_OPERATOR(fft_c2c_grad, ops::FFTC2CGradOp); -REGISTER_OP_CPU_KERNEL( - fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); +REGISTER_OP_CPU_KERNEL(fft_c2c_grad, + ops::FFTC2CGradKernel, + ops::FFTC2CGradKernel); REGISTER_OPERATOR(fft_r2c, ops::FFTR2COp, ops::FFTR2COpMaker, ops::FFTR2CGradOpMaker, ops::FFTR2CGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); +REGISTER_OP_CPU_KERNEL(fft_r2c, + ops::FFTR2CKernel, + ops::FFTR2CKernel); REGISTER_OPERATOR(fft_r2c_grad, ops::FFTR2CGradOp); -REGISTER_OP_CPU_KERNEL( - fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); +REGISTER_OP_CPU_KERNEL(fft_r2c_grad, + ops::FFTR2CGradKernel, + ops::FFTR2CGradKernel); REGISTER_OPERATOR(fft_c2r, ops::FFTC2ROp, ops::FFTC2ROpMaker, ops::FFTC2RGradOpMaker, ops::FFTC2RGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); 
+REGISTER_OP_CPU_KERNEL(fft_c2r, + ops::FFTC2RKernel, + ops::FFTC2RKernel); REGISTER_OPERATOR(fft_c2r_grad, ops::FFTC2RGradOp); -REGISTER_OP_CPU_KERNEL( - fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); +REGISTER_OP_CPU_KERNEL(fft_c2r_grad, + ops::FFTC2RGradKernel, + ops::FFTC2RGradKernel); diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc index 583b0b69a63cf..b1ca67f521816 100644 --- a/paddle/fluid/operators/spp_op.cc +++ b/paddle/fluid/operators/spp_op.cc @@ -109,11 +109,9 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(spp_grad, ops::SppOpGrad); -REGISTER_OP_CPU_KERNEL( - spp, - ops::SppKernel, - ops::SppKernel); -REGISTER_OP_CPU_KERNEL( - spp_grad, - ops::SppGradKernel, - ops::SppGradKernel); +REGISTER_OP_CPU_KERNEL(spp, + ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CPU_KERNEL(spp_grad, + ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc index 2b76a42706535..55d307cf087ec 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -221,10 +221,8 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp, ops::SquaredL2DistanceGradOpNoBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - squared_l2_distance, - ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance, + ops::SquaredL2DistanceKernel); REGISTER_OP_CPU_KERNEL( squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceGradKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc index 529c4262b0a08..f6792baa1f591 100644 --- a/paddle/fluid/operators/squared_l2_norm_op.cc +++ b/paddle/fluid/operators/squared_l2_norm_op.cc @@ -96,11 +96,9 @@ REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormGradOpMaker, ops::SquaredL2NormGradOpMaker); REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp); -REGISTER_OP_CPU_KERNEL( - squared_l2_norm, - ops::SquaredL2NormKernel, - ops::SquaredL2NormKernel); -REGISTER_OP_CPU_KERNEL( - squared_l2_norm_grad, - ops::SquaredL2NormGradKernel, - ops::SquaredL2NormGradKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_norm, + ops::SquaredL2NormKernel, + ops::SquaredL2NormKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_norm_grad, + ops::SquaredL2NormGradKernel, + ops::SquaredL2NormGradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 29c54d2699aff..f532a429b49e2 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -375,31 +375,25 @@ REGISTER_OPERATOR(squeeze2_grad, REGISTER_OP_CPU_KERNEL( squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>, - ops::SqueezeKernel); + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel>, + ops::SqueezeKernel>, + ops::SqueezeKernel); REGISTER_OP_CPU_KERNEL( squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - 
ops::SqueezeGradKernel>, - ops::SqueezeGradKernel); + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel); diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index fd064c255f560..d708abe3d0f8c 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -164,12 +164,10 @@ REGISTER_OPERATOR(stft, REGISTER_OPERATOR(stft_grad, ops::StftGradOp); -REGISTER_OP_CPU_KERNEL( - stft, - ops::StftKernel, - ops::StftKernel); - -REGISTER_OP_CPU_KERNEL( - stft_grad, - ops::StftGradKernel, - ops::StftGradKernel); +REGISTER_OP_CPU_KERNEL(stft, + ops::StftKernel, + ops::StftKernel); + +REGISTER_OP_CPU_KERNEL(stft_grad, + ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index 362be0d5da33d..e16df34542795 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -35,7 +35,7 @@ TEST(StridedMemcpy, CPUCrop) { framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({2, 1}); - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; StridedMemcpy(ctx, src + 1, src_stride, dst_dim, dst_stride, dst); ASSERT_EQ(1, dst[0]); @@ -57,7 +57,7 @@ TEST(StridedMemcpy, CPUConcat) { framework::DDim src_stride({2, 1}); framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({4, 1}); - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst); StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst + 2); diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 4c74b20bf9378..3539e2213a39d 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include #include -#include #include #include #include diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index fca510143d0de..ca851b8ee75b1 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -359,9 +359,8 @@ REGISTER_OPERATOR(sum, REGISTER_OP_CPU_KERNEL( sum, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/svd_op.cc b/paddle/fluid/operators/svd_op.cc index 4186068cd6e40..7ae85343e0472 100644 --- a/paddle/fluid/operators/svd_op.cc +++ b/paddle/fluid/operators/svd_op.cc @@ -172,7 +172,6 @@ REGISTER_OP_CPU_KERNEL(svd, ops::SvdCPUKernel, ops::SvdCPUKernel); -REGISTER_OP_CPU_KERNEL( - svd_grad, - ops::SvdGradKernel, - ops::SvdGradKernel); +REGISTER_OP_CPU_KERNEL(svd_grad, + ops::SvdGradKernel, + ops::SvdGradKernel); diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index 7b98dc21d07bb..b7d3b7d3e5ae0 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -40,10 +40,10 @@ class SvdCPUKernel : public framework::OpKernel { /*Create Tensors and output, set the dim ...*/ auto numel = x->numel(); - auto& orig_dev_ctx = - context.template device_context(); - auto& dev_ctx = static_cast::TYPE&>(orig_dev_ctx); + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); Tensor trans_x = ::phi::TransposeLast2Dim(dev_ctx, *x); auto* x_data = trans_x.data(); auto x_dims = x->dims(); diff --git a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc new file mode 100644 index 0000000000000..ce511a12bbfdb --- /dev/null +++ b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc @@ -0,0 +1,492 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
*/ + +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/platform/collective_helper.h" +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +#define GET_LAYOUT_OFFSET 2 +using Tensor = framework::Tensor; +static std::vector supported_input_layout = { + CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC}; + +template +class SyncBatchNormMLUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + float epsilon = ctx.Attr("epsilon"); + float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const bool trainable_stats = ctx.Attr("trainable_statistics"); + const std::string layout_str = ctx.Attr("data_layout"); + const DataLayout layout = framework::StringToDataLayout(layout_str); + + PADDLE_ENFORCE_EQ(use_global_stats, + false, + platform::errors::InvalidArgument( + "sync_batch_norm doesn't support " + "to set use_global_stats True. Please use batch_norm " + "in this case.")); + + const auto *x = ctx.Input("X"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + auto *y = ctx.Output("Y"); + + const auto &x_dims = x->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + platform::errors::InvalidArgument( + "The Input dim size should be larger than 1.")); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + platform::errors::InvalidArgument( + "The Input dim size should be less than 6.")); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + Tensor trans_x; + Tensor trans_y; + std::vector forward_perm; + std::vector backward_perm; + std::vector trans_shape; + const bool need_transpose = + ((layout == DataLayout::kNCHW && x_dims.size() != 2) || + x_dims.size() == 5); + if (need_transpose) { + SetMLUTransposePerm( + x_dims, layout, &forward_perm, &backward_perm, &trans_shape); + trans_x.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + trans_y.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + MLUCnnlTensorDesc desc_x(*x); + MLUCnnlTensorDesc desc_trans_x( + trans_shape.size(), trans_shape.data(), ToCnnlDataType(x->dtype())); + MLUCnnl::Transpose(ctx, + forward_perm, + x_dims.size(), + desc_x.get(), + GetBasePtr(x), + desc_trans_x.get(), + GetBasePtr(&trans_x)); + } else { + trans_x = *x; + trans_y = *y; + } + + MLUCnnlTensorDesc desc_trans( + trans_x, + supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET], + ToCnnlDataType()); + + bool test_mode = is_test && (!trainable_stats); + if (test_mode) { // inference + MLUCnnlTensorDesc desc_weight_bias_mean_var(*bias); + MLUCnnl::FusedBatchNorm(ctx, + false /*is_training*/, + desc_trans.get(), + GetBasePtr(&trans_x), + 
desc_weight_bias_mean_var.get(), + GetBasePtr(scale), + GetBasePtr(bias), + GetBasePtr(mean), + GetBasePtr(variance), + epsilon, + momentum, + desc_trans.get(), + GetBasePtr(&trans_y), + nullptr, + nullptr, + nullptr, + nullptr); + } else { // training + if (ctx.HasInput("MomentumTensor")) { + const auto *mom_tensor = ctx.Input("MomentumTensor"); + Tensor mom_cpu; + paddle::framework::TensorCopySync( + *mom_tensor, platform::CPUPlace(), &mom_cpu); + momentum = mom_cpu.data()[0]; + } + + Tensor local_mean, local_var; + local_mean.mutable_data(mean->dims(), ctx.GetPlace()); + local_var.mutable_data(variance->dims(), ctx.GetPlace()); + MLUCnnlTensorDesc desc_mean_var(*mean_out); + + // cacl local_mean and local_var + MLUCnnl::SyncBatchNormStats(ctx, + desc_trans.get(), + GetBasePtr(&trans_x), + epsilon, + desc_mean_var.get(), + GetBasePtr(&local_mean), + desc_mean_var.get(), + GetBasePtr(&local_var)); + + Tensor input_count; + input_count.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); + FillMLUTensorWithHostValue( + ctx, static_cast(x->numel() / C), &input_count); + + Tensor count_all; + Tensor mean_all(mean->dtype()); + Tensor invstd_all(variance->dtype()); + + auto &dev_ctx = + ctx.template device_context(); + auto stream = dev_ctx.stream(); + auto *comm = dev_ctx.cncl_comm(); + if (comm) { + auto *comm = paddle::platform::CNCLCommContext::Instance() + .Get(0, ctx.GetPlace()) + ->comm(); + int count; + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCommCount(&count, comm)); + count_all.mutable_data(phi::make_ddim({count}), ctx.GetPlace()); + cnclDataType_t dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(count_all.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&input_count), + GetBasePtr(&count_all), + 1, + dtype, + comm, + stream)); + + mean_all.mutable_data(phi::make_ddim({count, mean->numel()}), + ctx.GetPlace()); + invstd_all.mutable_data( + phi::make_ddim({count, variance->numel()}), ctx.GetPlace()); + + auto cncl_dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(mean_all.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&local_mean), + GetBasePtr(&mean_all), + local_mean.numel(), + cncl_dtype, + comm, + stream)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&local_var), + GetBasePtr(&invstd_all), + local_var.numel(), + cncl_dtype, + comm, + stream)); + + } else { + count_all = input_count; + mean_all.ShareDataWith(local_mean); + invstd_all.ShareDataWith(local_var); + mean_all.Resize(phi::make_ddim({1, local_mean.numel()})); + invstd_all.Resize(phi::make_ddim({1, local_var.numel()})); + } + + MLUCnnlTensorDesc desc_all_mean_invstd( + invstd_all, CNNL_LAYOUT_NC, ToCnnlDataType()); + MLUCnnlTensorDesc desc_moving_mean_var(*mean_out); + MLUCnnlTensorDesc desc_saved_mean_var(*saved_mean); + MLUCnnlTensorDesc desc_count_all(count_all); + + MLUCnnl::SyncBatchNormGatherStatsWithCounts(ctx, + momentum, + epsilon, + desc_all_mean_invstd.get(), + GetBasePtr(&mean_all), + desc_all_mean_invstd.get(), + GetBasePtr(&invstd_all), + desc_moving_mean_var.get(), + GetBasePtr(mean_out), + desc_moving_mean_var.get(), + GetBasePtr(variance_out), + desc_count_all.get(), + GetBasePtr(&count_all), + desc_saved_mean_var.get(), + GetBasePtr(saved_mean), + desc_saved_mean_var.get(), + GetBasePtr(saved_variance)); + + MLUCnnlTensorDesc desc_other_param(*saved_mean); + MLUCnnl::SyncBatchNormElemt(ctx, + desc_trans.get(), + GetBasePtr(&trans_x), + desc_other_param.get(), + GetBasePtr(saved_mean), + desc_other_param.get(), + 
GetBasePtr(saved_variance), + desc_other_param.get(), + GetBasePtr(scale), + desc_other_param.get(), + GetBasePtr(bias), + desc_trans.get(), + GetBasePtr(&trans_y)); + } + if (need_transpose) { + MLUCnnlTensorDesc desc_y(*y); + MLUCnnlTensorDesc desc_trans_y(trans_y); + MLUCnnl::Transpose(ctx, + backward_perm, + trans_y.dims().size(), + desc_trans_y.get(), + GetBasePtr(&trans_y), + desc_y.get(), + GetBasePtr(y)); + } + } +}; + +template +class SyncBatchNormMLUGradKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const std::string layout_str = ctx.Attr("data_layout"); + const DataLayout layout = framework::StringToDataLayout(layout_str); + + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_inv_var = ctx.Input("SavedVariance"); + + const Tensor *x; + if (ctx.HasInput("Y")) { + PADDLE_ENFORCE_EQ(true, + false, + platform::errors::InvalidArgument( + "sync_batch_norm_grad doesn't support input Y")); + } else { + x = ctx.Input("X"); + } + + const auto &x_dims = x->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + platform::errors::InvalidArgument( + "The Input X dim size should be larger than 1.")); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + platform::errors::InvalidArgument( + "The Input X dim size should be less than 6.")); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + PADDLE_ENFORCE_EQ(scale->dims()[0], + C, + platform::errors::InvalidArgument( + "Expected first dim for input parameter(scale) of " + "OP(sync_batch_norm) be (%d), but given (%d).", + C, + scale->dims()[0])); + + d_x->mutable_data(ctx.GetPlace()); + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + } + PADDLE_ENFORCE_EQ(scale->dims().size(), + 1UL, + platform::errors::InvalidArgument( + "Expected rank for input parameter(scale) of " + "OP(sync_batch_norm) be (1), but given (%d).", + scale->dims().size())); + + Tensor trans_x; + Tensor trans_dy; + Tensor trans_dx; + std::vector forward_perm; + std::vector backward_perm; + std::vector trans_shape; + const bool need_transpose = + ((layout == DataLayout::kNCHW && x_dims.size() != 2) || + x_dims.size() == 5); + if (need_transpose) { + SetMLUTransposePerm( + x_dims, layout, &forward_perm, &backward_perm, &trans_shape); + trans_x.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + trans_dy.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + trans_dx.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + MLUCnnlTensorDesc desc_x(*x); + MLUCnnlTensorDesc desc_trans_x( + trans_shape.size(), trans_shape.data(), ToCnnlDataType(x->dtype())); + MLUCnnl::Transpose(ctx, + forward_perm, + x_dims.size(), + desc_x.get(), + GetBasePtr(x), + desc_trans_x.get(), + GetBasePtr(&trans_x)); + MLUCnnl::Transpose(ctx, + forward_perm, + x_dims.size(), + desc_x.get(), + GetBasePtr(d_y), + desc_trans_x.get(), + GetBasePtr(&trans_dy)); + } else { + trans_x = *x; + trans_dy = *d_y; + trans_dx = *d_x; + } + MLUCnnlTensorDesc desc_trans( + trans_x, + supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET], + 
ToCnnlDataType()); + + Tensor sum_dy, sum_dy_xmu; + sum_dy.mutable_data(bias->dims(), ctx.GetPlace()); + sum_dy_xmu.mutable_data(bias->dims(), ctx.GetPlace()); + MLUCnnlTensorDesc desc_other_param(*bias); + + MLUCnnl::SyncBatchnormBackwardReduce( + ctx, + desc_trans.get(), + GetBasePtr(&trans_dy), + desc_trans.get(), + GetBasePtr(&trans_x), + desc_other_param.get(), + GetBasePtr(saved_mean), + desc_other_param.get(), + GetBasePtr(saved_inv_var), + d_scale ? desc_other_param.get() : nullptr, + d_scale ? GetBasePtr(d_scale) : nullptr, + d_bias ? desc_other_param.get() : nullptr, + d_bias ? GetBasePtr(d_bias) : nullptr, + desc_other_param.get(), + GetBasePtr(&sum_dy), + desc_other_param.get(), + GetBasePtr(&sum_dy_xmu), + true /*compute sum_dy, sum_dy_xmu*/, + d_scale ? true : false /*compute d_scale*/, + d_bias ? true : false /*compute d_bias*/); + + Tensor numel_count; + numel_count.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); + FillMLUTensorWithHostValue( + ctx, static_cast(x->numel() / C), &numel_count); + + auto &dev_ctx = + ctx.template device_context(); + auto stream = dev_ctx.stream(); + auto *comm = dev_ctx.cncl_comm(); + if (comm) { + auto *comm = paddle::platform::CNCLCommContext::Instance() + .Get(0, ctx.GetPlace()) + ->comm(); + cnclDataType_t dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(numel_count.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&numel_count), + GetBasePtr(&numel_count), + 1, + dtype, + cnclSum, + comm, + stream)); + + auto cncl_dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(sum_dy.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&sum_dy), + GetBasePtr(&sum_dy), + sum_dy.numel(), + cncl_dtype, + cnclSum, + comm, + stream)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&sum_dy_xmu), + GetBasePtr(&sum_dy_xmu), + sum_dy_xmu.numel(), + cncl_dtype, + cnclSum, + comm, + stream)); + } + + if (d_x) { + MLUCnnlTensorDesc desc_count(numel_count); + MLUCnnl::SyncBatchNormBackwardElemt(ctx, + desc_trans.get(), + GetBasePtr(&trans_dy), + desc_trans.get(), + GetBasePtr(&trans_x), + desc_other_param.get(), + GetBasePtr(saved_mean), + desc_other_param.get(), + GetBasePtr(saved_inv_var), + desc_other_param.get(), + GetBasePtr(scale), + desc_other_param.get(), + GetBasePtr(&sum_dy), + desc_other_param.get(), + GetBasePtr(&sum_dy_xmu), + desc_count.get(), + GetBasePtr(&numel_count), + desc_trans.get(), + GetBasePtr(&trans_dx)); + + if (need_transpose) { + MLUCnnlTensorDesc desc_dx(*d_x); + MLUCnnlTensorDesc desc_trans_dx(trans_dx); + MLUCnnl::Transpose(ctx, + backward_perm, + trans_dx.dims().size(), + desc_trans_dx.get(), + GetBasePtr(&trans_dx), + desc_dx.get(), + GetBasePtr(d_x)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(sync_batch_norm, + ops::SyncBatchNormMLUKernel, + ops::SyncBatchNormMLUKernel); + +REGISTER_OP_MLU_KERNEL(sync_batch_norm_grad, + ops::SyncBatchNormMLUGradKernel, + ops::SyncBatchNormMLUGradKernel); diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt index e0fed2804a9b7..0d731b14c6a97 100644 --- a/paddle/fluid/operators/tensorrt/CMakeLists.txt +++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt @@ -4,3 +4,9 @@ nv_test( test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, 
since the test will + # be built only in CI, so we assume the generator on Windows is Ninja. + copy_onnx(test_tensorrt_engine_op) +endif() diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index b1aa2b2c49ef6..1cd2683796acd 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -14,7 +14,12 @@ #pragma once +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_CUDA #include @@ -192,6 +197,7 @@ class TensorRTEngineOp : public framework::OperatorBase { std::map> min_input_shape_{}; std::map> max_input_shape_{}; std::map> opt_input_shape_{}; + phi::DataType model_precision_{phi::DataType::FLOAT32}; public: TensorRTEngineOp(const std::string &type, @@ -217,6 +223,7 @@ class TensorRTEngineOp : public framework::OperatorBase { if (use_static_engine_) { model_opt_cache_dir_ = Attr("model_opt_cache_dir"); } + model_precision_ = static_cast(Attr("model_precision")); if (HasAttr("dynamic_shape_names") && HasAttr("min_input_shape") && HasAttr("max_input_shape") && HasAttr("opt_input_shape")) { @@ -467,13 +474,6 @@ class TensorRTEngineOp : public framework::OperatorBase { auto stream = reinterpret_cast(dev_ctx).stream(); - PADDLE_ENFORCE_EQ( - runtime_input_names_.empty(), - false, - platform::errors::PreconditionNotMet( - "TensorRT engine needs at least one input, but no input is found. " - "Please check if you set the input correctly.")); - std::vector output_maps = Attr>("output_name_mapping"); @@ -562,6 +562,7 @@ class TensorRTEngineOp : public framework::OperatorBase { #endif } runtime_batch = t_shape[0]; + VLOG(1) << "trt input [" << x << "] dtype is " << t.dtype(); auto type = framework::TransToProtoVarType(t.dtype()); if (type == framework::proto::VarType::FP32) { buffers[bind_index] = static_cast(t.data()); @@ -626,6 +627,8 @@ class TensorRTEngineOp : public framework::OperatorBase { num_bindings)); auto trt_type = engine->engine()->getBindingDataType(bind_index); // get adr and set type + VLOG(1) << "trt output [" << y << "] dtype is " + << TRT2FluidDataType(trt_type); buffers[bind_index] = static_cast( fluid_t->mutable_data(dev_place, TRT2FluidDataType(trt_type))); output_index += 1; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 08a71ad713a1b..8e2b162babce9 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -25,6 +25,7 @@ limitations under the License.
*/ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" +#include "paddle/phi/common/data_type.h" USE_NO_KERNEL_OP(tensorrt_engine); namespace paddle { @@ -132,6 +133,8 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetAttr("min_input_shape", std::vector{1, 4, 1, 1}); engine_op_desc.SetAttr("max_input_shape", std::vector{2, 4, 1, 1}); engine_op_desc.SetAttr("opt_input_shape", std::vector{2, 4, 1, 1}); + engine_op_desc.SetAttr("model_precision", + static_cast(phi::DataType::FLOAT32)); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); @@ -150,7 +153,6 @@ void DynamicShapeTest(bool allow_build_at_runtime) { else CreateCUDATensor(&scope, "x", std::vector({2, 4, 1, 1})); CreateCUDATensor(&scope, "y", std::vector({4, 6})); - CreateCUDATensor(&scope, "z", std::vector({2, 6})); CreateCUDATensor(&scope, "y0", std::vector({6, 8})); CreateCUDATensor(&scope, "z0", std::vector({2, 8})); diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h index cd0842e4a47bf..6df883e83337f 100644 --- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ -103,10 +103,9 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, for_range(actual_functor); } else { #endif - auto &cpu_dev_ctx = dynamic_cast(dev_ctx); + auto &cpu_dev_ctx = dynamic_cast(dev_ctx); functor(cpu_dev_ctx, &x, out, &ddx, &ddout, dout, dx); - platform::ForRange for_range(cpu_dev_ctx, - limit); + platform::ForRange for_range(cpu_dev_ctx, limit); for_range(actual_functor); #if defined(__NVCC__) || defined(__HIPCC__) } diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index 1b90ad2c31384..0ae020c0dfd3c 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -22,7 +22,6 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h" -#include "paddle/phi/kernels/autotune/cache.h" namespace paddle { namespace operators { @@ -1155,50 +1154,31 @@ inline void SimplifyThenLaunch(const int rank, } template -size_t GetTransposeKey(const int rank, - const Tensor& in, - const std::vector& perm) { - auto in_shape = phi::vectorize(in.dims()); - return phi::autotune::GetKey( - in_shape, perm, rank, paddle::experimental::CppTypeToDataType::Type()); -} - -template -void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, - const int rank, +void TransposeGPUKernelDriver(const phi::GPUContext& ctx, const Tensor& in, const std::vector& perm, Tensor* out) { - PADDLE_ENFORCE_LT( - rank, - phi::DDim::kMaxRank, - platform::errors::OutOfRange( - "The maximum dimension rank of " - "tensor is expected to be less than %d, but here is %d.", - phi::DDim::kMaxRank, - rank)); - - auto ret = TransposeSimple::run(dev_ctx, in, perm, out); + const int rank = perm.size(); + auto ret = TransposeSimple::run(ctx, in, perm, out); if (!ret) { - auto* tuner = phi::autotune::MakeTransposeTuner( - SimplifyThenLaunch); - if (!tuner->IsInit()) { - tuner->AddCallBack( - phi::autotune::MakeCallback(TransCompute)); - tuner->Finalize(); - } - - auto key = GetTransposeKey(rank, in, perm); - auto& cache = phi::autotune::AutoTuneCache::Instance().GetTranspose(); - if (cache.Find(key)) { - auto index = cache.Get(key); - tuner->RunBestKernel(index, rank, dev_ctx, in, out, perm); - } else { - // All avaliable kernels have ran while picking the best kernel, so - // there may be no need for another RunBestKernel. - auto index = tuner->PickBestKernel(dev_ctx, rank, dev_ctx, in, out, perm); - cache.Set(key, index); - } + auto* tuner = + phi::autotune::MakeTransposeTuner(TransCompute); + tuner->AddCallBack( + phi::autotune::MakeCallback(SimplifyThenLaunch)); + + size_t key = phi::autotune::TransposeKey( + phi::vectorize(in.dims()), + perm, + paddle::experimental::CppTypeToDataType::Type()); + + tuner->Run(ctx, + phi::autotune::AlgorithmType::kTranspose, + key, + rank, + ctx, + in, + out, + perm); } } diff --git a/paddle/fluid/operators/tree_conv_op.cc b/paddle/fluid/operators/tree_conv_op.cc index f62e1d3072fa3..525dd17c39bb9 100644 --- a/paddle/fluid/operators/tree_conv_op.cc +++ b/paddle/fluid/operators/tree_conv_op.cc @@ -234,12 +234,10 @@ REGISTER_OPERATOR(tree_conv, REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp); -REGISTER_OP_CPU_KERNEL( - tree_conv, - ops::TreeConvKernel, - ops::TreeConvKernel); - -REGISTER_OP_CPU_KERNEL( - tree_conv_grad, - ops::TreeConvGradKernel, - ops::TreeConvGradKernel); +REGISTER_OP_CPU_KERNEL(tree_conv, + ops::TreeConvKernel, + ops::TreeConvKernel); + +REGISTER_OP_CPU_KERNEL(tree_conv_grad, + ops::TreeConvGradKernel, + ops::TreeConvGradKernel); diff --git a/paddle/fluid/operators/unique_consecutive_op.cc b/paddle/fluid/operators/unique_consecutive_op.cc index e9c6a4edb66c5..97cd31141da2b 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cc +++ b/paddle/fluid/operators/unique_consecutive_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/unique_consecutive_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,53 +25,6 @@ class UniqueConsecutiveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique_consecutive"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "unique_consecutive"); - - auto in_dims = ctx->GetInputDim("X"); - bool return_inverse = ctx->Attrs().Get("return_inverse"); - bool return_counts = ctx->Attrs().Get("return_counts"); - auto axis_vec = ctx->Attrs().Get>("axis"); - if (return_inverse) { - OP_INOUT_CHECK( - ctx->HasOutput("Index"), "Output", "Index", "unique_consecutive"); - } - if (return_counts) { - OP_INOUT_CHECK( - ctx->HasOutput("Counts"), "Output", "Counts", "unique_consecutive"); - } - - if (axis_vec.empty()) { - ctx->SetOutputDim("Out", {-1}); - if (return_inverse) { - ctx->SetOutputDim("Index", {phi::product(in_dims)}); - } - } else { - int axis = axis_vec[0]; - if (axis < 0) { - axis += in_dims.size(); - } - PADDLE_ENFORCE_LT( - axis, - in_dims.size(), - platform::errors::InvalidArgument("The axis(%d) should be less than " - "the dimension size(%d) of x.", - axis, - in_dims.size())); - auto out_dims = in_dims; - out_dims[axis] = -1; - ctx->SetOutputDim("Out", out_dims); - if (return_inverse) { - ctx->SetOutputDim("Index", {in_dims[axis]}); - } - } - if (return_counts) { - ctx->SetOutputDim("Counts", {-1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -115,15 +70,13 @@ class UniqueConsecutiveOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(unique_consecutive, + UniqueConsecutiveInferShapeFunctor, + PD_INFER_META(phi::UniqueConsecutiveInferMeta)); REGISTER_OP_WITHOUT_GRADIENT(unique_consecutive, ops::UniqueConsecutiveOp, - ops::UniqueConsecutiveOpMaker); -REGISTER_OP_CPU_KERNEL( - unique_consecutive, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel); + ops::UniqueConsecutiveOpMaker, + UniqueConsecutiveInferShapeFunctor); REGISTER_OP_VERSION(unique_consecutive) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/unique_consecutive_op.h b/paddle/fluid/operators/unique_consecutive_op.h deleted file mode 100644 index b0eadbd877de5..0000000000000 --- a/paddle/fluid/operators/unique_consecutive_op.h +++ /dev/null @@ -1,287 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/operators/unique_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -template -static void UniqueConsecutiveFlattendTensor( - const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, - bool return_inverse, - bool return_counts) { - const InT* in_data = in.data(); - std::vector out_vec(in.numel()); - std::vector inverse_vec(in.numel()); - std::vector counts_vec(in.numel()); - memcpy(out_vec.data(), in_data, in.numel() * sizeof(InT)); - InT* p = out_vec.data(); - int64_t last = 0; - IndexT* q = counts_vec.data(); - for (int64_t i = 0; i < in.numel(); i++) { - if (in_data[i] != *p) { - *(++p) = in_data[i]; - if (return_counts) { - *(q++) = i - last; - last = i; - } - } - if (return_inverse) { - inverse_vec[i] = p - out_vec.data(); - } - } - - int64_t output_size = p - out_vec.data() + 1; - if (return_counts) { - *q = in.numel() - last; - counts_vec.resize(output_size); - } - out_vec.resize(output_size); - - out->Resize(phi::make_ddim({output_size})); - auto* out_data = out->mutable_data(context.GetPlace()); - std::copy(out_vec.begin(), out_vec.end(), out_data); - - if (return_inverse) { - auto* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({in.numel()})); - auto* inverse_data = inverse->mutable_data(context.GetPlace()); - std::copy(inverse_vec.begin(), inverse_vec.end(), inverse_data); - } - - if (return_counts) { - auto* count = context.Output("Counts"); - count->Resize(phi::make_ddim({out->numel()})); - auto* counts_data = count->mutable_data(context.GetPlace()); - std::copy(counts_vec.begin(), counts_vec.end(), counts_data); - } -} - -template -static ForwardIt UniqueConsecutiveDimImpl( - const framework::ExecutionContext& context, - ForwardIt first, - ForwardIt last, - const std::vector& sorted_indices_vec, - std::vector* inverse_vec, - std::vector* counts_vec) { - if (first == last) { - return last; - } - - (*inverse_vec)[sorted_indices_vec[0]] = 0; - (*counts_vec)[0] = 1; - - ForwardIt begin = first; - ForwardIt result = first; - - while (++first != last) { - int64_t idx_first = std::distance(begin, first); - int64_t idx_result = std::distance(begin, result); - if (!Equal(*result, *first)) { - if (++result != first) { - *result = std::move(*first); - } - idx_result += 1; - } - (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; - (*counts_vec)[idx_result] += 1; - } - return ++result; -} - -template -static void UniqueConsecutiveDim(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, - bool return_inverse, - bool return_counts, - int axis) { - // transpose tensor: eg. 
axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] - std::vector permute(in.dims().size()); - std::iota(permute.begin(), permute.end(), 0); - permute[axis] = 0; - permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); - in_trans_dims_vec[axis] = in.dims()[0]; - in_trans_dims_vec[0] = in.dims()[axis]; - framework::Tensor in_trans; - framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); - in_trans.Resize(in_trans_dims); - in_trans.mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - TransCompute( - in.dims().size(), dev_ctx, in, &in_trans, permute); - // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); - in_trans.Resize(in_trans_flat_dims); - - std::vector sorted_indices_vec(in_trans.dims()[0]); - std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); - int64_t col = in_trans.dims()[1]; - const InT* in_trans_data = in_trans.data(); - - // sort tensor according to indices - framework::Tensor input_sorted; - input_sorted.Resize(in_trans_dims); - input_sorted.mutable_data(context.GetPlace()); - InT* input_sorted_data = input_sorted.data(); - for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { - memcpy(input_sorted_data + i * col, - in_trans_data + static_cast(sorted_indices_vec[i]) * col, - col * sizeof(InT)); - } - std::vector input_unbind = Unbind(input_sorted); - std::vector inverse_vec(sorted_indices_vec.size(), 0); - std::vector counts_vec(sorted_indices_vec.size(), 0); - auto last = - UniqueConsecutiveDimImpl::iterator, InT>( - context, - input_unbind.begin(), - input_unbind.end(), - sorted_indices_vec, - &inverse_vec, - &counts_vec); - input_unbind.erase(last, input_unbind.end()); - counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); - - math::ConcatFunctor concat_functor; - framework::Tensor out_trans; - std::vector out_trans_dims_vec = in_trans_dims_vec; - out_trans_dims_vec[0] = input_unbind.size(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - out_trans.mutable_data(context.GetPlace()); - std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); - out->mutable_data(context.GetPlace()); - concat_functor(dev_ctx, input_unbind, 0, &out_trans); - TransCompute( - out_trans.dims().size(), dev_ctx, out_trans, out, permute); - if (return_inverse) { - auto* inverse = context.Output("Index"); - framework::TensorFromVector(inverse_vec, context.device_context(), inverse); - } - if (return_counts) { - auto* count = context.Output("Counts"); - framework::TensorFromVector(counts_vec, context.device_context(), count); - } -} - -template -struct UniqueConsecutiveFlattendTensorFunctor { - const framework::ExecutionContext& ctx_; - const framework::Tensor& in_; - framework::Tensor* out_; - const bool return_inverse_; - const bool return_counts_; - - UniqueConsecutiveFlattendTensorFunctor( - const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, - bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueConsecutiveFlattendTensor( - ctx_, in_, out_, return_inverse_, return_counts_); - } -}; - -template -struct UniqueConsecutiveDimFunctor { - const framework::ExecutionContext& ctx_; - const framework::Tensor& in_; - framework::Tensor* out_; - const int 
axis_; - const bool return_inverse_; - const bool return_counts_; - UniqueConsecutiveDimFunctor(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, - const int axis, - bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - axis_(axis), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueConsecutiveDim( - ctx_, in_, out_, return_inverse_, return_counts_, axis_); - } -}; -template -class UniqueConsecutiveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto data_type = static_cast( - context.Attr("dtype")); - if (data_type == framework::proto::VarType::INT32) { - PADDLE_ENFORCE_LE( - x->numel(), - INT_MAX, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be less than or " - "equal to INT_MAX, but received num is %d. Please set `dtype` to " - "int64.", - x->numel())); - } - std::vector axis_vec = context.Attr>("axis"); - bool return_inverse = context.Attr("return_inverse"); - bool return_counts = context.Attr("return_counts"); - - if (axis_vec.empty()) { - framework::VisitDataTypeTiny( - data_type, - UniqueConsecutiveFlattendTensorFunctor( - context, *x, out, return_inverse, return_counts)); - } else { - int axis = axis_vec[0]; - framework::VisitDataTypeTiny( - data_type, - UniqueConsecutiveDimFunctor( - context, *x, out, axis, return_inverse, return_counts)); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 82e6b734aa009..47679ca57f5bf 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -328,14 +328,12 @@ REGISTER_OPERATOR(unpool, ops::UnpoolOpGradMaker); REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL( - unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CPU_KERNEL( - unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CPU_KERNEL(unpool, + ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL(unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); REGISTER_OPERATOR(unpool3d, ops::Unpool3dOp, @@ -344,11 +342,9 @@ REGISTER_OPERATOR(unpool3d, ops::Unpool3dOpGradMaker); REGISTER_OPERATOR(unpool3d_grad, ops::Unpool3dOpGrad); -REGISTER_OP_CPU_KERNEL( - unpool3d, - ops::Unpool3dKernel, - ops::Unpool3dKernel); -REGISTER_OP_CPU_KERNEL( - unpool3d_grad, - ops::Unpool3dGradKernel, - ops::Unpool3dGradKernel); +REGISTER_OP_CPU_KERNEL(unpool3d, + ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CPU_KERNEL(unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 18a1d5435e014..53de6440f1f61 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -378,33 +378,28 @@ REGISTER_OPERATOR(unsqueeze2_grad, REGISTER_OP_CPU_KERNEL( unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + 
ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel); REGISTER_OP_CPU_KERNEL( unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel>, + ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel); + ops::UnsqueezeGradKernel); diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index d46d2bd847341..53feefef3e1cc 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -321,7 +321,7 @@ class CPUVarConv2dOPKernel : public framework::OpKernel { auto* w_data = w->data(); auto* col_data = col->data(); - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); for (int b = 0; b < batch; ++b) { int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; if (top_im_size == 0) { @@ -479,7 +479,7 @@ class CPUVarConv2dOPGradKernel : public framework::OpKernel { int batch = x->lod()[0].size() - 1; const auto& top_offset = out->lod()[0]; const auto& col_offset = col->lod()[0]; - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); for (int b = 0; b < batch; ++b) { int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; if (top_im_size == 0) { @@ -526,11 +526,10 @@ REGISTER_OPERATOR(var_conv_2d, REGISTER_OPERATOR(var_conv_2d_grad, ops::VarConv2dOpGrad); REGISTER_OP_CPU_KERNEL(var_conv_2d, - ops::CPUVarConv2dOPKernel); -// ops::CPUVarConv2dOPKernel); +// ops::CPUVarConv2dOPKernel -REGISTER_OP_CPU_KERNEL( - var_conv_2d_grad, - ops::CPUVarConv2dOPGradKernel); -// ops::CPUVarConv2dOPGradKernel); +// ops::CPUVarConv2dOPGradKernel diff --git a/paddle/fluid/operators/where_index_op_mlu.cc b/paddle/fluid/operators/where_index_op_mlu.cc index d0699521aa46e..389f7960bcdc1 100644 --- a/paddle/fluid/operators/where_index_op_mlu.cc +++ b/paddle/fluid/operators/where_index_op_mlu.cc @@ -30,30 +30,36 @@ class MLUWhereIndexKernel : public framework::OpKernel { auto* out = context.Output("Out"); auto dims = condition->dims(); const int rank = dims.size(); - std::vector true_num = {0}; - std::vector vec_condition; - paddle::framework::TensorToVector( - *condition, context.device_context(), &vec_condition); - int vec_con_size = vec_condition.size(); - for (int i = 0; i < vec_con_size; ++i) { - if (vec_condition[i] > 0) true_num[0]++; - } - out->Resize(phi::make_ddim({true_num[0], rank})); + Tensor num_true; + num_true.mutable_data({1}, context.GetPlace()); + MLUCnnlTensorDesc con_desc(*condition); + MLUCnnlTensorDesc num_true_desc(num_true); + MLUCnnl::NumTrue(context, + con_desc.get(), + GetBasePtr(condition), + num_true_desc.get(), + GetBasePtr(&num_true)); + + Tensor local_true_num; + paddle::framework::TensorCopySync( + num_true, platform::CPUPlace(), &local_true_num); + auto true_num = *local_true_num.data(); + + out->Resize(phi::make_ddim({true_num, rank})); out->mutable_data(context.GetPlace()); + + if (true_num == 0) { + return; + } + auto& dev_ctx = context.template device_context(); 
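For reference, a minimal host-side sketch of the semantics this MLU kernel reproduces: where_index returns the coordinates of every true element of the condition tensor, so the output is resized to [num_true, rank] and all work can be skipped when num_true is zero. Function and variable names below are illustrative only; the real kernel performs the counting (MLUCnnl::NumTrue) and coordinate expansion (MLUCnnl::Where) on the device.

#include <cstdint>
#include <vector>

// Row-major reference: emit the multi-dimensional coordinate of every true
// element of `cond` (logical shape `dims`); the result is a flattened
// [num_true, rank] index matrix.
std::vector<int64_t> WhereIndexReference(const std::vector<bool>& cond,
                                         const std::vector<int64_t>& dims) {
  const int rank = static_cast<int>(dims.size());
  std::vector<int64_t> strides(rank, 1);
  for (int i = rank - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  std::vector<int64_t> out;
  for (int64_t linear = 0; linear < static_cast<int64_t>(cond.size());
       ++linear) {
    if (!cond[linear]) continue;
    int64_t rest = linear;
    for (int i = 0; i < rank; ++i) {  // unravel the linear index
      out.push_back(rest / strides[i]);
      rest %= strides[i];
    }
  }
  return out;
}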
framework::Tensor out_int32 = context.AllocateTmpTensor(out->dims(), dev_ctx); - Tensor num_true; - paddle::framework::TensorFromVector( - true_num, context.device_context(), &num_true); - num_true.mutable_data(context.GetPlace()); - bool as_tuple = false; - MLUCnnlTensorDesc con_desc(*condition); - MLUCnnlTensorDesc num_true_desc(num_true); MLUCnnlTensorDesc out_int32_desc(out_int32); MLUCnnlTensorDesc out_desc(*out); + bool as_tuple = false; MLUCnnl::Where(context, con_desc.get(), GetBasePtr(condition), diff --git a/paddle/fluid/operators/xpu_api_wrapper.h b/paddle/fluid/operators/xpu_api_wrapper.h index 8d51e53e8b394..c85a765f3b6fd 100644 --- a/paddle/fluid/operators/xpu_api_wrapper.h +++ b/paddle/fluid/operators/xpu_api_wrapper.h @@ -12,42 +12,206 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { +using float16 = typename XPUTypeTrait::Type; + +enum XPUFCCalcType { + FC_INT16 = 0, + FC_INT32, + FC_FLOAT, +}; + +template +XPUFCCalcType FCCalcType() { + if (std::is_same::value || + std::is_same::value) { + return XPUFCCalcType::FC_INT16; + } else if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { + return XPUFCCalcType::FC_INT32; + } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { + return XPUFCCalcType::FC_FLOAT; + } + return XPUFCCalcType::FC_INT16; +} + +struct XpuFcInfo { + int bs; + int m; + int n; + int k; + bool trans_x; + bool trans_y; + int stride_x; + int stride_y; + int stride_out; + float* max_x; + float* max_y; + float* max_out; + XpuFcInfo() + : bs(0), + m(0), + n(0), + k(0), + trans_x(false), + trans_y(false), + stride_x(0), + stride_y(0), + stride_out(0), + max_x(nullptr), + max_y(nullptr), + max_out(nullptr) {} + void InitFcInfo(int bs, + int m, + int n, + int k, + bool trans_x, + bool trans_y, + float* max_x, + float* max_y, + float* max_out) { + this->bs = bs; + this->m = m; + this->n = n; + this->k = k; + this->trans_x = trans_x; + this->trans_y = trans_y; + this->max_x = max_x; + this->max_y = max_y; + this->max_out = max_out; + + if (this->bs <= 1) { + this->stride_x = trans_x ? m : k; + this->stride_y = trans_y ? k : n; + this->stride_out = n; + } else { + this->stride_x = m * k; + this->stride_y = k * n; + this->stride_out = m * n; + } + } +}; + +static std::ostream& operator<<(std::ostream& os, const XpuFcInfo& fc_inf) { + os << "fc_inf[ bs, m, n, k, trans_x, trans_y, stride_x, stride_y, " + "stride_out] = " + << "[" << fc_inf.bs << ", " << fc_inf.m << ", " << fc_inf.n << ", " + << fc_inf.k << ", " << fc_inf.trans_x << ", " << fc_inf.trans_y << ", " + << fc_inf.stride_x << ", " << fc_inf.stride_y << ", " << fc_inf.stride_out; + return os; +} + +static void GetFCInfo(const phi::DDim& x_dims, + const phi::DDim& y_dims, + bool trans_x, + bool trans_y, + XpuFcInfo* info) { + framework::DDim new_x_dims = + (x_dims.size() > 1) ? x_dims : phi::make_ddim({1, x_dims[0]}); + framework::DDim new_y_dims = + (y_dims.size() > 1) ? 
y_dims : phi::make_ddim({y_dims[0], 1}); + + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(new_x_dims, 0, trans_x); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(new_y_dims, 0, trans_y); + + if (x_dims.size() >= 3 && y_dims.size() <= 2) { + if (!trans_x) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } else { + mat_dim_b.batch_size_ = mat_dim_a.batch_size_; + mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; + } + } + + if (y_dims.size() >= 3 && x_dims.size() <= 2) { + PADDLE_ENFORCE_EQ( + mat_dim_b.trans_, + false, + platform::errors::InvalidArgument( + "xpu not support this Shape in matmul_op xdims = %s ydims = %s " + "x_trans = %d y_trans = %d", + x_dims.to_str(), + y_dims.to_str(), + mat_dim_a.trans_, + mat_dim_b.trans_)); + mat_dim_b.height_ *= mat_dim_b.batch_size_; + mat_dim_b.batch_size_ = 0; + } + + if (mat_dim_a.width_ == mat_dim_b.height_) { + if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) { + mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; + } + if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) { + mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; + } + } + + PADDLE_ENFORCE_EQ(mat_dim_a.width_, + mat_dim_b.height_, + platform::errors::InvalidArgument( + "Shape mistake in matmul_op xdims = %s ydims = %s " + "x_trans = %d y_trans = %d", + x_dims.to_str(), + y_dims.to_str(), + mat_dim_a.trans_, + mat_dim_b.trans_)); + + info->m = mat_dim_a.height_; + info->n = mat_dim_b.width_; + info->k = mat_dim_a.width_; + info->bs = mat_dim_a.batch_size_; + info->trans_x = trans_x; + info->trans_y = trans_y; + + if (info->bs <= 1) { + info->stride_x = trans_x ? info->m : info->k; + info->stride_y = trans_y ? info->k : info->n; + info->stride_out = info->n; + } else { + info->stride_x = info->m * info->k; + info->stride_y = info->k * info->n; + info->stride_out = info->m * info->n; + } +} + template -int xpu_fc_wrapper(xpu::Context* ctx, - const XPUType* x, - const XPUType* w, - XPUType* y, - int m, - int n, - int k, - bool x_trans, - bool w_trans, - const float* x_maxptr, - const float* w_maxptr, - float* y_maxptr, - int ldx, - int ldw, - int ldy, - float alpha, - float beta, - const float* bias, - const xpu::Activation_t& act) { +static void xpu_fc_wrapper(xpu::Context* ctx, + const XPUType* x, + const XPUType* w, + XPUType* y, + int m, + int n, + int k, + bool x_trans, + bool w_trans, + const float* x_maxptr, + const float* w_maxptr, + float* y_maxptr, + int ldx, + int ldw, + int ldy, + float alpha, + float beta, + const float* bias, + const xpu::Activation_t& act) { int r = 0; if (x_trans && std::getenv("XPU_PADDLE_FC_TRANS_A") != nullptr && std::is_same::value) { XPUType* l3_addr = nullptr; xpu::ctx_guard RAII_GUARD(ctx); l3_addr = RAII_GUARD.alloc_l3_or_gm(m * k); - if (l3_addr == nullptr) return XPUERR_NOMEM; + PADDLE_ENFORCE_XDNN_NOT_NULL(l3_addr); std::vector shape = {k, m}; std::vector axis = {1, 0}; r = xpu::transpose(ctx, x, l3_addr, shape, axis); - if (r != XPU_SUCCESS) return r; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); r = xpu::fc_fusion(ctx, l3_addr, @@ -68,7 +232,7 @@ int xpu_fc_wrapper(xpu::Context* ctx, beta, bias, act); - if (r != XPU_SUCCESS) return r; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion"); } else { r = xpu::fc_fusion(ctx, x, @@ -89,8 +253,356 @@ int xpu_fc_wrapper(xpu::Context* ctx, beta, bias, act); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion"); } - return r; +} + +template <> +void xpu_fc_wrapper(xpu::Context* ctx, + const float16* x, + const float16* w, + float16* y, + 
int m, + int n, + int k, + bool x_trans, + bool w_trans, + const float* x_maxptr, + const float* w_maxptr, + float* y_maxptr, + int ldx, + int ldw, + int ldy, + float alpha, + float beta, + const float* bias, + const xpu::Activation_t& act) { + int r = xpu::Error_t::INVALID_PARAM; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_fc_wrapper"); +} + +template +static void xpu_fc_batch_wrapper(xpu::Context* xpu_ctx, + int bs, + bool trans_x, + bool trans_w, + int m, + int n, + int k, + float alpha, + const XPUType* x, + int stride_x, + const XPUType* w, + int stride_w, + float beta, + XPUType* y, + int stride_y, + const float* x_maxptr, + const float* w_maxptr) { + int r = xpu::fc_batched( + xpu_ctx, // Context* ctx, + bs, // int batch_size, + trans_x, // bool x_trans, + trans_w, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + alpha, // float alpha, + reinterpret_cast(x), // const TX* x, + stride_x, // int stride_a, + reinterpret_cast(w), // const TW* w, + stride_w, // int stride_b, + 0.0, // float beta, + reinterpret_cast(y), // TY* y, + stride_y, // int stride_c, + x_maxptr, // const float* x_maxptr, + w_maxptr); // const float* w_maxptr + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_batched"); +} + +template <> +void xpu_fc_batch_wrapper(xpu::Context* xpu_ctx, + int bs, + bool trans_x, + bool trans_w, + int m, + int n, + int k, + float alpha, + const float16* x, + int stride_x, + const float16* w, + int stride_w, + float beta, + float16* y, + int stride_y, + const float* x_maxptr, + const float* w_maxptr) { + int r = xpu::Error_t::INVALID_PARAM; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_fc_batch_wrapper"); +} + +template <> +void xpu_fc_batch_wrapper(xpu::Context* xpu_ctx, + int bs, + bool trans_x, + bool trans_w, + int m, + int n, + int k, + float alpha, + const float16* x, + int stride_x, + const float16* w, + int stride_w, + float beta, + float16* y, + int stride_y, + const float* x_maxptr, + const float* w_maxptr) { + int r = xpu::Error_t::INVALID_PARAM; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_fc_batch_wrapper"); +} + +template +static void MatMulXPUFunction(xpu::Context* xpu_ctx, + const T* x, + const T* y, + T* out, + const XpuFcInfo& fcinfo, + float alpha) { + using XPUType = typename XPUTypeTrait::Type; + using float16 = typename XPUTypeTrait::Type; + int fccal_type = FCCalcType(); + + decltype(&paddle::operators::xpu_fc_wrapper) + fc_api_list[3] = { + &paddle::operators::xpu_fc_wrapper, + &paddle::operators::xpu_fc_wrapper, + &paddle::operators::xpu_fc_wrapper, + }; + decltype(&paddle::operators::xpu_fc_batch_wrapper) + fc_batch_api_list[3] = { + &paddle::operators::xpu_fc_batch_wrapper, + &paddle::operators::xpu_fc_batch_wrapper, + &paddle::operators::xpu_fc_batch_wrapper, + }; + + auto fc_api = fc_api_list[fccal_type]; + auto fc_batch_api = fc_batch_api_list[fccal_type]; + + int m = fcinfo.m; + int n = fcinfo.n; + int k = fcinfo.k; + int batch_size = fcinfo.bs; + int ldx = fcinfo.stride_x; + int ldy = fcinfo.stride_y; + int ldout = fcinfo.stride_out; + bool trans_x = fcinfo.trans_x; + bool trans_y = fcinfo.trans_y; + float* max_x = fcinfo.max_x; + float* max_y = fcinfo.max_y; + float* max_out = fcinfo.max_out; + + if (batch_size <= 1) { + fc_api(xpu_ctx, + reinterpret_cast(x), + reinterpret_cast(y), + reinterpret_cast(out), + m, + n, + k, + trans_x, + trans_y, + max_x, + max_y, + max_out, + ldx, + ldy, + ldout, + alpha, + 0, + nullptr, + xpu::Activation_t::LINEAR); + } else { + // batch matmul + fc_batch_api(xpu_ctx, // Context* ctx, + batch_size, // int batch_size, + trans_x, // bool 
x_trans, + trans_y, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + alpha, // float alpha, + reinterpret_cast(x), // const TX* x, + ldx, // int stride_a, + reinterpret_cast(y), // const TW* w, + ldy, // int stride_b, + 0.0, // float beta, + reinterpret_cast(out), // TY* y, + ldout, // int stride_c, + max_x, // const float* x_maxptr, + max_y); // const float* w_maxptr + } +} + +template +static std::tuple +MatmulGradFcInfo(xpu::Context* xpu_ctx, + xpu::ctx_guard* RAII_GUARD, + const XpuFcInfo& dout_shape, + bool trans_x, + bool trans_y, + const T* x, + const T* y, + const T* dout) { + XpuFcInfo dx_shape, dy_shape; + const T* dx_a = NULL; + const T* dx_b = NULL; + const T* dy_a = NULL; + const T* dy_b = NULL; + bool copy_to_l3 = false; + float* max_dout = NULL; + int maxptr_size = xpu_ctx->max_ptr_size(); + uint64_t l3_size = uint64_t(xpu_ctx->_l3_mgr.get_size()); + int bs = (dout_shape.bs <= 1) ? (1) : (dout_shape.bs); + int dx_size = bs * dout_shape.m * dout_shape.k; + int dy_size = bs * dout_shape.k * dout_shape.n; + int dout_size = bs * dout_shape.m * dout_shape.n; + if (trans_x && trans_y) { + copy_to_l3 = l3_size >= (dout_size * 2 + dy_size) * sizeof(T); + } else if (trans_x) { + copy_to_l3 = l3_size >= dout_size * sizeof(T); + } else if (trans_y) { + copy_to_l3 = l3_size >= dout_size * 2 * sizeof(T); + } else { + copy_to_l3 = l3_size >= (dout_size + dx_size) * sizeof(T); + } + + const T* dout_new = dout; + int r = 0; + if (copy_to_l3) { + T* dout_l3 = RAII_GUARD->alloc_l3(dout_size); + PADDLE_ENFORCE_XDNN_NOT_NULL(dout_l3); + if ((dout_shape.bs > 1) || ((dout_shape.bs <= 1) && + (FCCalcType() == XPUFCCalcType::FC_FLOAT))) { + r = xpu::copy(xpu_ctx, dout, dout_l3, dout_size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); + dout_new = dout_l3; + } else { + max_dout = RAII_GUARD->alloc_l3_or_gm(maxptr_size); + PADDLE_ENFORCE_XDNN_NOT_NULL(max_dout); + + r = xpu::findmax_copy_fusion(xpu_ctx, dout, max_dout, dout_l3, dout_size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + dout_new = dout_l3; + } + } else if (((dout_shape.bs <= 1) && + (FCCalcType() != XPUFCCalcType::FC_FLOAT))) { + max_dout = RAII_GUARD->alloc_l3_or_gm(maxptr_size); + PADDLE_ENFORCE_XDNN_NOT_NULL(max_dout); + r = xpu::findmax_copy_fusion( + xpu_ctx, dout, max_dout, reinterpret_cast(NULL), dout_size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + } + + if (trans_x && trans_y) { + // dx = T(y) * T(dout) + dx_shape.InitFcInfo(dout_shape.bs, + dout_shape.k, + dout_shape.m, + dout_shape.n, + true, + true, + nullptr, + max_dout, + nullptr); + dx_a = y, dx_b = dout_new; + // dy = T(dout) * T(x) + dy_shape.InitFcInfo(dout_shape.bs, + dout_shape.n, + dout_shape.k, + dout_shape.m, + true, + true, + max_dout, + nullptr, + nullptr); + dy_a = dout_new, dy_b = x; + } else if (trans_x) { + // dx = y * T(dout) + dx_shape.InitFcInfo(dout_shape.bs, + dout_shape.k, + dout_shape.m, + dout_shape.n, + false, + true, + nullptr, + max_dout, + nullptr); + dx_a = y, dx_b = dout_new; + // dy = x * dout + dy_shape.InitFcInfo(dout_shape.bs, + dout_shape.k, + dout_shape.n, + dout_shape.m, + false, + false, + nullptr, + max_dout, + nullptr); + dy_a = x, dy_b = dout_new; + } else if (trans_y) { + // dx = dout * y + dx_shape.InitFcInfo(dout_shape.bs, + dout_shape.m, + dout_shape.k, + dout_shape.n, + false, + false, + max_dout, + nullptr, + nullptr); + dx_a = dout_new, dx_b = y; + // dy = T(dout) * x + dy_shape.InitFcInfo(dout_shape.bs, + dout_shape.n, + dout_shape.k, + dout_shape.m, + true, + false, + max_dout, + 
nullptr, + nullptr); + dy_a = dout_new, dy_b = x; + } else { + // dx = dout * T(y) + dx_shape.InitFcInfo(dout_shape.bs, + dout_shape.m, + dout_shape.k, + dout_shape.n, + false, + true, + max_dout, + nullptr, + nullptr); + dx_a = dout_new, dx_b = y; + // dy = T(x) * dout + dy_shape.InitFcInfo(dout_shape.bs, + dout_shape.k, + dout_shape.n, + dout_shape.m, + true, + false, + nullptr, + max_dout, + nullptr); + dy_a = x, dy_b = dout_new; + } + std::tuple + result = std::make_tuple(dx_shape, dy_shape, dx_a, dx_b, dy_a, dy_b); + + return result; } } // namespace operators diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index dc6911aecf130..e872fb162530f 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -37,7 +37,7 @@ endif() cc_library( flags SRCS flags.cc - DEPS gflags boost) + DEPS gflags) cc_library( denormal SRCS denormal.cc @@ -48,7 +48,7 @@ cc_test( SRCS errors_test.cc DEPS errors enforce) -set(enforce_deps flags errors boost flags phi_enforce) +set(enforce_deps flags errors flags phi_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() @@ -88,18 +88,18 @@ if(WITH_GPU) nv_library( cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc - DEPS device_context allocator_facade cuda_graph) + DEPS device_context allocator cuda_graph) else() cc_library( cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc - DEPS device_context allocator_facade) + DEPS device_context allocator) endif() cc_library( place SRCS place.cc - DEPS enforce boost phi_place) + DEPS enforce phi_place) cc_test( place_test SRCS place_test.cc @@ -185,7 +185,7 @@ endif() cc_library( cudnn_workspace_helper SRCS cudnn_workspace_helper.cc - DEPS boost) + DEPS) # separate init from device_context to avoid cycle dependencies cc_library( @@ -195,6 +195,7 @@ cc_library( # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies + cc_library( device_context SRCS device_context.cc @@ -219,12 +220,17 @@ cc_library( ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 - cpu_context generator) if(WITH_XPU) target_link_libraries(device_context xpu_context xpu_resource_pool) endif() +if(WITH_MKLDNN) + target_link_libraries(device_context onednn_context) +endif() + +target_link_libraries(device_context cpu_context) + cc_library( collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc @@ -280,6 +286,16 @@ if(WITH_XPU) CACHE INTERNAL "device event libs") endif() +if(WITH_ASCEND_CL) + cc_library( + device_event_npu + SRCS device_event_npu.cc + DEPS device_event_base npu_resource_pool) + set(DEVICE_EVENT_LIBS + device_event_npu + CACHE INTERNAL "device event libs") +endif() + if(WITH_GPU) nv_library( device_event_gpu @@ -297,6 +313,10 @@ if(WITH_GPU) device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) + nv_test( + device_context_test_cuda_graph + SRCS device_context_test_cuda_graph.cu + DEPS device_context gpu_info cuda_graph_with_memory_pool) nv_test( transform_test SRCS transform_test.cu @@ -352,7 +372,7 @@ add_subdirectory(profiler) cc_library( device_tracer SRCS device_tracer.cc - DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) + DEPS profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) nv_library( profiler diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 655c5a98aeb51..00b5dd7f8afe9 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ 
b/paddle/fluid/platform/collective_helper.cc @@ -347,6 +347,12 @@ BKCLComm* BKCLCommContext::AssignBKCLComm( BKCLContext_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( new XPUDeviceContext(XPUPlace(dev_id))); + // used in BKCL as comm_stream, for every dev_id there is + // a comm_stream at each ring. this stream is passed as input var + // when calling collective comm commands like bkcl_all_reduce + XPUStream comm_stream; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&comm_stream)); + dev_ctx->SetXPUStream(comm_stream); BKCLCommImpl* c = new BKCLCommImpl; c->set_ring_id(ring_id); diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index eb9f1ca845a28..bfdf492962de3 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -26,7 +26,9 @@ namespace platform { void BeginCUDAGraphCapture(platform::CUDAPlace place, cudaStreamCaptureMode mode, int64_t pool_id) { - auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto* dev_ctx = + reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); // After PR(#43206), cudnn related initializations will change to lazy mode. @@ -49,6 +51,9 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, pool_id = CUDAGraph::SetMemoryPoolID(pool_id); memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph( pool_id); + dev_ctx->SetCUDAGraphAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); if (old_value) { FLAGS_use_stream_safe_cuda_allocator = true; } @@ -60,8 +65,11 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, std::unique_ptr EndCUDAGraphCapture() { auto place = CUDAGraph::CapturingPlace(); - auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto* dev_ctx = + reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + dev_ctx->SetCUDAGraphAllocator(nullptr); return CUDAGraph::EndCapture(); } #endif diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index da9121550e07a..64a2f891c21cd 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,7 +1,7 @@ nv_library( cuda_graph SRCS cuda_graph.cc - DEPS enforce allocator_facade) + DEPS enforce) nv_library( cuda_profiler SRCS cuda_profiler.cc @@ -10,4 +10,4 @@ nv_library( nv_test( cudnn_helper_test SRCS cudnn_helper_test.cc - DEPS dynload_cuda phi) + DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index ca44fbd0a5cb1..09e68ab518746 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -19,7 +19,7 @@ #include #include -#include "boost/blank.hpp" +#include "paddle/utils/blank.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/device/ipu/ipu_names.h" @@ -32,7 +32,7 @@ namespace ipu { namespace { -struct CustomOpAttrVisitor : public boost::static_visitor { +struct CustomOpAttrVisitor { CustomOpAttrVisitor(std::map* attr, const std::string& attr_name) : attrs_(attr), attr_name_(attr_name) {} @@ 
-75,14 +75,14 @@ struct CustomOpAttrVisitor : public boost::static_visitor { void operator()(const std::vector& v) const { attrs_->emplace(attr_name_, v); } - void operator()(boost::blank) const { + void operator()(paddle::blank) const { PADDLE_THROW(platform::errors::Unavailable( - "Unsupported calling method for `boost::blank` type when extracting " + "Unsupported calling method for `paddle::blank` type when extracting " "custom operator attributes.")); } }; -struct ConstantOpAttrVisitor : public boost::static_visitor { +struct ConstantOpAttrVisitor { ConstantOpAttrVisitor(framework::LoDTensor* tensor, VarType::Type dtype) : tensor_(tensor), dtype_(dtype) {} @@ -111,7 +111,13 @@ struct ConstantOpAttrVisitor : public boost::static_visitor { framework::TensorFromVector(vec, tensor_); } void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); + // popart do not support float64 constant + std::vector vec_fp32; + std::transform(vec.begin(), + vec.end(), + std::back_inserter(vec_fp32), + [](double f) -> float { return static_cast(f); }); + framework::TensorFromVector(vec_fp32, tensor_); } #define RAISE_ERROR \ PADDLE_THROW( \ @@ -124,7 +130,7 @@ struct ConstantOpAttrVisitor : public boost::static_visitor { void operator()(BlockDesc* desc) const { RAISE_ERROR; } void operator()(const std::vector& v) const { RAISE_ERROR; } void operator()(int64_t v) const { RAISE_ERROR; } - void operator()(boost::blank) const { RAISE_ERROR; } + void operator()(paddle::blank) const { RAISE_ERROR; } #undef RAISE_ERROR }; @@ -416,7 +422,7 @@ void Compiler::LowerWeights(const Scope* scope) { auto* node = graph_helper_->nodes_id_map[id]; // Weights are var node and Persistable if (node->IsVar() && !node->IsCtrlVar() && node->Var() && - node->Var()->Persistable()) { + node->Var()->Persistable() && node->inputs.empty()) { // Weights are Parameter in training mode if (ipu_strategy_->is_training && !node->Var()->IsParameter()) { continue; @@ -439,6 +445,7 @@ void Compiler::LowerWeights(const Scope* scope) { for (size_t i = 0; i < tensor.dims().size(); ++i) { shape.push_back(tensor.dims().at(i)); } + popart::TensorInfo tensor_info(dtype, shape); popart::ConstVoidData const_data{tensor.data(), tensor_info}; if (!node->outputs.empty()) { @@ -524,19 +531,26 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto raw_type = BOOST_GET_CONST(std::string, op_desc->GetAttr("raw_type")); resources_->optimizer_type = raw_type; - auto loss_var = - BOOST_GET_CONST(std::string, op_desc->GetAttr("loss_var")); - resources_->loss_var = resources_->tensors[loss_var]; resources_->with_lr_sched = BOOST_GET_CONST(bool, op_desc->GetAttr("with_lr_sched")); - if (op_desc->HasAttr("lr_var")) { - auto lr_var = BOOST_GET_CONST(std::string, op_desc->GetAttr("lr_var")); - resources_->lr_var = lr_var; - resources_->lr = GetSingleVarFromScope(scope, lr_var); + if (ipu_strategy_->is_dynamic) { + // loss_var in dy2static is set by identity_loss. And lr is + // passed by ipu_strategy. 
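// Summary of the learning-rate handling below: in dynamic-to-static
// (dy2static) mode the lr is owned by the Python side and forwarded through
// IpuStrategy, and the loss variable is resolved later from the
// popart_identity_loss op; in the static path the lr is still read from the
// "lr_var" in the scope, falling back to a fixed 0.01 with LR scheduling
// disabled for optimizers that carry no lr (e.g. adadelta).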
+ resources_->lr = ipu_strategy_->lr; } else { - // adadelta has no lr - resources_->lr = 0.01f; - resources_->with_lr_sched = false; + auto loss_var = + BOOST_GET_CONST(std::string, op_desc->GetAttr("loss_var")); + resources_->loss_var = resources_->tensors[loss_var]; + if (op_desc->HasAttr("lr_var")) { + auto lr_var = + BOOST_GET_CONST(std::string, op_desc->GetAttr("lr_var")); + resources_->lr_var = lr_var; + resources_->lr = GetSingleVarFromScope(scope, lr_var); + } else { + // adadelta has no lr + resources_->lr = 0.01f; + resources_->with_lr_sched = false; + } } VLOG(10) << "Set initial lr: " << resources_->lr; @@ -758,6 +772,19 @@ void Compiler::LowerOptimizer(const Scope* scope) { PADDLE_THROW(platform::errors::Unimplemented( "optimizer %s is not implemented", type)); } + } else if (op_type == "popart_identity_loss") { + auto outputs = op_desc->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), + 1, + platform::errors::InvalidArgument("Can only support one loss key")); + auto losses = outputs.begin()->second; + PADDLE_ENFORCE_EQ( + losses.size(), + 1, + platform::errors::InvalidArgument("Can only support one loss name")); + auto loss_var = losses.front(); + resources_->loss_var = resources_->tensors[loss_var]; } } } diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 4db25e880f3a9..cf051f978208d 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -213,8 +213,13 @@ void Executor::Run(const std::vector &inputs, optimizer = compiler_resources_->eval_optimizer.get(); } else { VLOG(10) << "Update learning_rate"; - auto new_lr = - GetSingleVarFromScope(scope_, compiler_resources_->lr_var); + float new_lr; + if (ipu_strategy_->is_dynamic) { + new_lr = ipu_strategy_->lr; + } else { + new_lr = + GetSingleVarFromScope(scope_, compiler_resources_->lr_var); + } VLOG(10) << "New Lr: " << new_lr; optimizer = compiler_resources_->UpdateOptimizer(new_lr); } @@ -257,6 +262,7 @@ void Executor::AcquireDevice() { "numIPUs", std::to_string(ipu_strategy_->num_ipus), }, + {"tilesPerIPU", std::to_string(ipu_strategy_->tiles_per_ipu)}, {"ipuVersion", "ipu2"}, }; device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( @@ -269,6 +275,7 @@ void Executor::AcquireDevice() { "numIPUs", std::to_string(ipu_strategy_->num_ipus), }, + {"tilesPerIPU", std::to_string(ipu_strategy_->tiles_per_ipu)}, {"ipuVersion", "ipu2"}, }; device_ = diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index eeffd0a36e015..d796501069651 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -91,6 +91,7 @@ IpuStrategy::IpuStrategy() { ADD_UINT64_OPTION(batches_per_step); ADD_UINT64_OPTION(micro_batch_size); ADD_UINT64_OPTION(random_seed); + ADD_UINT64_OPTION(tiles_per_ipu); ADD_DOUBLE_OPTION(available_memory_proportion); ADD_DOUBLE_OPTION(loss_scaling); ADD_DOUBLE_OPTION(max_weight_norm); @@ -100,6 +101,10 @@ IpuStrategy::IpuStrategy() { ADD_STRING_OPTION(onnx_dump_path); ADD_STRING_OPTION(weight_decay_mode); + // dy2static support + ADD_DOUBLE_OPTION(lr); + ADD_BOOL_OPTION(is_dynamic); + #undef ADD_STRING_OPTION #undef ADD_DOUBLE_OPTION #undef ADD_UINT64_OPTION diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 1fdde59cf856c..997bc310df308 100644 --- 
a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -41,7 +41,7 @@ class IpuStrategy { // Average sharding, debugging used bool need_avg_shard = false; - // Flag for fp16, true for pure fp16 + // Flag for fp16, true for inference with pure fp16 bool enable_fp16 = false; // The mode of Adam/Lamb optimizer @@ -64,6 +64,9 @@ class IpuStrategy { // Micro batch-size int micro_batch_size = 1; + // The number of virtual tiles for IPUMODEL + int tiles_per_ipu = 4; + // Random seed std::uint64_t random_seed = std::numeric_limits::max(); @@ -109,6 +112,12 @@ class IpuStrategy { // Custom ops std::vector custom_ops; + // lr for dynamic2static + float lr = 0.0; + + // whether in dynamic mode + bool is_dynamic = false; + public: void AddBoolOption(const std::string &option, bool value); void AddUint64Option(const std::string &option, std::uint64_t value); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc index 44fdf764c5bcc..c4960616b9db0 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -138,6 +138,59 @@ const ONNXDataType GetOutputVarDType(const Node *node, return GetVarDType(out_node); } +bool IsLastVarNode(Node *node) { + return node->IsVar() && node->outputs.size() == 0; +} + +void MarkNodeForDeletion(Node *node) { node->Op()->SetAttr("delete_node", 1); } + +bool IsMarkedForDeletion(Node *node) { + return node->Op()->HasAttr("delete_node") && + BOOST_GET_CONST(int, node->Op()->GetAttr("delete_node")) > 0; +} + +int RemoveTailReduction(Graph *graph, + Node *loss_op, + const std::string &output_var_name) { + // Sum: 0. Mean: 1. None: 2 + int reduction = 2; + Node *reduction_op; + auto loss_output = GetOutputVarNode(output_var_name, loss_op); + for (auto sub_node : loss_output->outputs) { + if (!sub_node->IsOp()) continue; + if (sub_node->Op()->Type() == "reduce_sum") { + reduction = 0; + reduction_op = sub_node; + } else if (sub_node->Op()->Type() == "reduce_mean") { + reduction = 1; + reduction_op = sub_node; + } + } + if (reduction == 2) return reduction; + auto reduction_out = reduction_op->outputs[0]; + loss_op->Op()->SetOutput(output_var_name, + std::vector({reduction_out->Name()})); + MarkNodeForDeletion(reduction_op); + DisConnectNodes(loss_output, reduction_op); + DisConnectNodes(reduction_op, reduction_out); + ConnectNodes(loss_op, reduction_out); + + return reduction; +} + +int ConvertToPopartReduction(const std::string &reduction) { + // Sum: 0. Mean: 1. 
None: 2 + if (reduction == "sum") { + return 0; + } else if (reduction == "mean") { + return 1; + } else if (reduction == "none") { + return 2; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "reduction %s is not supported on ipu.", reduction)); +} + } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h index 536b69a39b9a1..611d863c496a8 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -85,6 +85,13 @@ const bool is_float_equal(float a, float b, float eps = 1e-8); const ONNXDataType GetVarDType(const Node *node); const ONNXDataType GetOutputVarDType(const Node *node, const std::string &output_name = "Out"); +void MarkNodeForDeletion(Node *node); +bool IsMarkedForDeletion(Node *node); +bool IsLastVarNode(Node *node); +int RemoveTailReduction(Graph *graph, + Node *loss_op, + const std::string &output_var_name); +int ConvertToPopartReduction(const std::string &reduction); } // namespace ipu } // namespace platform diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc index c10a30997a4da..155c11b03b8fc 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -91,6 +91,38 @@ Node *less_than_handler(Graph *graph, Node *node) { {}); } +Node *greater_equal_handler(Graph *graph, Node *node) { + auto less_op = + CreateBaseOp(graph, + node, + "popart_less", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {}, + {}); + return CreateBaseOp(graph, + node, + "popart_logical_not", + less_op->outputs, + {GetOutputVarNode("Out", node)}, + {}); +} + +Node *less_equal_handler(Graph *graph, Node *node) { + auto less_op = + CreateBaseOp(graph, + node, + "popart_greater", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {}, + {}); + return CreateBaseOp(graph, + node, + "popart_logical_not", + less_op->outputs, + {GetOutputVarNode("Out", node)}, + {}); +} + } // namespace } // namespace ipu } // namespace platform @@ -103,3 +135,5 @@ REGISTER_HANDLER(logical_or, logical_or_handler); REGISTER_HANDLER(logical_and, logical_and_handler); REGISTER_HANDLER(greater_than, greater_than_handler); REGISTER_HANDLER(less_than, less_than_handler); +REGISTER_HANDLER(greater_equal, greater_equal_handler); +REGISTER_HANDLER(less_equal, less_equal_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc new file mode 100644 index 0000000000000..438304fcfc709 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc @@ -0,0 +1,508 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +bool is_dynamic_graph() { + auto *ipu_backend = platform::ipu::IpuBackend::GetInstance(); + return ipu_backend->GetIpuStrategy()->is_dynamic; +} + +Node *identity_loss_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction")); + return CreateIdentityLossOp( + graph, node, node->inputs, node->outputs, reduction); +} + +Node *cross_entropy_general_handler(Graph *graph, + Node *node, + Node *logits, + Node *label, + Node *output, + bool soft_label, + int ignore_index, + int reduction, + int axis) { + Node *cast_and_reshape = nullptr; + Node *final_loss_node = nullptr; + if (soft_label) { + PADDLE_THROW(platform::errors::InvalidArgument( + "soft_label is not supported yet in IPU")); + } + bool append_identity_loss = is_dynamic_graph(); + bool is_last_var_node = IsLastVarNode(output); + append_identity_loss = append_identity_loss && is_last_var_node; + + if (label->Var()->GetDataType() == framework::proto::VarType::INT32) { + cast_and_reshape = label; + } else { + cast_and_reshape = + CreateCast(graph, node, {label}, {}, framework::proto::VarType::INT32) + ->outputs.front(); + } + + auto label_shape_ = label->Var()->GetShape(); + auto logits_shape_ = logits->Var()->GetShape(); + + axis = axis < 0 ? logits_shape_.size() + axis : axis; + + auto label_transposed(label_shape_); + + if (axis != (logits_shape_.size() - 1)) { + // the softmax axis(a) is not at the last dimension. + // logit shape: [N1, ..., C, ..., Nk] + // label shape: [N1, ..., 1, ..., Nk] + // _____^_____ + // dim: 0, ..., a, ..., k-1 + // needs to transpose the softmax axis in logit to last dimension + // with following transpose perm: [0, ..., a-1, a+1, ..., k-1, a] + std::vector trans(logits_shape_.size(), 0); + std::iota(trans.begin(), trans.begin() + axis, 0); + std::iota(trans.begin() + axis, trans.end() - 1, axis + 1); + trans.back() = axis; + + // transpose logits + logits = + CreateBaseOp( + graph, node, "popart_transpose", {logits}, {}, {{"perm", trans}}) + ->outputs.front(); + + // no need to transpose label, transform the label size and reshape later. + std::transform( + trans.cbegin(), + trans.cend(), + label_transposed.begin(), + [&label_shape_](int64_t index) { return label_shape_[index]; }); + } + + if (label_transposed.back() == 1) { + // input shape: [N1, N2, ... , Nk, C] + // label shape: [N1, N2, ... , Nk, 1] + // reshape label shape to [N1, N2, ... 
, Nk] + std::vector new_shape_(label_transposed.begin(), + label_transposed.end() - 1); + auto const_before_loss = + CreateBaseOp( + graph, + node, + "popart_constant", + {}, + {}, + {{"value", new_shape_}, + {"dims", + std::vector{static_cast(new_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs.front(); + + cast_and_reshape = CreateBaseOp(graph, + node, + "popart_reshape", + {cast_and_reshape, const_before_loss}, + {}, + {}) + ->outputs.front(); + } + + auto log = CreateBaseOp(graph, node, "popart_log", {logits}, {}, {}) + ->outputs.front(); + + bool reshape_back = reduction == 2 && label_transposed.back() == 1; + + final_loss_node = CreateBaseOp(graph, + node, + "popart_nllloss_v2", + {log, cast_and_reshape}, + !(reshape_back || append_identity_loss) + ? std::vector{output} + : std::vector{}, + { + {"reduction", reduction}, + {"ignoreIndex", ignore_index}, + {"inputIsLogProbability", true}, + }) + ->outputs.front(); + + if (reshape_back) { + // reshape output to the shape of input label. + auto const_after_loss = + CreateBaseOp( + graph, + node, + "popart_constant", + {}, + {}, + {{"value", label_shape_}, + {"dims", + std::vector{static_cast(label_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs.front(); + final_loss_node = + CreateBaseOp(graph, + node, + "popart_reshape", + {final_loss_node, const_after_loss}, + append_identity_loss ? std::vector{} + : std::vector{output}, + {}) + ->outputs.front(); + } + + if (append_identity_loss) { + final_loss_node = + CreateIdentityLossOp(graph, node, {final_loss_node}, {output}, 2); + } + + return final_loss_node; +} + +Node *cross_entropy2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + int reduction = RemoveTailReduction(graph, node, "Y"); + auto logits = GetInputVarNode("X", node); + auto label = GetInputVarNode("Label", node); + auto output = GetOutputVarNode("Y", node); + auto ignore_index = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); + return cross_entropy_general_handler(graph, + node, + logits, + label, + output, + false, /*soft_label*/ + ignore_index, + reduction, + -1); /*axis*/ +} + +Node *softmax_with_cross_entropy_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + int reduction = RemoveTailReduction(graph, node, "Loss"); + auto logits = GetInputVarNode("Logits", node); + auto label = GetInputVarNode("Label", node); + auto output = GetOutputVarNode("Loss", node); + auto ignore_index = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + auto soft_label = BOOST_GET_CONST(bool, op->GetAttr("soft_label")); + + logits = CreateSoftmaxOpset11( + graph, node, {logits}, {GetOutputVarNode("Softmax", node)}, axis) + ->outputs.front(); + return cross_entropy_general_handler(graph, + node, + logits, + label, + output, + soft_label, + ignore_index, + reduction, + axis); +} + +Node *kldiv_loss_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto reduction = ConvertToPopartReduction( + BOOST_GET_CONST(std::string, op->GetAttr("reduction"))); + if (reduction == 2) { + reduction = RemoveTailReduction(graph, node, "Loss"); + } + bool append_identity_loss = is_dynamic_graph(); + bool is_last_var_node = IsLastVarNode(GetOutputVarNode("Loss", node)); + append_identity_loss = append_identity_loss && is_last_var_node; + + // log(pred) + auto log = + CreateBaseOp( + graph, node, "popart_log", {GetInputVarNode("Target", node)}, {}, {}) + ->outputs.front(); + + // log(pred) - label + auto log_minus = + CreateBaseOp( + graph, 
node, "popart_sub", {log, GetInputVarNode("X", node)}, {}, {}) + ->outputs.front(); + + // label * (log(pred) - label) + auto loss = + CreateBaseOp(graph, + node, + "popart_mul", + {GetInputVarNode("Target", node), log_minus}, + append_identity_loss || reduction != 2 + ? std::vector{} + : std::vector{GetOutputVarNode("Loss", node)}, + {}); + + auto attrs = AttributeMap{{"reduce_all", true}, {"keepdims", 0L}}; + if (append_identity_loss) { + loss = CreateIdentityLossOp(graph, + node, + loss->outputs, + {GetOutputVarNode("Loss", node)}, + reduction); + } else if (reduction == 0) { + // Sum + loss = CreateBaseOp(graph, + node, + "popart_reducesum", + loss->outputs, + {GetOutputVarNode("Loss", node)}, + attrs); + } else if (reduction == 1) { + // Mean + loss = CreateBaseOp(graph, + node, + "popart_reducemean", + loss->outputs, + {GetOutputVarNode("Loss", node)}, + attrs); + } + return loss; +} + +Node *binary_cross_entropy_handler(Graph *graph, Node *node) { + // Out = -1 * weight * (label * log(x) + (1 - label) * log(1 - x)) + int reduction = 2; + if (is_dynamic_graph()) { + reduction = RemoveTailReduction(graph, node, "Out"); + } + bool append_identity_loss = + is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Loss", node)); + + auto x = GetInputVarNode("X", node); + auto label = GetInputVarNode("Label", node); + // log(x) + auto log = + CreateBaseOp(graph, node, "popart_log", {x}, {}, {})->outputs.front(); + + // label * log(x) + auto log_mul = CreateBaseOp(graph, node, "popart_mul", {label, log}, {}, {}) + ->outputs.front(); + + // const one + auto one = + CreateConst(graph, node, std::vector{1.0}, {1}, GetVarDType(x)) + ->outputs.front(); + // (1 - x) + auto minus_input = CreateBaseOp(graph, node, "popart_sub", {one, x}, {}, {}) + ->outputs.front(); + + // log(1 - x) + auto log_minus_input = + CreateBaseOp(graph, node, "popart_log", {minus_input}, {}, {}) + ->outputs.front(); + + // (1 - label) + auto minus_label = + CreateBaseOp(graph, node, "popart_sub", {one, label}, {}, {}) + ->outputs.front(); + + // (1 - label) * log(1 - x) + auto minus_log_mul = + CreateBaseOp( + graph, node, "popart_mul", {minus_label, log_minus_input}, {}, {}) + ->outputs.front(); + + // (label * log(x) + (1 - label) * log(1 - x)) + auto add = + CreateBaseOp(graph, node, "popart_add", {log_mul, minus_log_mul}, {}, {}) + ->outputs.front(); + + // -1 * (label * log(x) + (1 - label) * log(1 - x)) + auto loss = CreateBaseOp( + graph, + node, + "popart_neg", + {add}, + append_identity_loss ? 
std::vector{} + : std::vector{GetOutputVarNode("Out", node)}, + {}); + if (append_identity_loss) { + loss = CreateIdentityLossOp( + graph, node, loss->outputs, {GetOutputVarNode("Out", node)}, reduction); + } + return loss; +} + +Node *huber_loss_handler(Graph *graph, Node *node) { + // if abs(label - input) < delta + // huber_loss = 0.5 * (label - input) * (label - input) + // else + // huber_loss = delta * abs(label - input) - 0.5 * delta * delta + auto *op = node->Op(); + int reduction = 2; + if (is_dynamic_graph()) { + reduction = RemoveTailReduction(graph, node, "Out"); + } + bool append_identity_loss = + is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Out", node)); + + auto x = GetInputVarNode("X", node); + auto label = GetInputVarNode("Y", node); + // (label - input) + auto diff = CreateBaseOp(graph, node, "popart_sub", {label, x}, {}, {}) + ->outputs.front(); + + // abs(label - input) + auto abs_diff = + CreateBaseOp(graph, node, "popart_abs", {diff}, {}, {})->outputs.front(); + + // const 0.5 + auto dot_five = + CreateConst(graph, node, std::vector{0.5}, {1}, GetVarDType(x)) + ->outputs.front(); + + // const delta + auto delta_value = BOOST_GET_CONST(float, op->GetAttr("delta")); + auto delta = + CreateConst( + graph, node, std::vector{delta_value}, {1}, GetVarDType(x)) + ->outputs.front(); + auto delta_square_coff = + CreateConst(graph, + node, + std::vector{0.5f * delta_value * delta_value}, + {1}, + GetVarDType(x)) + ->outputs.front(); + + // (label - input) * (label - input) + auto square = CreateBaseOp(graph, node, "popart_mul", {diff, diff}, {}, {}) + ->outputs.front(); + + // 0.5 * (label - input) * (label - input) + auto dot_five_square = + CreateBaseOp(graph, node, "popart_mul", {dot_five, square}, {}, {}) + ->outputs.front(); + + // delta * abs(label - input) + auto delta_mul_diff = + CreateBaseOp(graph, node, "popart_mul", {delta, abs_diff}, {}, {}) + ->outputs.front(); + + // delta * abs(label - input) - 0.5 * delta * delta + auto sub_delta_square = CreateBaseOp(graph, + node, + "popart_sub", + {delta_mul_diff, delta_square_coff}, + {}, + {}) + ->outputs.front(); + + // abs(label - input) < delta + auto less_cond = + CreateBaseOp(graph, node, "popart_less", {abs_diff, delta}, {}, {}) + ->outputs.front(); + auto loss = CreateBaseOp( + graph, + node, + "popart_where", + {less_cond, dot_five_square, sub_delta_square}, + append_identity_loss ? 
std::vector{} + : std::vector{GetOutputVarNode("Out", node)}, + {}); + + if (append_identity_loss) { + loss = CreateIdentityLossOp( + graph, node, loss->outputs, {GetOutputVarNode("Out", node)}, reduction); + } + return loss; +} + +Node *warpctc_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto logits = GetInputVarNode("Logits", node); + auto label = GetInputVarNode("Label", node); + auto logits_length = GetInputVarNode("LogitsLength", node); + auto label_length = GetInputVarNode("LabelLength", node); + auto blank = BOOST_GET_CONST(int, op->GetAttr("blank")); + auto norm_by_times = BOOST_GET_CONST(bool, op->GetAttr("norm_by_times")); + int reduction = 2; + if (is_dynamic_graph()) { + reduction = RemoveTailReduction(graph, node, "Loss"); + } + bool append_identity_loss = + is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Loss", node)); + if (norm_by_times) { + PADDLE_THROW(platform::errors::InvalidArgument( + "norm_by_times is not supported yet in IPU")); + } + + int axis = -1; + auto softmax_logits = + CreateSoftmaxOpset11(graph, node, {logits}, {}, axis)->outputs.front(); + auto log_softmax_logits = + CreateBaseOp(graph, node, "popart_log", {softmax_logits}, {}, {}) + ->outputs.front(); + auto cast_label = CreateBaseOp(graph, + node, + "popart_cast", + {label}, + {}, + {{"to", std::string("UINT32")}}) + ->outputs.front(); + auto cast_logits_length = CreateBaseOp(graph, + node, + "popart_cast", + {logits_length}, + {}, + {{"to", std::string("UINT32")}}) + ->outputs.front(); + auto cast_label_length = CreateBaseOp(graph, + node, + "popart_cast", + {label_length}, + {}, + {{"to", std::string("UINT32")}}) + ->outputs.front(); + // TODO(czr): zero_infinity is not supported in current sdk which lead + // difference with paddle result. + auto loss = CreateBaseOp( + graph, + node, + "popart_ctcloss", + {log_softmax_logits, cast_label, cast_logits_length, cast_label_length}, + append_identity_loss + ? 
std::vector{} + : std::vector{GetOutputVarNode("Loss", node)}, + {{"blank", blank}, + {"reduction", reduction}, + {"outDataType", std::string("UNDEFINED")}}); + if (append_identity_loss) { + loss = CreateIdentityLossOp( + graph, node, loss->outputs, {GetOutputVarNode("Loss", node)}, 2); + } + return loss; +} + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle + +REGISTER_HANDLER(identity_loss, identity_loss_handler); +REGISTER_HANDLER(softmax_with_cross_entropy, + softmax_with_cross_entropy_handler); +REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); +REGISTER_HANDLER(kldiv_loss, kldiv_loss_handler); +REGISTER_HANDLER(bce_loss, binary_cross_entropy_handler); +REGISTER_HANDLER(huber_loss, huber_loss_handler); +REGISTER_HANDLER(warpctc, warpctc_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index e47a723125b76..ddd7d9453cfa5 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -114,14 +114,29 @@ Node *matmul_handler(Graph *graph, Node *node) { auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X")); auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y")); auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha")); - auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); - auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + Node *x_node = GetInputVarNode("X", node); + Node *y_node = GetInputVarNode("Y", node); + int x_rank = x_node->Var()->GetShape().size(); + int y_rank = y_node->Var()->GetShape().size(); + + auto gen_perm = [](const int rank) -> std::vector { + std::vector perm; + if (rank == 1) { + perm = std::vector{0}; + } else if (rank == 2) { + perm = std::vector{1, 0}; + } else if (rank == 3) { + perm = std::vector{0, 2, 1}; + } else if (rank == 4) { + perm = std::vector{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", rank)); + } + return perm; + }; - int x_rank = x_shape.size(); - std::vector perm; - if (x_rank == 1) { - perm = std::vector{0}; - } else if (x_rank == 2) { + if (x_rank == 2) { if (!transpose_x && !transpose_y && is_float_equal(alpha, 1.0f)) { return CreateBaseOp( graph, @@ -137,18 +152,10 @@ Node *matmul_handler(Graph *graph, Node *node) { transpose_x, transpose_y, alpha); - } else if (x_rank == 3) { - perm = std::vector{0, 2, 1}; - } else if (x_rank == 4) { - perm = std::vector{0, 1, 3, 2}; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "op matmul with input rank == %d", x_rank)); } - Node *x_node = GetInputVarNode("X", node); - Node *y_node = GetInputVarNode("Y", node); if (transpose_x) { + auto perm = gen_perm(x_rank); x_node = CreateBaseOp(graph, node, "popart_transpose", @@ -158,6 +165,7 @@ Node *matmul_handler(Graph *graph, Node *node) { x_node = x_node->outputs[0]; } if (transpose_y) { + auto perm = gen_perm(y_rank); y_node = CreateBaseOp(graph, node, "popart_transpose", @@ -209,7 +217,7 @@ Node *scale_handler(Graph *graph, Node *node) { CreateCast(graph, node, {GetInputVarNode("X", node)}, {}, VarType::FP32); Node *result = nullptr; - if (!op->Input("ScaleTensor").empty()) { + if (op->InputArgumentNames().size() > 1) { auto scale = GetInputVarNode("ScaleTensor", node); if (is_float_equal(bias_, 0.0)) { result = CreateBaseOp( @@ -321,183 +329,6 @@ Node *scale_handler(Graph *graph, 
Node *node) { return result_after_cast; } -Node *cross_entropy2_handler(Graph *graph, Node *node) { - auto *op = node->Op(); - auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); - Node *new_cast = nullptr; - if (GetInputVarNode("Label", node)->Var()->GetDataType() == VarType::INT32) { - new_cast = GetInputVarNode("Label", node); - } else { - auto new_cast = CreateCast( - graph, node, {GetInputVarNode("Label", node)}, {}, VarType::INT32); - new_cast = new_cast->outputs[0]; - } - auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape(); - if (label_shape_[label_shape_.size() - 1] != 1) { - auto log = CreateBaseOp( - graph, node, "popart_log", {GetInputVarNode("X", node)}, {}, {}); - return CreateBaseOp( - graph, - node, - "popart_nllloss_v2", - {log->outputs[0], new_cast}, - {GetOutputVarNode("Y", node)}, - { - {"reduction", 2}, // popart::ReductionType::NoReduction - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", true}, - }); - } else { - std::vector new_shape_{label_shape_[0]}; - auto const_before_loss = CreateBaseOp( - graph, - node, - "popart_constant", - {}, - {}, - {{"value", new_shape_}, - {"dims", - std::vector{static_cast(new_shape_.size())}}, - {"dtype", ONNXDataType::INT64}}); - - auto reshape_before_loss = - CreateBaseOp(graph, - node, - "popart_reshape", - {new_cast, const_before_loss->outputs[0]}, - {}, - {}); - - auto log = CreateBaseOp( - graph, node, "popart_log", {GetInputVarNode("X", node)}, {}, {}); - auto nllloss = CreateBaseOp( - graph, - node, - "popart_nllloss_v2", - {log->outputs[0], reshape_before_loss->outputs[0]}, - {}, - { - {"reduction", 2}, // popart::ReductionType::NoReduction - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", true}, - }); - - auto const_after_loss = CreateBaseOp( - graph, - node, - "popart_constant", - {}, - {}, - {{"value", label_shape_}, - {"dims", - std::vector{static_cast(label_shape_.size())}}, - {"dtype", ONNXDataType::INT64}}); - - auto reshape_after_loss = - CreateBaseOp(graph, - node, - "popart_reshape", - {nllloss->outputs[0], const_after_loss->outputs[0]}, - {GetOutputVarNode("Y", node)}, - {}); - return reshape_after_loss; - } -} - -Node *softmax_with_cross_entropy_handler(Graph *graph, Node *node) { - auto *op = node->Op(); - auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); - auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); - auto soft_label = BOOST_GET_CONST(bool, op->GetAttr("soft_label")); - if (soft_label) { - PADDLE_THROW(platform::errors::InvalidArgument( - "soft_label is not supported yet in IPU")); - } - Node *new_cast = nullptr; - if (GetInputVarNode("Label", node)->Var()->GetDataType() == VarType::INT32) { - new_cast = GetInputVarNode("Label", node); - } else { - auto new_cast = CreateCast( - graph, node, {GetInputVarNode("Label", node)}, {}, VarType::INT32); - new_cast = new_cast->outputs[0]; - } - auto softmax_node = CreateSoftmaxOpset11( - graph, node, {GetInputVarNode("Logits", node)}, {}, axis); - - auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape(); - if (label_shape_[label_shape_.size() - 1] != 1) { - auto log = CreateBaseOp( - graph, node, "popart_log", {softmax_node->outputs[0]}, {}, {}); - // softmax_with_cross_entropy is split to several ops in python. - // reduction is not needed here. 
- return CreateBaseOp( - graph, - node, - "popart_nllloss_v2", - {log->outputs[0], new_cast}, - {GetOutputVarNode("Loss", node)}, - { - {"reduction", 2}, // popart::ReductionType::NoReduction - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", true}, - }); - } else { - std::vector new_shape_{label_shape_[0]}; - auto const_before_loss = CreateBaseOp( - graph, - node, - "popart_constant", - {}, - {}, - {{"value", new_shape_}, - {"dims", - std::vector{static_cast(new_shape_.size())}}, - {"dtype", ONNXDataType::INT64}}); - - auto reshape_before_loss = - CreateBaseOp(graph, - node, - "popart_reshape", - {new_cast, const_before_loss->outputs[0]}, - {}, - {}); - - auto log = CreateBaseOp( - graph, node, "popart_log", {softmax_node->outputs[0]}, {}, {}); - auto nllloss = CreateBaseOp( - graph, - node, - "popart_nllloss_v2", - {log->outputs[0], reshape_before_loss->outputs[0]}, - {}, - { - {"reduction", 2}, // popart::ReductionType::NoReduction - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", true}, - }); - - auto const_after_loss = CreateBaseOp( - graph, - node, - "popart_constant", - {}, - {}, - {{"value", label_shape_}, - {"dims", - std::vector{static_cast(label_shape_.size())}}, - {"dtype", ONNXDataType::INT64}}); - - auto reshape_after_loss = - CreateBaseOp(graph, - node, - "popart_reshape", - {nllloss->outputs[0], const_after_loss->outputs[0]}, - {GetOutputVarNode("Loss", node)}, - {}); - return reshape_after_loss; - } -} - Node *cumsum_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto exclusive = BOOST_GET_CONST(bool, op->GetAttr("exclusive")); @@ -512,41 +343,63 @@ Node *cumsum_handler(Graph *graph, Node *node) { {{"value", std::vector{axis}}, {"dims", std::vector{1}}, {"dtype", ONNXDataType::INT64}}); - return CreateBaseOp( + Node *input_x = nullptr; + auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType(); + bool need_cast = data_type_ != VarType::FP32; + std::vector cumsum_out; + if (need_cast) { + auto cast_x = CreateCast( + graph, node, {GetInputVarNode("X", node)}, {}, VarType::FP32); + input_x = cast_x->outputs[0]; + } else { + input_x = GetInputVarNode("X", node); + cumsum_out.emplace_back(GetOutputVarNode("Out", node)); + } + auto cumsum_node = CreateBaseOp( graph, node, "popart_cumsum", - {GetInputVarNode("X", node), axis_node->outputs[0]}, - {GetOutputVarNode("Out", node)}, + {input_x, axis_node->outputs[0]}, + cumsum_out, {{"exclusive", popart_exclusive}, {"reverse", popart_reverse}}); + if (need_cast) { + cumsum_node = CreateCast(graph, + node, + cumsum_node->outputs, + {GetOutputVarNode("Out", node)}, + data_type_); + } + return cumsum_node; } Node *matmul_v2_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("trans_x")); auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("trans_y")); - auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); - auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); - - std::vector perm; - int x_rank = x_shape.size(); - if (x_rank == 1) { - perm = std::vector{0}; - } else if (x_rank == 2) { - perm = std::vector{1, 0}; - } else if (x_rank == 3) { - perm = std::vector{0, 2, 1}; - } else if (x_rank == 4) { - perm = std::vector{0, 1, 3, 2}; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "op matmul with input rank == %d", x_rank)); - } - Node *x_node = GetInputVarNode("X", node); Node *y_node = GetInputVarNode("Y", node); + int x_rank = x_node->Var()->GetShape().size(); + int y_rank = 
y_node->Var()->GetShape().size(); + + auto gen_perm = [](const int rank) -> std::vector { + std::vector perm; + if (rank == 1) { + perm = std::vector{0}; + } else if (rank == 2) { + perm = std::vector{1, 0}; + } else if (rank == 3) { + perm = std::vector{0, 2, 1}; + } else if (rank == 4) { + perm = std::vector{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", rank)); + } + return perm; + }; if (transpose_x) { + auto perm = gen_perm(x_rank); x_node = CreateBaseOp(graph, node, "popart_transpose", @@ -556,6 +409,7 @@ Node *matmul_v2_handler(Graph *graph, Node *node) { x_node = x_node->outputs[0]; } if (transpose_y) { + auto perm = gen_perm(y_rank); y_node = CreateBaseOp(graph, node, "popart_transpose", @@ -611,9 +465,6 @@ REGISTER_HANDLER(matmul, matmul_handler); REGISTER_HANDLER(sum, sum_handler); REGISTER_HANDLER(softmax, softmax_handler); REGISTER_HANDLER(scale, scale_handler); -REGISTER_HANDLER(softmax_with_cross_entropy, - softmax_with_cross_entropy_handler); -REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); REGISTER_HANDLER(cumsum, cumsum_handler); REGISTER_HANDLER(matmul_v2, matmul_v2_handler); REGISTER_HANDLER(bmm, bmm_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index 5f0ba745ed3c9..21c9beade3082 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -376,11 +376,473 @@ Node *dropout_handler(Graph *graph, Node *node) { } } +Node *conv2d_transpose_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + + auto data_format = BOOST_GET_CONST(std::string, op->GetAttr("data_format")); + if (data_format != "NCHW") { + PADDLE_THROW( + platform::errors::InvalidArgument("Only support NCHW as data_format.")); + } + + auto *kernel_info = GetInputVarNode("Filter", node); + auto kernel_shape = kernel_info->Var()->GetShape(); + + auto dilations_ = BOOST_GET_CONST(std::vector, op->GetAttr("dilations")); + auto dilations = std::vector{dilations_.begin(), dilations_.end()}; + auto strides_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); + auto strides = std::vector{strides_.begin(), strides_.end()}; + auto output_padding_ = + BOOST_GET_CONST(std::vector, op->GetAttr("output_padding")); + auto output_padding = + std::vector{output_padding_.begin(), output_padding_.end()}; + auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups")); + auto group = int64_t(group_); + + auto padding_algorithm = + BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm")); + + auto paddings_ = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + if (paddings_.size() == 2) { + paddings_.push_back(paddings_[0]); + paddings_.push_back(paddings_[1]); + } else if (paddings_.size() == 4) { + std::swap(paddings_[1], paddings_[2]); + } + auto paddings = std::vector{paddings_.begin(), paddings_.end()}; + + if (padding_algorithm == "SAME") { + // Update paddings and dilations based on the sizes of H and W. 
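+      // For each spatial dim: out_size = ceil(input / stride); the total pad is
+      // max((out_size - 1) * stride + kernel - input, 0), split into a leading
+      // half (pad_0) and the remainder (pad_1). Dilations are then reset to 1,
+      // as required by the SAME padding convention.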
+ auto input_shape = GetInputVarNode("Input", node)->Var()->GetShape(); + for (auto i = 0; i < 2; i++) { + auto out_size = (input_shape[i + 2] + strides[i] - 1) / strides[i]; + auto pad_sum = std::max( + (out_size - 1) * strides[i] + kernel_shape[i] - input_shape[i + 2], + static_cast(0)); + auto pad_0 = pad_sum / 2; + auto pad_1 = pad_sum - pad_0; + paddings[i] = pad_0; + paddings[i + 2] = pad_1; + } + for (auto i = 0; i < dilations.size(); i++) { + dilations[i] = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto i = 0; i < paddings.size(); i++) { + paddings[i] = 0; + } + } + + auto attrs = AttributeMap{{"dilations", dilations}, + {"group", group}, + {"kernel_shape", kernel_shape}, + {"output_padding", output_padding}, + {"pads", paddings}, + {"strides", strides}}; + if (!op->Input("Bias").empty()) { + return CreateBaseOp(graph, + node, + "popart_convtranspose", + { + GetInputVarNode("Input", node), + GetInputVarNode("Filter", node), + GetInputVarNode("Bias", node), + }, + node->outputs, + attrs); + } else { + return CreateBaseOp(graph, + node, + "popart_convtranspose", + { + GetInputVarNode("Input", node), + GetInputVarNode("Filter", node), + }, + node->outputs, + attrs); + } +} + +Node *affine_channel_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + + auto data_layout = BOOST_GET_CONST(std::string, op->GetAttr("data_layout")); + if (data_layout != "NCHW") { + PADDLE_THROW( + platform::errors::InvalidArgument("Only support NCHW as data_format.")); + } + + auto *scale = GetInputVarNode("Scale", node); + auto *bias = GetInputVarNode("Bias", node); + auto scale_shape = scale->Var()->GetShape(); + auto bias_shape = bias->Var()->GetShape(); + if (scale_shape.size() <= 1 || bias_shape.size() <= 1) { + auto attrs = AttributeMap{{"value", std::vector{1, -1, 1, 1}}, + {"dims", std::vector{4}}, + {"dtype", ONNXDataType::INT64}}; + auto new_shape_const = CreateConst(graph, node, {}, {}, attrs); + + scale = CreateBaseOp(graph, + node, + "popart_reshape", + {scale, new_shape_const->outputs[0]}, + {}, + {}) + ->outputs[0]; + bias = CreateBaseOp(graph, + node, + "popart_reshape", + {bias, new_shape_const->outputs[0]}, + {}, + {}) + ->outputs[0]; + } + auto *out = CreateBaseOp( + graph, node, "popart_mul", {GetInputVarNode("X", node), scale}, {}); + return CreateBaseOp(graph, + node, + "popart_add", + {out->outputs[0], bias}, + {GetOutputVarNode("Out", node)}); +} + +Node *interp_handler(Graph *graph, Node *node, const std::string &mode) { + auto *op = node->Op(); + + auto data_layout = BOOST_GET_CONST(std::string, op->GetAttr("data_layout")); + if (data_layout != "NCHW") { + PADDLE_THROW( + platform::errors::InvalidArgument("Only support NCHW as data_format.")); + } + + auto align_corners = BOOST_GET_CONST(bool, op->GetAttr("align_corners")); + auto align_mode = BOOST_GET_CONST(int, op->GetAttr("align_mode")); + + auto paddle_target_dtype = VarType::FP32; + auto onnx_target_dtype = ONNXDataType::FLOAT; + if (GetInputVarNode("X", node)->Var()->GetDataType() == VarType::FP16) { + paddle_target_dtype = VarType::FP16; + onnx_target_dtype = ONNXDataType::FLOAT16; + } + + std::string coordinate_transformation_mode = "half_pixel"; + if (align_corners) { + coordinate_transformation_mode = "align_corners"; + } else if (mode == "nearest") { + coordinate_transformation_mode = "asymmetric"; + } else if (align_mode == 1 && mode == "cubic") { + coordinate_transformation_mode = "asymmetric"; + } + + bool has_out_size = node->Op()->Input("OutSize").size() > 0; + bool has_size_tensor = 
node->Op()->Input("SizeTensor").size() > 0; + bool has_scale_tensor = node->Op()->Input("Scale").size() > 0; + + Node *size = nullptr; + Node *scale = nullptr; + // Input: Size and Scale + if (has_out_size) { + // Get 'size' from the tensor + size = GetInputVarNode("OutSize", node); + if (size->Var()->GetDataType() != VarType::INT64) { + size = CreateCast(graph, + node, + {GetInputVarNode("OutSize", node)}, + {}, + VarType::INT64) + ->outputs[0]; + } + } else if (has_size_tensor) { + // Get 'size' from multi-tensors + std::vector size_nodes; + for (auto var_name : node->Op()->Input("SizeTensor")) { + Node *size_node = GetInputVarNodeByVarName(var_name, node); + if (size_node->Var()->GetDataType() != VarType::INT64) { + size_node = CreateCast(graph, node, {size_node}, {}, VarType::INT64) + ->outputs[0]; + } + size_nodes.push_back(size_node); + } + size = CreateBaseOp(graph, + node, + "popart_concat", + size_nodes, + {}, + {{"axis", int64_t(0)}}) + ->outputs[0]; + } else if (has_scale_tensor) { + // Get 'scale' from tensor + scale = GetInputVarNode("Scale", node); + if (scale->Var()->GetDataType() != paddle_target_dtype) { + scale = + CreateCast(graph, node, {scale}, {}, paddle_target_dtype)->outputs[0]; + } + auto *padding = CreateConst(graph, + node, + {}, + {}, + {{"value", std::vector{1.0, 1.0}}, + {"dims", std::vector{2}}, + {"dtype", onnx_target_dtype}}) + ->outputs[0]; + scale = CreateBaseOp(graph, + node, + "popart_concat", + {padding, scale}, + {}, + {{"axis", int64_t(0)}}) + ->outputs[0]; + } else { + // Get 'size' or 'scale' from attribute + auto out_d = BOOST_GET_CONST(int, op->GetAttr("out_d")); + auto out_h = BOOST_GET_CONST(int, op->GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, op->GetAttr("out_w")); + if (out_d > 0 || out_w > 0 || out_h > 0) { + std::vector out_size; + if (GetInputVarNode("X", node)->Var()->GetShape().size() == 5) { + out_size.push_back(int64_t(out_d)); + out_size.push_back(int64_t(out_h)); + } else if (GetInputVarNode("X", node)->Var()->GetShape().size() == 4) { + out_size.push_back(int64_t(out_h)); + } + out_size.push_back(int64_t(out_w)); + size = + CreateConst(graph, + node, + {}, + {}, + {{"value", out_size}, + {"dims", std::vector{int64_t(out_size.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs[0]; + } else { + auto scale_value = + BOOST_GET_CONST(std::vector, op->GetAttr("scale")); + float padding = 1.0; + scale_value.insert(scale_value.begin(), padding); + scale_value.insert(scale_value.begin(), padding); + scale = CreateConst( + graph, + node, + {}, + {}, + {{"value", scale_value}, + {"dims", std::vector{int64_t(scale_value.size())}}, + {"dtype", onnx_target_dtype}}) + ->outputs[0]; + } + } + + Node *roi = + CreateConst( + graph, + node, + {}, + {}, + {{"value", + std::vector( + GetInputVarNode("X", node)->Var()->GetShape().size() * 2, 1.0)}, + {"dims", + std::vector{int64_t( + GetInputVarNode("X", node)->Var()->GetShape().size() * 2)}}, + {"dtype", onnx_target_dtype}}) + ->outputs[0]; + + if (size != nullptr) { + Node *input_shape = + CreateBaseOp( + graph, node, "popart_shape", {GetInputVarNode("X", node)}, {}) + ->outputs[0]; + Node *start = CreateConst(graph, + node, + std::vector{0}, + std::vector{1}, + ONNXDataType::INT32) + ->outputs[0]; + Node *end = CreateConst(graph, + node, + std::vector{2}, + std::vector{1}, + ONNXDataType::INT32) + ->outputs[0]; + Node *axes = CreateConst(graph, + node, + std::vector{0}, + std::vector{1}, + ONNXDataType::INT32) + ->outputs[0]; + Node *nc = CreateBaseOp(graph, + node, + "popart_slice", 
+ {input_shape, start, end, axes}, + {}, + {}) + ->outputs[0]; + size = CreateBaseOp(graph, + node, + "popart_concat", + {nc, size}, + {}, + {{"axis", int64_t(0)}}) + ->outputs[0]; + } + auto resize_attrs = AttributeMap{ + {"coordinate_transformation_mode", coordinate_transformation_mode}, + {"cubic_coeff_a", float{-0.75}}, + {"exclude_outside", int64_t{0}}, + {"extrapolation_value", float{0.0}}, + {"mode", mode}, + {"nearest_mode", std::string("round_prefer_floor")}}; + + if (mode == "nearest" && coordinate_transformation_mode == "asymmetric") { + resize_attrs.at("nearest_mode") = std::string("floor"); + } + + return CreateBaseOp(graph, + node, + "popart_resize", + {GetInputVarNode("X", node), roi, scale, size}, + {GetOutputVarNode("Out", node)}, + resize_attrs); +} + +Node *bilinear_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "linear"); +} + +Node *nearest_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "nearest"); +} + +Node *bicubic_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "cubic"); +} + +Node *linear_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "linear"); +} + +Node *trilinear_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "linear"); +} + +Node *data_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + + int slot_dim = -1; + if (op->HasAttr("slot_dim")) { + slot_dim = BOOST_GET_CONST(int, op->GetAttr("slot_dim")); + } + + if (slot_dim > 0) { + PADDLE_THROW( + platform::errors::InvalidArgument("slot_dim > 0 is not supported.")); + } + + bool enable_scale_and_shift = false; + if (op->HasAttr("enable_scale_and_shift")) { + enable_scale_and_shift = + BOOST_GET_CONST(bool, op->GetAttr("enable_scale_and_shift")); + } + + auto *mean_arr = CreateBaseOp(graph, + node, + "popart_div", + {GetInputVarNode("BatchSum", node), + GetInputVarNode("BatchSize", node)}, + {}) + ->outputs[0]; + auto *scale_arr = CreateBaseOp(graph, + node, + "popart_div", + {GetInputVarNode("BatchSize", node), + GetInputVarNode("BatchSquareSum", node)}, + {}) + ->outputs[0]; + scale_arr = + CreateBaseOp(graph, node, "popart_sqrt", {scale_arr}, {})->outputs[0]; + auto out = + CreateBaseOp( + graph, node, "popart_sub", {GetInputVarNode("X", node), mean_arr}, {}) + ->outputs[0]; + + if (enable_scale_and_shift) { + auto scale_res = CreateBaseOp(graph, + node, + "popart_mul", + {out, GetInputVarNode("scale_w", node)}, + {}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_add", + {scale_res, GetInputVarNode("bias", node)}, + {GetOutputVarNode("Y", node)}); + } else { + return CreateBaseOp(graph, + node, + "popart_mul", + {out, scale_arr}, + {GetOutputVarNode("Y", node)}); + } +} + +Node *pad_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto mode = BOOST_GET_CONST(std::string, op->GetAttr("mode")); + auto value = BOOST_GET_CONST(float, op->GetAttr("value")); + auto data_format = BOOST_GET_CONST(std::string, op->GetAttr("data_format")); + + if (data_format == "NDHWC") { + PADDLE_THROW( + platform::errors::Unimplemented("NDHWC format is not supported.")); + } + if (mode == "replicate" || mode == "circular") { + PADDLE_THROW(platform::errors::Unimplemented( + "circular and replicate modes are not supported.")); + } + if (op->Input("Paddings").size()) { + // Paddings -> input tensor + // PopART Pad Op only support `pad` as a constant + PADDLE_THROW(platform::errors::Unimplemented( + "Do not support 
Paddings as a inputs tensor")); + } + // Paddings -> Attr + auto paddings = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + std::vector new_paddings(10, 0); + new_paddings[2] = paddings[4]; + new_paddings[3] = paddings[2]; + new_paddings[4] = paddings[0]; + new_paddings[7] = paddings[5]; + new_paddings[8] = paddings[3]; + new_paddings[9] = paddings[1]; + + auto *paddings_node = CreateConst(graph, + node, + new_paddings, + std::vector{10}, + ONNXDataType::INT64) + ->outputs[0]; + auto *value_node = CreateConst(graph, + node, + std::vector{value}, + std::vector{1}, + ONNXDataType::FLOAT) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_pad", + {GetInputVarNode("X", node), paddings_node, value_node}, + {GetOutputVarNode("Out", node)}, + {{"mode", mode}}); +} + } // namespace } // namespace ipu } // namespace platform } // namespace paddle +REGISTER_HANDLER(affine_channel, affine_channel_handler); REGISTER_HANDLER(pool2d, pool2d_handler); REGISTER_HANDLER(max_pool2d_with_index, max_pool2d_with_index_handler); REGISTER_HANDLER(batch_norm, batch_norm_handler); @@ -388,4 +850,12 @@ REGISTER_HANDLER(group_norm, group_norm_handler); REGISTER_HANDLER(instance_norm, instance_norm_handler); REGISTER_HANDLER(layer_norm, layer_norm_handler); REGISTER_HANDLER(conv2d, conv2d_handler); +REGISTER_HANDLER(conv2d_transpose, conv2d_transpose_handler); REGISTER_HANDLER(dropout, dropout_handler); +REGISTER_HANDLER(bilinear_interp_v2, bilinear_interp_v2_handler); +REGISTER_HANDLER(nearest_interp_v2, nearest_interp_v2_handler); +REGISTER_HANDLER(bicubic_interp_v2, bicubic_interp_v2_handler); +REGISTER_HANDLER(linear_interp_v2, linear_interp_v2_handler); +REGISTER_HANDLER(trilinear_interp_v2, trilinear_interp_v2_handler); +REGISTER_HANDLER(data_norm, data_norm_handler); +REGISTER_HANDLER(pad3d, pad_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 173ea6d4d514e..6badf37d5b334 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -55,9 +55,20 @@ Node *MakeOpNode(Graph *graph, op_desc->SetType(type); auto op = graph->CreateOpNode(op_desc.get()); + // inputs + std::vector input_names; for (auto *in : inputs) { - ConnectNodes(in, op); + if (in != nullptr) { + ConnectNodes(in, op); + input_names.push_back(in->Name()); + } else { + input_names.push_back(std::string("")); + } } + op->Op()->SetInput("__inputs__", input_names); + + // outputs + std::vector output_names; if (outputs.empty()) { auto var = MakeVarNode(graph, node); ConnectNodes(op, var); @@ -66,14 +77,6 @@ Node *MakeOpNode(Graph *graph, ConnectNodes(op, out); } } - - // i/o - std::vector input_names; - for (auto node : op->inputs) { - input_names.push_back(node->Name()); - } - op->Op()->SetInput("__inputs__", input_names); - std::vector output_names; for (auto node : op->outputs) { output_names.push_back(node->Name()); } @@ -138,6 +141,19 @@ Node *CreateCast(Graph *graph, graph, node, "popart_cast", inputs, outputs, {{"to", to}}); } +Node *CreateIdentityLossOp(Graph *graph, + Node *node, + const std::vector &inputs, + const std::vector &outputs, + int reduction) { + return CreateBaseOp(graph, + node, + "popart_identity_loss", + inputs, + outputs, + {{"reduction", reduction}}); +} + Node *CreateGemm(Graph *graph, Node *node, const std::vector &inputs, diff --git 
a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h index 582b506974f95..3071c2a0b90cf 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h @@ -67,6 +67,12 @@ Node *CreateCast(Graph *graph, const std::vector &outputs, const VarType::Type otype); +Node *CreateIdentityLossOp(Graph *graph, + Node *node, + const std::vector &inputs, + const std::vector &outputs, + int reduction); + Node *CreateGemm(Graph *graph, Node *node, const std::vector &inputs, diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc index 852cb180aa787..e1cc2de8bc547 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc @@ -36,6 +36,27 @@ Node *reduce_op_handler(Graph *graph, Node *node, const std::string &op_name) { return CreateBaseOp(graph, node, op_name, node->inputs, node->outputs, attrs); } +Node *reduce_all_op_handler(Graph *graph, + Node *node, + const std::string &op_name) { + auto *op = node->Op(); + auto attrs = AttributeMap{}; + auto reduce_all = BOOST_GET_CONST(bool, op->GetAttr("reduce_all")); + if (!reduce_all) { + auto axes_ = BOOST_GET_CONST(std::vector, op->GetAttr("dim")); + auto axes = std::vector{axes_.begin(), axes_.end()}; + attrs.emplace("axes", axes); + } + auto keepdims_ = BOOST_GET_CONST(bool, op->GetAttr("keep_dim")); + auto keepdims = int64_t{keepdims_}; + attrs.emplace("keepdims", keepdims); + auto int32_x = + CreateCast(graph, node, node->inputs, {}, VarType::INT32)->outputs[0]; + auto reduce_op = CreateBaseOp(graph, node, op_name, {int32_x}, {}, attrs); + return CreateCast( + graph, node, reduce_op->outputs, node->outputs, VarType::BOOL); +} + Node *reduce_mean_handler(Graph *graph, Node *node) { return reduce_op_handler(graph, node, "popart_reducemean"); } @@ -56,6 +77,34 @@ Node *reduce_prod_handler(Graph *graph, Node *node) { return reduce_op_handler(graph, node, "popart_reduceprod"); } +Node *logsumexp_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto attrs = AttributeMap{}; + auto reduce_all = BOOST_GET_CONST(bool, op->GetAttr("reduce_all")); + if (!reduce_all) { + auto axes_ = BOOST_GET_CONST(std::vector, op->GetAttr("axis")); + auto axes = std::vector{axes_.begin(), axes_.end()}; + attrs.emplace("axes", axes); + } + auto keepdims_ = BOOST_GET_CONST(bool, op->GetAttr("keepdim")); + auto keepdims = int64_t{keepdims_}; + attrs.emplace("keepdims", keepdims); + return CreateBaseOp(graph, + node, + "popart_reducelogsumexp", + node->inputs, + node->outputs, + attrs); +} + +Node *reduce_all_handler(Graph *graph, Node *node) { + return reduce_all_op_handler(graph, node, "popart_reducemin"); +} + +Node *reduce_any_handler(Graph *graph, Node *node) { + return reduce_all_op_handler(graph, node, "popart_reducemax"); +} + } // namespace } // namespace ipu } // namespace platform @@ -66,3 +115,6 @@ REGISTER_HANDLER(reduce_min, reduce_min_handler); REGISTER_HANDLER(reduce_sum, reduce_sum_handler); REGISTER_HANDLER(reduce_max, reduce_max_handler); REGISTER_HANDLER(reduce_prod, reduce_prod_handler); +REGISTER_HANDLER(logsumexp, logsumexp_handler); +REGISTER_HANDLER(reduce_all, reduce_all_handler); +REGISTER_HANDLER(reduce_any, reduce_any_handler); diff --git 
a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 9b7fb7b835235..0bf0335db0f34 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -33,10 +33,15 @@ Node *fill_constant_handler(Graph *graph, Node *node) { auto dtype = VarType2OnnxDType(static_cast(dtype_)); auto dims = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); auto value_ = BOOST_GET_CONST(float, op->GetAttr("value")); - size_t size = 1; + int size = 1; for (auto &dim : dims) { size *= dim; } + PADDLE_ENFORCE_GT(size, + 0, + errors::InvalidArgument( + "IPU doesn't support non-positive dimensions. Please " + "check tensor shape setting.")); Attribute value; switch (dtype_) { case VarType::FP16: @@ -598,10 +603,15 @@ Node *fill_any_like_handler(Graph *graph, Node *node) { auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); auto dtype = static_cast(dtype_); - size_t size = 1; + int size = 1; for (auto &dim : x_shape) { size *= dim; } + PADDLE_ENFORCE_GT(size, + 0, + errors::InvalidArgument( + "IPU doesn't support non-positive dimensions. Please " + "check tensor shape setting.")); Attribute out_value; switch (dtype) { @@ -748,6 +758,491 @@ Node *dot_handler(Graph *graph, Node *node) { }); } +Node *clip_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + // if (min_value == -FLT_MAX) then means no min_value + // if (max_value == FLT_MAX) then means no max_value + auto min_value = BOOST_GET_CONST(float, op->GetAttr("min")); + auto max_value = BOOST_GET_CONST(float, op->GetAttr("max")); + + bool has_min_tensor = false; + bool has_max_tensor = false; + if (node->Op()->Input("Min").size()) { + has_min_tensor = true; + } + if (node->Op()->Input("Max").size()) { + has_max_tensor = true; + } + + bool transfer_input_dtype = false; + Node *input_data = GetInputVarNode("X", node); + if (input_data->Var()->GetDataType() != VarType::FP32 && + input_data->Var()->GetDataType() != VarType::FP16) { + input_data = + CreateCast(graph, node, {input_data}, {}, VarType::FP32)->outputs[0]; + transfer_input_dtype = true; + } + + Node *min_tensor = nullptr; + if (has_min_tensor) { + if (GetInputVarNode("Min", node)->Var()->GetDataType() != VarType::FP32) { + min_tensor = + CreateCast( + graph, node, {GetInputVarNode("Min", node)}, {}, VarType::FP32) + ->outputs[0]; + } else { + min_tensor = GetInputVarNode("Min", node); + } + } else { + min_tensor = CreateConst(graph, + node, + {}, + {}, + {{"value", std::vector{min_value}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}) + ->outputs[0]; + } + + Node *max_tensor = nullptr; + if (has_max_tensor) { + if (GetInputVarNode("Max", node)->Var()->GetDataType() != VarType::FP32) { + max_tensor = + CreateCast( + graph, node, {GetInputVarNode("Max", node)}, {}, VarType::FP32) + ->outputs[0]; + } else { + max_tensor = GetInputVarNode("Max", node); + } + } else { + max_tensor = CreateConst(graph, + node, + {}, + {}, + {{"value", std::vector{max_value}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}) + ->outputs[0]; + } + + if (transfer_input_dtype) { + auto clip_res = CreateBaseOp( + graph, node, "popart_clip", {input_data, min_tensor, max_tensor}, {}); + return CreateCast(graph, + node, + clip_res->outputs, + {GetOutputVarNode("Out", node)}, + GetInputVarNode("X", node)->Var()->GetDataType()); + } else 
{ + return CreateBaseOp(graph, + node, + "popart_clip", + {input_data, min_tensor, max_tensor}, + {GetOutputVarNode("Out", node)}); + } +} + +Node *dist_handler(Graph *graph, Node *node) { + // Minimum negative float + union neg_infinity { + int neg_int_inf; + float neg_float_int; + }; + neg_infinity neg_inf; + neg_inf.neg_int_inf = 0xFF800000; + float g_NegFloatInfinity = neg_inf.neg_float_int; + + auto *op = node->Op(); + auto *sub_node = + CreateBaseOp(graph, + node, + "popart_sub", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {}) + ->outputs[0]; + auto *abs_node = + CreateBaseOp(graph, node, "popart_abs", {sub_node}, {})->outputs[0]; + + auto p = BOOST_GET_CONST(float, op->GetAttr("p")); + + // Reshape to 1-D output + auto target_shape = AttributeMap{{"value", std::vector{-1}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}; + auto *target_shape_node = + CreateBaseOp(graph, node, "popart_constant", {}, {}, target_shape) + ->outputs[0]; + + if (fabs(p) < 1e-6) { + auto *sign_node = + CreateBaseOp(graph, node, "popart_sign", {abs_node}, {})->outputs[0]; + auto *sum_node = CreateBaseOp(graph, + node, + "popart_reducesum", + {sign_node}, + {}, + {{"keepdims", int64_t{0}}}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_reshape", + {sum_node, target_shape_node}, + {GetOutputVarNode("Out", node)}); + } else if (p == std::numeric_limits::infinity()) { + auto *max_node = CreateBaseOp(graph, + node, + "popart_reducemax", + {abs_node}, + {}, + {{"keepdims", int64_t{0}}}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_reshape", + {max_node, target_shape_node}, + {GetOutputVarNode("Out", node)}); + } else if (p == g_NegFloatInfinity) { + auto *min_node = CreateBaseOp(graph, + node, + "popart_reducemin", + {abs_node}, + {}, + {{"keepdims", int64_t{0}}}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_reshape", + {min_node, target_shape_node}, + {GetOutputVarNode("Out", node)}); + } else { + auto target_dtype = ONNXDataType::FLOAT; + if (GetInputVarNode("X", node)->Var()->GetDataType() == VarType::FP16) { + target_dtype = ONNXDataType::FLOAT16; + } + + auto pow_factor = AttributeMap{{"value", std::vector{p}}, + {"dims", std::vector{1}}, + {"dtype", target_dtype}}; + auto *pow_factor_node = + CreateBaseOp(graph, node, "popart_constant", {}, {}, pow_factor) + ->outputs[0]; + auto *pow_node = + CreateBaseOp(graph, node, "popart_pow", {abs_node, pow_factor_node}, {}) + ->outputs[0]; + auto *sum_node = CreateBaseOp(graph, + node, + "popart_reducesum", + {pow_node}, + {}, + {{"keepdims", int64_t{0}}}) + ->outputs[0]; + auto *s_node = + CreateBaseOp( + graph, node, "popart_reshape", {sum_node, target_shape_node}, {}) + ->outputs[0]; + auto *p_1 = + CreateBaseOp(graph, node, "popart_reciprocal", {pow_factor_node}, {}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_pow", + {s_node, p_1}, + {GetOutputVarNode("Out", node)}); + } +} + +Node *expand_as_v2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + Node *shape = nullptr; + auto op_inputs = op->Inputs(); + // PopART Expand Op only support the constant tensor as the input `shape`. + if (op_inputs.find("target_tensor") != op_inputs.end()) { + PADDLE_THROW(platform::errors::Unimplemented( + "Do not support input tensor `target_tensor`. 
Please use the attribute " + "`target_shape`.")); + } + auto input_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto shape_value = + BOOST_GET_CONST(std::vector, op->GetAttr("target_shape")); + // Check the dimensions + int input_shape_index = input_shape.size() - 1; + int target_shape_index = shape_value.size() - 1; + while (input_shape_index >= 0) { + if (input_shape[input_shape_index] != + int64_t(shape_value[target_shape_index]) && + input_shape[input_shape_index] != int64_t(1)) { + PADDLE_THROW(platform::errors::Unimplemented( + "For input and `shape`, corresponding dimensions must have the same " + "value or input dim = 1.")); + } + target_shape_index--; + input_shape_index--; + } + shape = CreateConst( + graph, + node, + {}, + {}, + {{"value", + std::vector{shape_value.begin(), shape_value.end()}}, + {"dims", std::vector{int64_t(shape_value.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_expand", + {GetInputVarNode("X", node), shape}, + {GetOutputVarNode("Out", node)}); +} + +Node *expand_v2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + + // PopART Expand Op only support the constant tensor as the input `shape`. + if (op->Input("Shape").size()) { + PADDLE_THROW( + platform::errors::Unimplemented("Do not support input tensor `Shape`. " + "Please use the attribute `shape`.")); + } + if (op->Input("expand_shapes_tensor").size()) { + PADDLE_THROW(platform::errors::Unimplemented( + "Do not support input tensor `expand_shapes_tensor`. Please use the " + "attribute `shape`.")); + } + auto input_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto shape_value = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); + // Check the dimensions + int input_shape_index = input_shape.size() - 1; + int target_shape_index = shape_value.size() - 1; + while (input_shape_index >= 0) { + if (input_shape[input_shape_index] != + int64_t(shape_value[target_shape_index]) && + input_shape[input_shape_index] != int64_t(1)) { + PADDLE_THROW(platform::errors::Unimplemented( + "For input and `shape`, corresponding dimensions must have the same " + "value or input dim = 1.")); + } + target_shape_index--; + input_shape_index--; + } + + auto *shape = + CreateConst( + graph, + node, + {}, + {}, + {{"value", + std::vector{shape_value.begin(), shape_value.end()}}, + {"dims", std::vector{int64_t(shape_value.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs[0]; + + return CreateBaseOp(graph, + node, + "popart_expand", + {GetInputVarNode("X", node), shape}, + {GetOutputVarNode("Out", node)}); +} + +Node *flatten_contiguous_range_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto start_axis = BOOST_GET_CONST(int, op->GetAttr("start_axis")); + auto stop_axis = BOOST_GET_CONST(int, op->GetAttr("stop_axis")); + auto input_rank = GetInputVarNode("X", node)->Var()->GetShape().size(); + + if (start_axis < 0) { + start_axis += input_rank; + } + if (stop_axis < 0) { + stop_axis += input_rank; + } + + std::vector target_shape; + if (start_axis == 0 && stop_axis == input_rank - 1) { + target_shape.push_back(-1); + } else { + auto input_shape = GetInputVarNode("X", node)->Var()->GetShape(); + if (start_axis == 0) { + target_shape.assign(input_shape.begin() + stop_axis + 1, + input_shape.end()); + target_shape.insert(target_shape.begin(), -1); + } else if (stop_axis == input_rank - 1) { + target_shape.assign(input_shape.begin(), + input_shape.begin() + start_axis); + target_shape.push_back(-1); + } else { + 
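+      // Neither end of the shape is flattened: keep the dims before start_axis,
+      // collapse [start_axis, stop_axis] into a single -1, and keep the dims
+      // after stop_axis.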
target_shape.insert(target_shape.begin(), + input_shape.begin(), + input_shape.begin() + start_axis); + target_shape.push_back(-1); + target_shape.insert(target_shape.end(), + input_shape.begin() + stop_axis + 1, + input_shape.end()); + } + } + auto *unknown_dim_node = CreateConst(graph, + node, + target_shape, + {int64_t(target_shape.size())}, + ONNXDataType::INT64) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_reshape", + {GetInputVarNode("X", node), unknown_dim_node}, + {GetOutputVarNode("Out", node)}, + {}); +} + +Node *flip_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto axes = BOOST_GET_CONST(std::vector, op->GetAttr("axis")); + auto input_shape = GetInputVarNode("X", node)->Var()->GetShape(); + for (auto it = axes.begin(); it != axes.end();) { + if (*it < 0) { + *it += input_shape.size(); + } + // Remove input_shape[axis] == 1 + if (input_shape[*it] == 1) { + it = axes.erase(it); + } else { + it++; + } + } + auto *temp_node = GetInputVarNode("X", node); + for (auto i = 0; i < axes.size(); i++) { + auto axis = axes[i]; + std::vector split; + split.resize(input_shape[axis], 1); + std::vector splits_output_nodes; + for (int j = 0; j < split.size(); j++) { + splits_output_nodes.push_back(MakeVarNode(graph, node)); + } + auto splits_outputs = CreateBaseOp(graph, + node, + "popart_split", + {temp_node}, + {splits_output_nodes}, + {{"num_outputs", int64_t(split.size())}, + {"axis", int64_t(axis)}, + {"split", split}}) + ->outputs; + std::reverse(splits_outputs.begin(), splits_outputs.end()); + if (i != axes.size() - 1) { + temp_node = CreateBaseOp(graph, + node, + "popart_concat", + splits_outputs, + {}, + {{"axis", int64_t(axis)}}) + ->outputs[0]; + } else { + temp_node = CreateBaseOp(graph, + node, + "popart_concat", + splits_outputs, + {}, + {{"axis", int64_t(axis)}}) + ->outputs[0]; + } + } + // In case of `axis` is empty. Identity Op will be deleted in passes. 
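+  // popart_identity forwards temp_node to the Out variable unchanged; if every
+  // axis was dropped above (all flipped dims have extent 1), temp_node is still
+  // the original input, so the op reduces to a no-op.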
+ return CreateBaseOp(graph, + node, + "popart_identity", + {temp_node}, + {GetOutputVarNode("Out", node)}, + {}); +} + +Node *meshgrid_handler(Graph *graph, Node *node) { + Node *res = nullptr; + // All inputs are 1-D tensors + std::vector out_shape; + for (auto input : node->inputs) { + auto input_shape = input->Var()->GetShape(); + out_shape.push_back(input_shape[0]); + } + // Expand Op only allows a const tensor as `shape` + auto *out_shape_node = CreateConst(graph, + node, + out_shape, + {int64_t(out_shape.size())}, + ONNXDataType::INT64) + ->outputs[0]; + + for (int i = 0; i < node->inputs.size(); i++) { + // Reshape each input tensor to [node->inputs.size()] by filling with 1 + std::vector target_shape(node->inputs.size(), 1); + target_shape[i] = node->inputs[i]->Var()->GetShape()[0]; + auto *target_shape_node = CreateConst(graph, + node, + target_shape, + {int64_t(target_shape.size())}, + ONNXDataType::INT64) + ->outputs[0]; + auto *t_reshaped = CreateBaseOp(graph, + node, + "popart_reshape", + {node->inputs[i], target_shape_node}, + {}, + {}) + ->outputs[0]; + res = CreateBaseOp(graph, + node, + "popart_expand", + {t_reshaped, out_shape_node}, + {node->outputs[i]}); + } + return res; +} + +Node *p_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto keepdim = BOOST_GET_CONST(bool, op->GetAttr("keepdim")); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + auto porder = BOOST_GET_CONST(float, op->GetAttr("porder")); + + auto target_dtype = ONNXDataType::FLOAT; + if (GetInputVarNode("X", node)->Var()->GetDataType() == VarType::FP16) { + target_dtype = ONNXDataType::FLOAT16; + } + + auto *pnode = CreateConst(graph, + node, + std::vector{porder}, + std::vector{1}, + target_dtype) + ->outputs[0]; + auto *abs_node = + CreateBaseOp(graph, node, "popart_abs", {GetInputVarNode("X", node)}, {}) + ->outputs[0]; + auto *pow_node = + CreateBaseOp(graph, node, "popart_pow", {abs_node, pnode}, {}) + ->outputs[0]; + auto *reducesum_node = CreateBaseOp(graph, + node, + "popart_reducesum", + {pow_node}, + {}, + {{"axes", std::vector{axis}}, + {"keepdims", int64_t(keepdim)}}) + ->outputs[0]; + auto *pnode1 = + CreateConst(graph, + node, + std::vector{static_cast(1.0 / porder)}, + std::vector{1}, + target_dtype) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_pow", + {reducesum_node, pnode1}, + {GetOutputVarNode("Out", node)}); +} + } // namespace } // namespace ipu } // namespace platform @@ -759,6 +1254,7 @@ REGISTER_HANDLER(uniform_random, uniform_random_handler); REGISTER_HANDLER(transpose2, transpose_handler); REGISTER_HANDLER(reshape2, reshape_handler); REGISTER_HANDLER(flatten2, flatten2_handler); +REGISTER_HANDLER(flatten_contiguous_range, flatten_contiguous_range_handler); REGISTER_HANDLER(gather, gather_handler); REGISTER_HANDLER(squeeze2, squeeze_handler); REGISTER_HANDLER(cast, cast_handler); @@ -769,6 +1265,8 @@ REGISTER_HANDLER(stack, stack_handler); REGISTER_HANDLER(shape, shape_handler); REGISTER_HANDLER(slice, slice_handler); REGISTER_HANDLER(expand, expand_handler); +REGISTER_HANDLER(expand_v2, expand_v2_handler); +REGISTER_HANDLER(expand_as_v2, expand_as_v2_handler); REGISTER_HANDLER(assign, assign_handler); REGISTER_HANDLER(assign_value, assign_value_handler); REGISTER_HANDLER(fill_any_like, fill_any_like_handler); @@ -777,3 +1275,8 @@ REGISTER_HANDLER(split, split_handler); REGISTER_HANDLER(one_hot, one_hot_handler); REGISTER_HANDLER(one_hot_v2, one_hot_v2_handler); REGISTER_HANDLER(dot, dot_handler); +REGISTER_HANDLER(clip, 
clip_handler); +REGISTER_HANDLER(dist, dist_handler); +REGISTER_HANDLER(flip, flip_handler); +REGISTER_HANDLER(meshgrid, meshgrid_handler); +REGISTER_HANDLER(p_norm, p_norm_handler); diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h index 763c5a46abe28..14dcf65afeefd 100644 --- a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h +++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h @@ -33,6 +33,7 @@ OP_DECL(popart_dynamicadd_v2, aiGraphcoreOpset.dynamicadd, ARG(INT_VEC,axes) ARG OP_DECL(popart_sequenceslice_v2, aiGraphcoreOpset.sequenceslice, ARG(INT,zeroUnused) ) // NOLINT OP_DECL(popart_replicatedallreduce_v2, aiGraphcoreOpset.replicatedallreduce, OPT_ARG(INT_VEC,commGroup) ) // NOLINT OP_DECL(popart_ctcbeamsearchdecoder_v2, aiGraphcoreOpset.ctcbeamsearchdecoder, ARG(INT,blank) ARG(INT,beamWidth) ARG(INT,topPaths) ) // NOLINT +OP_DECL(popart_ctcloss, aiGraphcoreOpset.ctcloss, SIG_ARG(INT32,popart::ReductionType,reduction) ARG(INT32,blank) ARG(STRING,outDataType) ) // NOLINT OP_DECL(popart_shapeddropout_v2, aiGraphcoreOpset.shapeddropout, ARG(INT_VEC,shape) ARG(FLOAT,ratio) ) // NOLINT OP_DECL(popart_atan2_v2, aiGraphcoreOpset.atan2, NONE) // NOLINT OP_DECL(popart_expm1_v2, aiGraphcoreOpset.expm1, NONE) // NOLINT diff --git a/paddle/fluid/platform/device/ipu/supported_ops_custom.h b/paddle/fluid/platform/device/ipu/supported_ops_custom.h index 02d215433c5ee..04c57cc0104de 100644 --- a/paddle/fluid/platform/device/ipu/supported_ops_custom.h +++ b/paddle/fluid/platform/device/ipu/supported_ops_custom.h @@ -17,5 +17,6 @@ #pragma once OP_DECL(popart_nllloss_v2, aiGraphcoreOpset.nllloss, SIG_ARG(INT32,popart::ReductionType,reduction) OPT_ARG(INT32,ignoreIndex) ARG(BOOL,inputIsLogProbability) ) // NOLINT +OP_DECL(popart_identity_loss, aiGraphcoreOpset.identityloss, SIG_ARG(INT32,popart::ReductionType,reduction) ) // NOLINT // clang-format on diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 08b33c9b58f06..43a8f17504750 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -13,7 +13,7 @@ cc_library( cc_library( mlu_stream SRCS mlu_stream.cc - DEPS boost mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS}) + DEPS mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS}) cc_library( mlu_device_context SRCS device_context.cc diff --git a/paddle/fluid/platform/device/mlu/device_context.cc b/paddle/fluid/platform/device/mlu/device_context.cc index c3c5546a12a2e..087b4803320e5 100644 --- a/paddle/fluid/platform/device/mlu/device_context.cc +++ b/paddle/fluid/platform/device/mlu/device_context.cc @@ -42,19 +42,16 @@ MLUDeviceContext::MLUDeviceContext(MLUPlace place) : place_(place) { runtime_version_ = GetMLURuntimeVersion(place_.device); cnnl_version_ = GetMLUCnnlVersion(place_.device); - LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device - << ", MLU Compute Capability: " - << compute_capability_ / 10 << "." - << compute_capability_ % 10 - << ", Driver API Version: " << driver_version_ / 10000 - << "." << (driver_version_ / 100) % 100 << "." - << driver_version_ % 100 << ", Runtime API Version: " - << runtime_version_ / 10000 << "." - << (runtime_version_ / 100) % 100 << "." - << runtime_version_ % 100 - << ", Cnnl API Version: " << cnnl_version_ / 10000 - << "." << (cnnl_version_ / 100) % 100 << "." 
- << cnnl_version_ % 100; + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: device: " << static_cast(place_.device) + << ", MLU Compute Capability: " << compute_capability_ / 10 << "." + << compute_capability_ % 10 + << ", Driver API Version: " << driver_version_ / 10000 << "." + << (driver_version_ / 100) % 100 << "." << driver_version_ % 100 + << ", Runtime API Version: " << runtime_version_ / 10000 << "." + << (runtime_version_ / 100) % 100 << "." << runtime_version_ % 100 + << ", Cnnl API Version: " << cnnl_version_ / 10000 << "." + << (cnnl_version_ / 100) % 100 << "." << cnnl_version_ % 100; default_ctx_.reset(new MLUContext(place_)); } diff --git a/paddle/fluid/platform/device/npu/CMakeLists.txt b/paddle/fluid/platform/device/npu/CMakeLists.txt index 9015a76e9cd5a..417b0f9ab6e1a 100644 --- a/paddle/fluid/platform/device/npu/CMakeLists.txt +++ b/paddle/fluid/platform/device/npu/CMakeLists.txt @@ -21,7 +21,7 @@ if(WITH_ASCEND_CL) cc_library( npu_stream SRCS npu_stream.cc - DEPS enforce boost stream_callback_manager) + DEPS enforce stream_callback_manager) cc_library( npu_collective_helper SRCS npu_collective_helper.cc diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc index 362c4e8fae8b1..9acdef985ade2 100644 --- a/paddle/fluid/platform/device/npu/npu_info.cc +++ b/paddle/fluid/platform/device/npu/npu_info.cc @@ -285,6 +285,10 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status) { PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, status)); } +void NPUEventSynchronize(aclrtEvent event) { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeEvent(event)); +} + void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event) { PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream, event)); } diff --git a/paddle/fluid/platform/device/npu/npu_info.h b/paddle/fluid/platform/device/npu/npu_info.h index f7af1c246ef6c..ea55831db2e22 100644 --- a/paddle/fluid/platform/device/npu/npu_info.h +++ b/paddle/fluid/platform/device/npu/npu_info.h @@ -138,6 +138,9 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status); //! Record NPU event in the stream. void NPUEventRecord(aclrtEvent event, aclrtStream stream); +//! Synchronize NPU event. +void NPUEventSynchronize(aclrtEvent event); + //! Makes a stream wait on an event. 
void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event); diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index e6a847758bdee..99828a425517b 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -233,7 +233,7 @@ NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor, aclMemType mem_type) { NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = - static_cast(pool.Get(platform::CPUPlace())); + static_cast(pool.Get(platform::CPUPlace())); Tensor host_tensor; paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); @@ -249,7 +249,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = - static_cast(pool.Get(platform::CPUPlace())); + static_cast(pool.Get(platform::CPUPlace())); Tensor host_tensor; paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); @@ -265,7 +265,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = - static_cast(pool.Get(platform::CPUPlace())); + static_cast(pool.Get(platform::CPUPlace())); Tensor host_tensor; paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); @@ -281,7 +281,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = - static_cast(pool.Get(platform::CPUPlace())); + static_cast(pool.Get(platform::CPUPlace())); Tensor host_tensor; paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 27339a0f25a8a..8cae8cfe534ef 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -60,6 +60,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bilinear_interp_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), @@ -232,6 +233,7 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"generate_proposals_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"grad_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"greater_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -240,6 +242,8 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"grid_sampler", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, 
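As a reference for the surrounding hunks in xpu2_op_list.h: every entry of
get_kl2_ops() follows the same aggregate-initializer pattern. A minimal sketch
of one entry (the op name "my_op" is hypothetical, not part of this patch)
registering both FP32 and FP16 kernels:

    {"my_op",
     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                   pOpKernelType(vartype::FP16, XPUPlace())})},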
@@ -272,6 +276,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_softmax_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2", @@ -281,11 +288,18 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"matmul_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"matmul_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"matmul", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -310,6 +324,8 @@ XPUOpMap& get_kl2_ops() { {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, + {"p_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"p_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -336,6 +352,8 @@ XPUOpMap& get_kl2_ops() { {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu6", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu6_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -356,6 +374,10 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"resnet_unit", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"resnet_unit_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -475,7 +497,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), @@ -483,7 +506,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::BOOL, 
XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), @@ -498,6 +522,12 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sequence_conv_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + + // Fused op + {"resnet_basic_block_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"resnet_basic_block", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu2_kernels; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 3ad22def69039..1e978f078dc84 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -123,6 +123,10 @@ DeviceType Place2DeviceType(const platform::Place& place) { return platform::DeviceType::CUDA; } else if (platform::is_xpu_place(place)) { return platform::DeviceType::XPU; + } else if (platform::is_ipu_place(place)) { + return platform::DeviceType::IPU; + } else if (platform::is_npu_place(place)) { + return platform::DeviceType::NPU; } else if (platform::is_mlu_place(place)) { return platform::DeviceType::MLU; } else { @@ -261,7 +265,7 @@ void EmplaceDeviceContexts( p, disable_setting_default_stream_for_allocator); #else - EmplaceDeviceContext( + EmplaceDeviceContext( place_to_device_context, p, disable_setting_default_stream_for_allocator); @@ -367,14 +371,6 @@ DeviceContextPool::DeviceContextPool( /*disable_setting_default_stream_for_allocator=*/false); } -CPUDeviceContext::CPUDeviceContext() : phi::CPUContext() { - phi::CPUContext::Init(); -} - -CPUDeviceContext::CPUDeviceContext(CPUPlace place) : phi::CPUContext(place) { - phi::CPUContext::Init(); -} - #ifdef PADDLE_WITH_IPU IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} @@ -757,275 +753,6 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; } #endif -#ifdef PADDLE_WITH_MKLDNN -MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) - : CPUDeviceContext(place), p_blobmap_() { - p_blobmap_.reset(new BlobMap()); - p_exec_items_.reset(new ExecShape()); - p_mutex_.reset(new std::mutex()); -} - -MKLDNNDeviceContextThreadLocals::Body::Body() - : cur_engine(dnnl::engine::kind::cpu, 0), cur_stream(cur_engine) { - cur_mkldnn_session_id = kMKLDNNSessionID_Default; - cur_input_shape_str = ""; - cur_input_shape_cache_capacity = 1; - cur_paddle_data_layout = paddle::framework::DataLayout::kNCHW; -} - -// When Thread finish we clear oneDNN cache -// This is needed when we have one executor used by many threads -// e.g. test_analyzer_detect. 
Thread ID is not part of caching key -// (for naive executor) so we need to clear cache when one thread finish -// and other is to start inference -// TODO(jczaja): Ideally it would be good to clear only part of cache -// related to thread that is to be terminated -MKLDNNDeviceContextThreadLocals::Body::~Body() { - auto cpu_place = paddle::platform::CPUPlace(); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::MKLDNNDeviceContext* dev_ctx = - (platform::MKLDNNDeviceContext*)pool.Get(cpu_place); - dev_ctx->ResetBlobMap(exec_ptr_); -} - -void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id( - size_t sid) { - cur_mkldnn_session_id = sid; -} -size_t MKLDNNDeviceContextThreadLocals::Body::get_cur_mkldnn_session_id(void) { - return cur_mkldnn_session_id; -} - -void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_str( - std::string input_shape_str) { - cur_input_shape_str = input_shape_str; -} -void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_cache_capacity( - int input_shape_cache_capacity) { - cur_input_shape_cache_capacity = input_shape_cache_capacity; -} - -void MKLDNNDeviceContextThreadLocals::Body::set_cur_paddle_data_layout( - framework::DataLayout dl) { - cur_paddle_data_layout = dl; -} - -framework::DataLayout -MKLDNNDeviceContextThreadLocals::Body::get_cur_paddle_data_layout(void) { - return cur_paddle_data_layout; -} - -void MKLDNNDeviceContextThreadLocals::Body::log_lib_version(void) { - if (!said_once) { - said_once = true; - auto dv = dnnl::version(); - LOG(INFO) << "oneDNN v" << dv->major << "." << dv->minor << "." - << dv->patch; - } -} - -const dnnl::engine& MKLDNNDeviceContextThreadLocals::Body::get_engine(void) { - return cur_engine; -} - -dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { - return cur_stream; -} - -void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { - VLOG(4) << tls().get_curr_exec() << " " << ptr; - std::lock_guard lock(*p_mutex_); - if (block_next_cache_clearing_ == 0) { - VLOG(3) << "Clearing DNNL cache."; - // If no specific executor pointer then clear - // everything. For executor pointer then clear only - // objects allocated when using given executor - if (ptr == nullptr) { - p_blobmap_->clear(); - } else { - // Iterate through all shapes and release - // for each shape and active executor all entries - // of this executor - for (auto& s : *p_exec_items_) { - for (auto& v : (*s.second)[ptr]) { - (v.first)->erase(v.second); - } - s.second->erase(ptr); - } - } - // Reset paddle layout to NCHW - VLOG(3) << "Resetting Paddle data layout to NCHW."; - platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( - paddle::framework::DataLayout::kNCHW); - } else { - --block_next_cache_clearing_; - VLOG(3) << "Prevented Clearing DNNL cache. Updated " - "block_next_cache_clearing_ : " - << block_next_cache_clearing_; - PADDLE_ENFORCE_GE(block_next_cache_clearing_, - 0, - platform::errors::InvalidArgument( - "Cache clearing mark should be non-negative " - ". 
But received %d.", - block_next_cache_clearing_)); - } -} - -void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const { - p_exec_items_->erase(p_exec_items_->begin()); -} - -void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, - KeyBlob::iterator it) const { - // Take current input shape from TLS - // Take current executor addess from TLS - // and for this executor's items add the one defined with arguments - auto key_it = p_exec_items_ - ->insert(std::make_pair(tls().cur_input_shape_str, - std::make_shared())) - .first; - (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); - - VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size() - << " curr exec size: " - << (*key_it->second)[tls().get_curr_exec()].size() << "\n"; -} - -void MKLDNNDeviceContext::BlockNextCacheClearing() { - std::lock_guard lock(*p_mutex_); - ++block_next_cache_clearing_; - VLOG(3) << "Next DNNL cache clearing has been blocked. Updated " - "block_next_cache_clearing_ : " - << block_next_cache_clearing_; -} - -size_t MKLDNNDeviceContext::GetShapeBlobSize() const { - std::lock_guard lock(*p_mutex_); - BlobMap* pMap = p_blobmap_.get(); - auto map_it = pMap->find(tls().cur_mkldnn_session_id); - if (map_it == pMap->end()) { - PADDLE_THROW(platform::errors::NotFound( - "MKLDNNDeviceContext don't find cur_mkldnn_session_id: %d.", - tls().cur_mkldnn_session_id)); - } - return map_it->second->size(); -} - -void MKLDNNDeviceContext::SetBlob(const std::string& name, - BlobPtr_t data) const { - BlobMap* pMap = p_blobmap_.get(); - BlobPtr_t sBlob = nullptr; - BlobPtr_t pBlob = nullptr; - - int sid = tls().get_cur_mkldnn_session_id(); - - std::lock_guard lock(*p_mutex_); - - // Find ShapeBlob for current mkldnn session id. - auto map_it = pMap->find(sid); - - if (map_it == pMap->end()) { - // 1st time to set blob in current thread - sBlob = std::make_shared(); - (*pMap)[sid] = sBlob; - VLOG(2) << "SetBlob: sid=" << sid << ", add new sid\n"; - } else { - sBlob = map_it->second; - } - - // Find KeyBlob for current input shape - auto key_it = sBlob->find(tls().cur_input_shape_str); - - if (key_it == sBlob->end()) { - // In cache clearing mode, cur_input_shape_cache_capacity defines - // max pblob capacity - if ((static_cast(sid) == - MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_CacheClearing) && - sBlob->size() && - (sBlob->size() >= - static_cast(tls().cur_input_shape_cache_capacity))) { - VLOG(2) << "sid=" << sid - << ", remove all blobs of shape: " << sBlob->begin()->first; - sBlob->erase(sBlob->begin()->first); - RemoveShapeEntriesWithExecutor(); - } - pBlob = std::make_shared(); - (*sBlob)[tls().cur_input_shape_str] = pBlob; - } else { - pBlob = key_it->second; - } - - // Find Blob via name - auto blob_it = pBlob->find(name); - if (blob_it == pBlob->end()) { - auto el = - pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; - // Register new element in per executor map - // to have easily erased when executor terminated - LinkEntryWithExecutor(pBlob, el.first); - } else { - blob_it->second = data; // set data to existing blob - } - VLOG(2) << "SetBlob: sid=" << sid << ", add blob=" << name << "\n"; - // lock will be automatically released when out of scope - return; -} - -unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const { - unsigned int num_entries = 0; - for (auto const& l3 : *p_blobmap_) { - for (auto const& l2 : *(l3.second)) { - num_entries += (l2.second)->size(); - } - } - return num_entries; -} - 
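SetBlob (above) and GetBlob (below) traverse a three-level cache keyed by oneDNN session id, then current input-shape string, then blob name; this is the structure the patch now reaches through the phi::OneDNNContext alias instead. A rough, illustrative spelling of that nesting, with simplified types rather than the exact aliases from the header:

#include <memory>
#include <string>
#include <unordered_map>

// Illustration only: the nesting walked by SetBlob/GetBlob.
using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;       // blob name  -> blob data
using ShapeBlob = std::unordered_map<std::string, std::shared_ptr<KeyBlob>>;  // shape str  -> KeyBlob
using BlobMap = std::unordered_map<int, std::shared_ptr<ShapeBlob>>;          // session id -> ShapeBlob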
-MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( - const std::string& name) const { - BlobMap* pMap = p_blobmap_.get(); - BlobPtr_t sBlob = nullptr; - BlobPtr_t pBlob = nullptr; - - int sid = tls().get_cur_mkldnn_session_id(); - - std::lock_guard lock(*p_mutex_); - - // Find ShapeBlob for current mkldnn session id firstly - auto map_it = pMap->find(sid); - // (jczaja): After first iteration of model's execution we - // should have all elements cached (mostly) so failures are unlikely (less - // likely for dynamic shapes) - if (unlikely(map_it == pMap->end())) { - VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n"; - return nullptr; - } - sBlob = map_it->second; - - // Find KeyBlob for current input shape secondly - auto sBlob_it = sBlob->find(tls().cur_input_shape_str); - if (unlikely(sBlob_it == sBlob->end())) { - VLOG(2) << "GetBlob: sid=" << tls().cur_input_shape_str - << ", miss input_shape_str\n"; - return nullptr; - } - pBlob = sBlob_it->second; - - // Find Blob via name - auto key_it = pBlob->find(name); - - if (unlikely(key_it == pBlob->end())) { - VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n"; - return nullptr; - } - - VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n"; - // lock will be automatically released when out of scope - return key_it->second; -} - -#endif - #ifdef PADDLE_WITH_CUSTOM_DEVICE CustomDeviceContext::CustomDeviceContext(CustomPlace place) : phi::CustomContext(place) { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index c6cc29d9ca1c8..d0443e30cf9c6 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -59,6 +59,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_MKLDNN #include "dnnl.hpp" // NOLINT #include "paddle/fluid/framework/data_layout.h" +#include "paddle/phi/backends/onednn/onednn_context.h" #endif #include @@ -134,21 +135,12 @@ constexpr DeviceType kMLU = DeviceType::MLU; using DeviceContext = phi::DeviceContext; -// using CPUDeviceContext = phi::CPUContext; -// TODO(wilber): The place constructor is used in many places, it is more -// difficult to use CPUDeviceContext = phi::CPUContext directly. -class CPUDeviceContext : public phi::CPUContext { - public: - CPUDeviceContext(); - explicit CPUDeviceContext(CPUPlace place); -}; - template struct DefaultDeviceContextType; template <> struct DefaultDeviceContextType { - using TYPE = CPUDeviceContext; + using TYPE = phi::CPUContext; }; // Graphcore IPU @@ -725,132 +717,8 @@ struct DefaultDeviceContextType { #endif #ifdef PADDLE_WITH_MKLDNN - -class MKLDNNDeviceContextThreadLocals { - // default mkldnn session id - - typedef MKLDNNDeviceContextThreadLocals self; - struct Body { - bool said_once = false; - size_t cur_mkldnn_session_id; - // Current data input shape string. - // - For fixed-shape, it's a null string in default. - // - For dynamic-shape, it's user specific. - std::string cur_input_shape_str; - // the cache capacity of different input shapes for MKLDNN. - // Default 1 means fixed input shape, not dynamic shape. - int cur_input_shape_cache_capacity; - // Recently registered data_format. 
This is needed to - // know for converting MKL-DNN Tensor to non MKL-DNN - paddle::framework::DataLayout cur_paddle_data_layout; - // MKL-DNN stream used for execution of primitives (per-thread) - dnnl::engine cur_engine; - dnnl::stream cur_stream; - std::string key_suffix; // Key identifying current Executor - bool key_attach_thread_id = true; - void* exec_ptr_ = nullptr; - - Body(); - ~Body(); - void set_cur_mkldnn_session_id(size_t sid); - size_t get_cur_mkldnn_session_id(void); - void set_cur_input_shape_str(std::string input_shape_str); - void set_cur_input_shape_cache_capacity(int input_shape_cache_capacity); - void set_cur_paddle_data_layout(framework::DataLayout dl); - framework::DataLayout get_cur_paddle_data_layout(void); - void log_lib_version(void); - const dnnl::engine& get_engine(void); - dnnl::stream& get_stream(void); - void set_key_suffix(const std::string& suffix) { key_suffix = suffix; } - const std::string& get_key_suffix(void) const { return key_suffix; } - void disable_tid_in_key(void) { key_attach_thread_id = false; } - bool is_tid_used_in_key(void) const { return key_attach_thread_id; } - void set_curr_exec(void* exec_ptr) { exec_ptr_ = exec_ptr; } - void* get_curr_exec(void) const { return exec_ptr_; } - }; - MKLDNNDeviceContextThreadLocals() = default; - MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) = - delete; - - public: - // default mkldnn session id - static constexpr size_t kMKLDNNSessionID_Default = 0; - // mkldnn session id for cache clearing mode - static constexpr size_t kMKLDNNSessionID_CacheClearing = -1; - static Body& fetch() { - thread_local Body b; - return b; - } -}; - -class MKLDNNDeviceContext : public CPUDeviceContext { - public: - template - using BlobPtr_t = std::shared_ptr; - template - using umap_value_smart_t = std::unordered_map>; - template - using umap_key_string_t = umap_value_smart_t; - - // Following three maps are used to cache MKLDNN primitives. - // There relations are: - // - BlobMap = Map - // - ShapeBlob = Map - // - KeyBlob = Map - - using KeyBlob = umap_key_string_t; - using ShapeBlob = umap_key_string_t; - using BlobMap = umap_value_smart_t; - - // Auxillary two-level structure (shape, executor) to easier control - // clearing cache objects related to specific executor - - using ExecKey = void*; - using ExecMapCacheIterPair = std::pair, KeyBlob::iterator>; - using ExecMap = - std::unordered_map>; - using ExecShape = std::unordered_map>; - - explicit MKLDNNDeviceContext(CPUPlace place); - - /* \brief Get the active engine */ - const dnnl::engine& GetEngine() const { return tls().get_engine(); } - - // Register object to currently used executor's map - void LinkEntryWithExecutor(BlobPtr_t, KeyBlob::iterator) const; - void RemoveShapeEntriesWithExecutor(void) const; - - // Remove all entries from the blob map - void ResetBlobMap(void* ptr); - - // Prevent next ResetBlobMap() - void BlockNextCacheClearing(); - - // Get the ShapeBlob size in cur_mkldnn_session_id. - size_t GetShapeBlobSize() const; - - // Set data to blob (i.e. name/data pair). Create blob if not existing - void SetBlob(const std::string& name, std::shared_ptr data) const; - - // Calculate number of oneDNN objects cached - unsigned int GetCachedObjectsNumber(void) const; - - // Find a saved blob. 
Return nullptr if not found - std::shared_ptr GetBlob(const std::string& name) const; - - static auto tls() -> decltype(MKLDNNDeviceContextThreadLocals::fetch()) { - return MKLDNNDeviceContextThreadLocals::fetch(); - } - - private: - std::shared_ptr p_blobmap_; - // Map key is pointer of executor and value is a data(iterator in map) needed - // to erase - std::shared_ptr p_exec_items_; - std::shared_ptr p_mutex_; - // 0 - clearing is allowed. x > 0 do not clear. - unsigned int block_next_cache_clearing_ = 0; -}; +using MKLDNNDeviceContextThreadLocals = phi::OneDNNContextThreadLocals; +using MKLDNNDeviceContext = phi::OneDNNContext; #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/platform/device_context_test_cuda_graph.cu b/paddle/fluid/platform/device_context_test_cuda_graph.cu new file mode 100644 index 0000000000000..efb0d9ed75689 --- /dev/null +++ b/paddle/fluid/platform/device_context_test_cuda_graph.cu @@ -0,0 +1,39 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "cuda.h" // NOLINT +#include "cuda_runtime.h" // NOLINT +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(Device, DeviceContextWithCUDAGraph) { + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + using paddle::platform::DeviceContext; + using paddle::platform::DeviceContextPool; + using paddle::platform::Place; + + DeviceContextPool& pool = DeviceContextPool::Instance(); + Place place = CUDAPlace(0); + auto* dev_ctx = pool.Get(place); + + paddle::platform::BeginCUDAGraphCapture( + place, cudaStreamCaptureMode::cudaStreamCaptureModeThreadLocal, 0); + ASSERT_EQ(dev_ctx->IsCUDAGraphAllocatorValid(), true); + dev_ctx->GetCUDAGraphAllocator(); + paddle::platform::EndCUDAGraphCapture(); + ASSERT_EQ(dev_ctx->IsCUDAGraphAllocatorValid(), false); +} diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 1fd116600624c..2edccfa90c939 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -25,6 +25,7 @@ using ::paddle::platform::kCPU; using ::paddle::platform::kCUDA; +using ::paddle::platform::kNPU; using ::paddle::platform::kXPU; USE_EVENT(kCPU) @@ -41,3 +42,9 @@ USE_EVENT(kXPU); USE_EVENT_WAIT(kXPU, kXPU) USE_EVENT_WAIT(kCPU, kXPU) #endif + +#ifdef PADDLE_WITH_ASCEND_CL +USE_EVENT(kNPU); +USE_EVENT_WAIT(kNPU, kNPU) +USE_EVENT_WAIT(kCPU, kNPU) +#endif diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index b42721a60d974..a2d3fc1dc3818 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -64,9 +64,9 @@ class DeviceEvent { "Required type < %d, but received type = %d", MaxDeviceTypes, type_id_)); - // TODO(Aurelius84): only support CPU/CUDA, need consider XPU/NPU later 
+ // TODO(Aurelius84): only support CPU/CUDA/XPU/NPU. PADDLE_ENFORCE_LT(type_id_, - 3, + 4, platform::errors::Unavailable( "Currently DeviceEvent do not support %s", place)); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/platform/device_event_npu.cc b/paddle/fluid/platform/device_event_npu.cc new file mode 100644 index 0000000000000..215f308f66348 --- /dev/null +++ b/paddle/fluid/platform/device_event_npu.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_ASCEND_CL + +#include "paddle/fluid/platform/device/npu/npu_resource_pool.h" +#include "paddle/fluid/platform/device_event_base.h" +#include "paddle/fluid/platform/event.h" +namespace paddle { +namespace platform { +struct NPUDeviceEventWrapper { + explicit NPUDeviceEventWrapper(const platform::Place& place) { + PADDLE_ENFORCE_EQ( + platform::is_npu_place(place), + true, + platform::errors::PreconditionNotMet( + "Required device shall be NPUPlace, but received %d. ", place)); + + device_id_ = place.device; + PADDLE_ENFORCE_GT( + device_id_, + -1, + platform::errors::PreconditionNotMet( + "Required DeviceOption.device_id > -1, but received %d. ", + device_id_)); + inner_event_ = NpuEventResourcePool::Instance().New(device_id_); + } + std::shared_ptr inner_event_; + int device_id_; +}; + +void DeviceEventCreateNPU(DeviceEvent* event, + const platform::Place& place, + unsigned int) { + event->InitEvent(std::make_shared(place)); +} + +void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* npu_dev_ctx = dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + npu_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into NPUDeviceContext.")); + NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream()); +} + +bool DeviceEventQueryNPU(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + PADDLE_ENFORCE_NOT_NULL( + wrapper, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast event into NPUDeviceEventWrapper.")); + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(wrapper->inner_event_.get(), &status); + return ACL_EVENT_STATUS_COMPLETE == status; +} + +void DeviceEventFinishNPU(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + NPUEventSynchronize(wrapper->inner_event_.get()); +} + +void DeviceEventNPUWaitNPU(const DeviceEvent* event, + const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* npu_dev_ctx = dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + npu_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into NPUDeviceContext.")); + NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get()); +} + +void DeviceEventCPUWaitNPU(const DeviceEvent* event, + const DeviceContext* context) { + DeviceEventFinishNPU(event); +} 
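A usage sketch, not part of the patch: once the REGISTER_EVENT_* calls below install these callbacks, NPU events are driven through the generic DeviceEvent interface from device_event_base.h. The method names and signatures here are assumptions based on that header, not code from this change.

#include "paddle/fluid/platform/device_event.h"

// Hypothetical helper: wait for work recorded on an NPU device context.
void WaitNpuWork(const paddle::platform::DeviceContext* npu_ctx) {
  paddle::platform::DeviceEvent event(paddle::platform::NPUPlace(0));
  event.Record(npu_ctx);   // dispatches to DeviceEventRecordNPU
  if (!event.Query()) {    // dispatches to DeviceEventQueryNPU
    event.Finish();        // dispatches to DeviceEventFinishNPU (blocking)
  }
}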
+ +void DeviceEventSetFinishedNPU(const DeviceEvent* event) { + // do nothing +} + +void EventResetNPU(const DeviceEvent* event) { + // do nothing +} + +} // namespace platform +} // namespace paddle + +using ::paddle::platform::kCPU; +using ::paddle::platform::kNPU; +REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU) +REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU) +REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU) +REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU) +REGISTER_EVENT_SET_FINISHED_FUNCTION( + kNPU, paddle::platform::DeviceEventSetFinishedNPU) +REGISTER_EVENT_WAIT_FUNCTION(kNPU, + kNPU, + paddle::platform::DeviceEventNPUWaitNPU) +REGISTER_EVENT_WAIT_FUNCTION(kCPU, + kNPU, + paddle::platform::DeviceEventCPUWaitNPU) +REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU) +#endif diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index da93455e8bc7d..756737c1a169f 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -28,6 +28,10 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); #endif +#ifdef CUSPARSE_ROUTINE_EACH_R3 +CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 4f26ce0b27dbf..6b33af9ac10ba 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -66,7 +66,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/to_string.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h index 03986816c53f9..6db5e710b8dc8 100644 --- a/paddle/fluid/platform/flags.h +++ b/paddle/fluid/platform/flags.h @@ -21,7 +21,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/utils/variant.h" namespace paddle { diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index b6f6deb80d67b..6e28c775a38bb 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -264,11 +264,11 @@ void InitDevices(const std::vector devices) { auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (auto &dev_type : device_types) { - auto device_count = phi::DeviceManager::GetDeviceCount(dev_type); + auto device_list = phi::DeviceManager::GetSelectedDeviceList(dev_type); LOG(INFO) << "CustomDevice: " << dev_type - << ", visible devices count: " << device_count; - for (size_t i = 0; i < device_count; i++) { - places.push_back(platform::CustomPlace(dev_type, i)); + << ", visible devices count: " << device_list.size(); + for (auto &dev_id : device_list) { + places.push_back(platform::CustomPlace(dev_type, dev_id)); } } } else { diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 9eede99b7b733..2ea58a7bb0c81 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -29,9 +29,3 @@ limitations under the License. 
*/ #define FLT_MAX __FLT_MAX__ #endif // __FLT_MAX__ #endif // PADDLE_WITH_MUSL - -#if defined(__NVCC__) || defined(__HIPCC__) -#define PADDLE_RESTRICT __restrict__ -#else -#define PADDLE_RESTRICT -#endif diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 83fd353f54dd6..0e97a68edfc9d 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -233,15 +233,21 @@ inline dnnl::memory::format_tag GetMKLDNNFormat(dnnl::memory::desc mem_desc) { if (inner_nblks == 0) { if (strides[0] >= strides[1] && strides[1] >= strides[2] && strides[2] >= strides[3]) { - return dnnl::memory::format_tag::nchw; + return dnnl::memory::format_tag::abcd; + } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && + strides[1] >= strides[0]) { + return dnnl::memory::format_tag::cdba; + } else if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[1]) { + return dnnl::memory::format_tag::acdb; + } else if (strides[0] >= strides[1] && strides[1] >= strides[3] && + strides[3] >= strides[2]) { + return dnnl::memory::format_tag::abdc; } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && strides[1] >= strides[0]) { return dnnl::memory::format_tag::cdba; - } else if (strides[3] >= strides[2] && strides[2] >= strides[0] && - strides[0] >= strides[1]) { - return dnnl::memory::format_tag::dcab; } else { - return dnnl::memory::format_tag::nhwc; + return dnnl::memory::format_tag::dcab; } } else if (inner_nblks == 1) { if (inner_blks[0] == 16 && inner_idxs[0] == 1) { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 05ebedf611a4b..f1963a75b1729 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -20,7 +20,6 @@ limitations under the License. 
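The reworked GetMKLDNNFormat above now reports physical-order tags (abcd, acdb, cdba, abdc, dcab) deduced purely from stride ordering, instead of the logical nchw/nhwc names. A worked example of the stride tests, with values chosen only for illustration:

// Logical dims [N, C, H, W] = [2, 3, 4, 5], no inner blocks:
//   NCHW storage -> strides {60, 20, 5, 1}: monotonically decreasing
//                   -> dnnl::memory::format_tag::abcd
//   NHWC storage -> strides {60, 1, 15, 3}:
//                   strides[0] >= strides[2] >= strides[3] >= strides[1]
//                   -> dnnl::memory::format_tag::acdb (physical order a,c,d,b == N,H,W,C)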
*/ #include #include -#include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/pool_op.h" @@ -691,8 +690,13 @@ class BinaryMKLDNNHandler auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); - this->AcquireForwardPrimitiveDescriptor( - attributes, algo, src0_md, src1_md, dst_md); + if (x->numel() < y->numel()) { + this->AcquireForwardPrimitiveDescriptor( + attributes, algo, src1_md, src0_md, dst_md); + } else { + this->AcquireForwardPrimitiveDescriptor( + attributes, algo, src0_md, src1_md, dst_md); + } } std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { @@ -1009,32 +1013,93 @@ class ActivationMKLDNNHandler } }; -static const dnnl::algorithm AcquireActivationAlgorithm( - std::string activation_name) { - std::unordered_map activation_map = { - {"abs", dnnl::algorithm::eltwise_abs}, - {"clip", dnnl::algorithm::eltwise_clip}, - {"gelu", dnnl::algorithm::eltwise_gelu_erf}, - {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, - {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, - {"hard_swish", dnnl::algorithm::eltwise_hardswish}, - {"leaky_relu", dnnl::algorithm::eltwise_relu}, - {"mish", dnnl::algorithm::eltwise_mish}, - {"relu", dnnl::algorithm::eltwise_relu}, - {"relu6", dnnl::algorithm::eltwise_bounded_relu}, - {"sigmoid", dnnl::algorithm::eltwise_logistic}, - {"sqrt", dnnl::algorithm::eltwise_sqrt}, - {"swish", dnnl::algorithm::eltwise_swish}, - {"tanh", dnnl::algorithm::eltwise_tanh}}; - - const auto& activation_type = activation_map.find(activation_name); - - PADDLE_ENFORCE_NE(activation_type, - activation_map.end(), - platform::errors::InvalidArgument( - "Activation '%s' not found in oneDNN algorithms mapper", - activation_name)); - return activation_type->second; +static void AppendActivation(const framework::ExecutionContext& ctx, + dnnl::post_ops& post_ops, + float activation_scale = 1.0f) { + const auto invalid_attribute = + ctx.HasAttr("fuse_activation") + ? ctx.Attr("fuse_activation").empty() + : true; + if (invalid_attribute) return; + + const auto fuse_activation = ctx.Attr("fuse_activation"); + const auto fuse_alpha = + ctx.HasAttr("fuse_alpha") ? ctx.Attr("fuse_alpha") : 0.0f; + const auto fuse_beta = + ctx.HasAttr("fuse_beta") ? 
ctx.Attr("fuse_beta") : 0.0f; + + if (fuse_activation == "hard_sigmoid") { + post_ops.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_linear, + fuse_alpha, + fuse_beta); + post_ops.append_eltwise( + activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); + } else { + const std::unordered_map activation_map = { + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"hard_swish", dnnl::algorithm::eltwise_hardswish}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"mish", dnnl::algorithm::eltwise_mish}, + {"relu", dnnl::algorithm::eltwise_relu}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"tanh", dnnl::algorithm::eltwise_tanh}}; + + const auto& activation_type = activation_map.find(fuse_activation); + + PADDLE_ENFORCE_NE( + activation_type, + activation_map.end(), + platform::errors::InvalidArgument( + "Activation '%s' not found in oneDNN algorithms mapper", + fuse_activation)); + + post_ops.append_eltwise( + activation_scale, activation_type->second, fuse_alpha, fuse_beta); + } +} + +static std::unordered_map GetAttributeMap( + std::string act_type) { + std::unordered_map attr_map; + if (act_type == "swish") + attr_map.emplace("beta", "fuse_alpha"); + else if (act_type == "relu6") + attr_map.emplace("threshold", "fuse_alpha"); + else if (act_type == "hard_sigmoid") { + attr_map.emplace("slope", "fuse_alpha"); + attr_map.emplace("offset", "fuse_beta"); + } else if (act_type == "clip") { + attr_map.emplace("min", "fuse_alpha"); + attr_map.emplace("max", "fuse_beta"); + } else { + attr_map.emplace("alpha", "fuse_alpha"); + attr_map.emplace("beta", "fuse_beta"); + } + return attr_map; +} + +static std::vector GetSupportedActivations() { + return std::vector{"abs", + "clip", + "gelu", + "hard_sigmoid", + "hard_swish", + "leaky_relu", + "mish", + "relu", + "relu6", + "sigmoid", + "sqrt", + "swish", + "tanh"}; } class ReorderMKLDNNHandler { diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h index 12c48ed412428..38470d18f4623 100644 --- a/paddle/fluid/platform/mkldnn_utils.h +++ b/paddle/fluid/platform/mkldnn_utils.h @@ -51,15 +51,21 @@ inline dnnl::memory::format_tag GetMKLDNNFormat(dnnl::memory::desc mem_desc) { if (inner_nblks == 0) { if (strides[0] >= strides[1] && strides[1] >= strides[2] && strides[2] >= strides[3]) { - return dnnl::memory::format_tag::nchw; + return dnnl::memory::format_tag::abcd; + } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && + strides[1] >= strides[0]) { + return dnnl::memory::format_tag::cdba; + } else if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[1]) { + return dnnl::memory::format_tag::acdb; + } else if (strides[0] >= strides[1] && strides[1] >= strides[3] && + strides[3] >= strides[2]) { + return dnnl::memory::format_tag::abdc; } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && strides[1] >= strides[0]) { return dnnl::memory::format_tag::cdba; - } else if (strides[3] >= strides[2] && strides[2] >= strides[0] && - strides[0] >= strides[1]) { - return dnnl::memory::format_tag::dcab; } else { - return dnnl::memory::format_tag::nhwc; + return dnnl::memory::format_tag::dcab; } } else if (inner_nblks == 1) { if 
(inner_blks[0] == 16 && inner_idxs[0] == 1) { diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index e7612f6dcb6cd..a0c1129d2cb87 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -26,6 +26,8 @@ #include "glog/logging.h" +#include "paddle/phi/core/macros.h" + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index d544cdecc3994..cde17007715a6 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -18,7 +18,7 @@ limitations under the License. */ // #include #include "paddle/fluid/platform/enforce.h" -// #include "paddle/fluid/platform/variant.h" +// #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/enforce_npu.h" #endif diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index ec33e9e819869..d02fd54578862 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -308,6 +308,12 @@ RecordOpInfoSupplement::RecordOpInfoSupplement( PosixInNsec(), type, input_shapes, dtypes, callstack); } +std::map>> + RecordMemEvent::size_cache; + +std::map> + RecordMemEvent::has_initialized; + RecordMemEvent::RecordMemEvent(const void *ptr, const phi::Place &place, size_t size, @@ -323,17 +329,75 @@ RecordMemEvent::RecordMemEvent(const void *ptr, uint64_t peak_reserved = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - current_allocated = - HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true; + } else { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] = + current_allocated; + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] = + peak_allocated; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + } + } else { - current_allocated = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + 
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] = + current_allocated; + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] = + peak_allocated; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + } } - platform::MemEvenRecorder::Instance().PushMemRecord(ptr, place, size, @@ -349,17 +413,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, uint64_t peak_allocated = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - current_reserved = - HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true; + } else { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] = + current_reserved; + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] = + peak_reserved; + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + } } else { - current_reserved = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + if 
(RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] = + current_reserved; + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] = + peak_reserved; + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + } } - platform::MemEvenRecorder::Instance().PushMemRecord(ptr, place, size, @@ -375,17 +496,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, uint64_t peak_reserved = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - current_allocated = - HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true; + } else { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] = + current_allocated; + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] = + peak_allocated; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + } } else { - current_allocated = - 
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] = + current_allocated; + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] = + peak_allocated; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + } } - platform::MemEvenRecorder::Instance().PopMemRecord(ptr, place, size, @@ -401,17 +579,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, uint64_t peak_allocated = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - current_reserved = - HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true; + } else { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] = + current_reserved; + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] = + peak_reserved; + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; 
+ peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + } } else { - current_reserved = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] = + current_reserved; + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] = + peak_reserved; + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + } } - platform::MemEvenRecorder::Instance().PopMemRecord(ptr, place, size, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 4773b1a177ba0..6046e54b6c876 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -28,7 +28,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" @@ -39,6 +38,10 @@ limitations under the License. */ namespace paddle { namespace platform { +namespace proto { +class Profile; +} + const int kEnableProfiler = 1; const int kDisableProfiler = 2; diff --git a/paddle/fluid/platform/profiler/mem_tracing.h b/paddle/fluid/platform/profiler/mem_tracing.h index 3d3508c7bd570..5b2a2391c2e79 100644 --- a/paddle/fluid/platform/profiler/mem_tracing.h +++ b/paddle/fluid/platform/profiler/mem_tracing.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/platform/place.h" @@ -37,6 +38,17 @@ class RecordMemEvent { const Place& place, size_t size, const TracerMemEventType type = TracerMemEventType::Allocate); + + // size_cache: In the outer map, key is device type, 'cpu' or 'gpu', and in + // the inner map, key is device ip. + // Values record memory sizes for current_allocated, current_reserved, + // peak_allocated and peak_reserved. + // has_initialized: Flags to denote whether memory cache for some device has + // collected once. 
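The size_cache vectors read and written throughout RecordMemEvent above follow a fixed slot order; the patch indexes them with the raw literals 0-3, summarized here with hypothetical names for readability:

// Slot layout of each size_cache[device_type][device_id] vector.
enum MemSizeSlot {
  kCurrentAllocated = 0,
  kCurrentReserved = 1,
  kPeakAllocated = 2,
  kPeakReserved = 3
};
// e.g. size_cache["gpu"][dev_id][kPeakReserved] is what the code above reads as [...][3].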
+ + static std::map>> + size_cache; + static std::map> has_initialized; }; } // namespace platform diff --git a/paddle/fluid/platform/stream/CMakeLists.txt b/paddle/fluid/platform/stream/CMakeLists.txt index 25d2874ca04d2..32c1857bf6903 100644 --- a/paddle/fluid/platform/stream/CMakeLists.txt +++ b/paddle/fluid/platform/stream/CMakeLists.txt @@ -2,5 +2,5 @@ if(WITH_GPU OR WITH_ROCM) cc_library( cuda_stream SRCS cuda_stream.cc - DEPS enforce boost eigen3 ${MKLDNN_CTX_DEPS}) + DEPS enforce eigen3 ${MKLDNN_CTX_DEPS}) endif() diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 45756372e2291..575415ef89023 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -69,30 +69,6 @@ struct Transform { }; // NOTE: After the phi kernel is migrated, it needs to be deleted. -template <> -struct Transform { - template - void operator()(const platform::CPUDeviceContext& context, - InputIter first, - InputIter last, - OutputIter result, - UnaryOperation op) { - std::transform(first, last, result, op); - } - - template - void operator()(const platform::CPUDeviceContext& context, - InputIter1 first1, - InputIter1 last1, - InputIter2 first2, - OutputIter result, - BinaryOperation op) { - std::transform(first1, last1, first2, result, op); - } -}; template <> struct Transform { diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 1caa2e8770772..5e0717ba635ce 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -39,17 +39,17 @@ class Multiply { using paddle::memory::Alloc; using paddle::memory::Copy; -using paddle::platform::CPUDeviceContext; using paddle::platform::CPUPlace; using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; +using phi::CPUContext; using paddle::platform::Transform; TEST(Transform, CPUUnary) { - CPUDeviceContext ctx; + CPUContext ctx; float buf[4] = {0.1, 0.2, 0.3, 0.4}; - Transform trans; + Transform trans; trans(ctx, buf, buf + 4, buf, Scale(10)); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); @@ -78,8 +78,8 @@ TEST(Transform, GPUUnary) { TEST(Transform, CPUBinary) { int buf[4] = {1, 2, 3, 4}; - Transform trans; - CPUDeviceContext ctx; + Transform trans; + phi::CPUContext ctx; trans(ctx, buf, buf + 4, buf, buf, Multiply()); for (int i = 0; i < 4; ++i) { ASSERT_EQ((i + 1) * (i + 1), buf[i]); diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h deleted file mode 100644 index fb4772abd3062..0000000000000 --- a/paddle/fluid/platform/variant.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// Boost 1.41.0 requires __CUDACC_VER__, but in CUDA 9 __CUDACC_VER__ -// is removed, so we have to manually define __CUDACC_VER__ instead. 
-// For details, please refer to -// https://github.com/PaddlePaddle/Paddle/issues/6626 -#if defined(__CUDACC__) && defined(__CUDACC_VER_MAJOR__) -#undef __CUDACC_VER__ -#define __CUDACC_VER__ \ - __CUDACC_VER_BUILD__ + __CUDACC_VER_MAJOR__ * 10000 + \ - __CUDACC_VER_MINOR__ * 100 -#endif - -#include "boost/config.hpp" - -// Because Boost 1.41.0's variadic templates has bug on nvcc, boost -// will disable variadic template support in NVCC mode. Define -// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same -// function symbols. For details, -// https://github.com/PaddlePaddle/Paddle/issues/3386 -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES -#define BOOST_NO_CXX11_VARIADIC_TEMPLATES -#endif -#endif - -#include -#include -#include - -#include "paddle/utils/any.h" -#include "paddle/utils/optional.h" - -// some platform-independent defintion -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore index 6869b6841a8a6..a6f20e21801f7 100644 --- a/paddle/fluid/pybind/.gitignore +++ b/paddle/fluid/pybind/.gitignore @@ -1,5 +1,11 @@ pybind.h -op_function_impl.h -eager_op_function_impl.h -eager_final_state_op_function_impl.h -tmp_eager_final_state_op_function_impl.h +op_function1.cc +op_function2.cc +op_function3.cc +op_function4.cc +op_function5.cc +op_function6.cc +op_function7.cc +op_function8.cc +eager_op_function.cc +eager_final_state_op_function.cc diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 2b7e12499976e..63ebffe9f25f1 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -38,7 +38,8 @@ set(PYBIND_DEPS global_utils phi_utils tcp_store - new_profiler) + new_profiler + jit_layer) if(WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -83,10 +84,6 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) - set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) - if(WITH_GPU) - set(PYBIND_DEPS ${PYBIND_DEPS} cuda_ipc_allocator) - endif() if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) @@ -100,11 +97,19 @@ endif() set(PYBIND_SRCS pybind.cc - exception.cc + imperative.cc + inference_api.cc + ir.cc + bind_fleet_executor.cc + reader_py.cc protobuf.cc + exception.cc + op_function_common.cc + parallel_executor.cc + tensor.cc + place.cc const_value.cc global_value_getter_setter.cc - reader_py.cc fleet_wrapper_py.cc heter_wrapper_py.cc ps_gpu_wrapper_py.cc @@ -112,16 +117,21 @@ set(PYBIND_SRCS box_helper_py.cc metrics_py.cc data_set_py.cc - imperative.cc - ir.cc bind_cost_model.cc - bind_fleet_executor.cc - inference_api.cc compatible.cc io.cc generator_py.cc communication.cc - cuda_streams_py.cc) + cuda_streams_py.cc + jit.cc + op_function1.cc + op_function2.cc + op_function3.cc + op_function4.cc + op_function5.cc + op_function6.cc + op_function7.cc + op_function8.cc) if(WITH_CUSTOM_DEVICE) set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) @@ -187,7 +197,8 @@ if(WITH_PSCORE) set_source_files_properties( fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) list(APPEND PYBIND_DEPS fleet communicator index_wrapper index_sampler) - list(APPEND PYBIND_SRCS fleet_py.cc) + list(APPEND PYBIND_SRCS) + set(PYBIND_SRCS fleet_py.cc ${PYBIND_SRCS}) endif() if(WITH_NCCL OR WITH_RCCL) @@ -257,12 +268,35 @@ if(WITH_PYTHON) 
target_link_libraries(kernel_signature_generator ${ROCM_HIPRTC_LIB}) endif() - set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h) - set(tmp_impl_file ${impl_file}.tmp) + set(op_function_output_path ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/) + set(impl_file1 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function1.cc) + set(tmp_impl_file1 ${impl_file1}.tmp) + set(impl_file2 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function2.cc) + set(tmp_impl_file2 ${impl_file2}.tmp) + set(impl_file3 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function3.cc) + set(tmp_impl_file3 ${impl_file3}.tmp) + set(impl_file4 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function4.cc) + set(tmp_impl_file4 ${impl_file4}.tmp) + set(impl_file5 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function5.cc) + set(tmp_impl_file5 ${impl_file5}.tmp) + set(impl_file6 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function6.cc) + set(tmp_impl_file6 ${impl_file6}.tmp) + set(impl_file7 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function7.cc) + set(tmp_impl_file7 ${impl_file7}.tmp) + set(impl_file8 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function8.cc) + set(tmp_impl_file8 ${impl_file8}.tmp) + set(CODE_GEN_SPLIT_FILE_COUNT "8") set(eager_impl_file - ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h) + ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc) set(tmp_eager_impl_file ${eager_impl_file}.tmp) + execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/" + "${CODE_GEN_SPLIT_FILE_COUNT}") + set(OP_IMPL_DEPS op_function_generator) set(EAGER_OP_IMPL_DEPS eager_op_function_generator eager_final_state_python_c_codegen) @@ -282,7 +316,7 @@ if(WITH_PYTHON) ":retry\n" "ECHO op_function_generator run %build_times% time\n" "taskkill /f /im op_function_generator.exe 2>NUL\n" - "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n" + "${op_impl_path}/op_function_generator.exe ${op_function_output_path} ${CODE_GEN_SPLIT_FILE_COUNT}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" " if %build_times% GEQ 10 (\n" @@ -357,12 +391,33 @@ if(WITH_PYTHON) endif() add_custom_command( - OUTPUT ${impl_file} + OUTPUT op_function COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} - ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1} + ${impl_file1} + COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2} + ${impl_file2} + COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3} + ${impl_file3} + COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4} + ${impl_file4} + COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5} + ${impl_file5} + COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6} + ${impl_file6} + COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7} + ${impl_file7} + COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}" + COMMAND 
${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8} + ${impl_file8} + COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}" DEPENDS ${OP_IMPL_DEPS}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_command( @@ -421,13 +476,35 @@ if(WITH_PYTHON) list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) endif() add_custom_command( - OUTPUT ${impl_file} + OUTPUT op_function COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." - "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} - ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" + "${op_function_output_path}" "${CODE_GEN_SPLIT_FILE_COUNT}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1} + ${impl_file1} + COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2} + ${impl_file2} + COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3} + ${impl_file3} + COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4} + ${impl_file4} + COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5} + ${impl_file5} + COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6} + ${impl_file6} + COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7} + ${impl_file7} + COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8} + ${impl_file8} + COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}" DEPENDS ${OP_IMPL_DEPS} VERBATIM) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) @@ -444,45 +521,39 @@ if(WITH_PYTHON) VERBATIM) endif() endif() - add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) + add_custom_target(op_function_generator_cmd ALL DEPENDS op_function) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) endif() - list(APPEND PYBIND_DEPS interpretercore standalone_executor - staticgraph_executor_statistics) - cc_library( - op_function_common - SRCS op_function_common.cc - DEPS ${PYBIND_DEPS}) - list(APPEND PYBIND_DEPS op_function_common) + list(APPEND PYBIND_DEPS standalone_executor staticgraph_executor_statistics) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - cc_library( - paddle_eager - SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc - eager_utils.cc eager_py_layer.cc - DEPS eager_api - autograd_meta - backward - grad_node_info - phi - op_function_common - final_dygraph_function - final_dygraph_node - dygraph_function - dygraph_node - accumulation_node - py_layer_node - global_utils - utils - python - custom_operator - custom_operator_node) - add_dependencies(paddle_eager eager_codegen) - add_dependencies(paddle_eager eager_op_function_generator_cmd) - list(APPEND PYBIND_DEPS paddle_eager) + set(PYBIND_SRCS eager.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_functions.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_method.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_properties.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_utils.cc 
${PYBIND_SRCS}) + set(PYBIND_SRCS eager_py_layer.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_op_function.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_final_state_op_function.cc ${PYBIND_SRCS}) + list(APPEND PYBIND_DEPS eager_api) + list(APPEND PYBIND_DEPS autograd_meta) + list(APPEND PYBIND_DEPS backward) + list(APPEND PYBIND_DEPS grad_node_info) + list(APPEND PYBIND_DEPS phi) + list(APPEND PYBIND_DEPS final_dygraph_function) + list(APPEND PYBIND_DEPS final_dygraph_node) + list(APPEND PYBIND_DEPS dygraph_function) + list(APPEND PYBIND_DEPS dygraph_node) + list(APPEND PYBIND_DEPS accumulation_node) + list(APPEND PYBIND_DEPS py_layer_node) + list(APPEND PYBIND_DEPS global_utils) + list(APPEND PYBIND_DEPS utils) + list(APPEND PYBIND_DEPS python) + list(APPEND PYBIND_DEPS custom_operator) + list(APPEND PYBIND_DEPS custom_operator_node) endif() cc_library( @@ -490,6 +561,11 @@ if(WITH_PYTHON) SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) + add_dependencies(paddle_pybind eager_codegen) + add_dependencies(paddle_pybind eager_op_function_generator_cmd) + endif() + if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) endif() diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index d5f84d7382105..e1950ade92fb2 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -35,7 +35,7 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/pybind/data_set_py.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index bdaebf13f8d2a..b8d5a0de820e7 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -225,6 +225,30 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "alltoall_single", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + std::vector in_sizes, + std::vector out_sizes) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + return self.AllToAll_Single( + in_tensors, out_tensors, in_sizes, out_sizes); + }, + py::arg("in"), + py::arg("out"), + py::arg("in_sizes"), + py::arg("out_sizes"), + py::call_guard()) + .def( "reduce", [](distributed::ProcessGroup &self, @@ -244,7 +268,6 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::arg("op") = distributed::ReduceOp::SUM, py::call_guard()) - .def( "scatter", [](distributed::ProcessGroup &self, @@ -266,23 +289,50 @@ void BindDistributed(py::module *m) { py::arg("in"), py::arg("out"), py::arg("src"), + py::call_guard()) + .def( + "_reduce_scatter_base", + [](distributed::ProcessGroup &self, + py::handle py_out_tensor, + py::handle py_in_tensor, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + distributed::ReduceScatterOptions opts; + opts.reduce_op = op; + auto dense_out = std::dynamic_pointer_cast( + out_tensor.impl()); + auto dense_in = 
std::dynamic_pointer_cast( + in_tensor.impl()); + return self._ReduceScatterBase(*dense_out, *dense_in, opts); + }, + py::arg("out_tensor"), + py::arg("in_tensor"), + py::arg("op") = distributed::ReduceOp::SUM, py::call_guard()); #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) - py::class_>( - *m, "ProcessGroupNCCL", ProcessGroup) - .def(py::init &, - int, - int, - const platform::CUDAPlace &, - int>(), - py::arg("store"), - py::arg("rank"), - py::arg("world_size"), - py::arg("place"), - py::arg("group_id") = 0, - py::call_guard()); + auto processGroupNCCL = + py::class_>( + *m, "ProcessGroupNCCL", ProcessGroup) + .def(py::init &, + int, + int, + const platform::CUDAPlace &, + int>(), + py::arg("store"), + py::arg("rank"), + py::arg("world_size"), + py::arg("place"), + py::arg("group_id") = 0, + py::call_guard()); + + processGroupNCCL.def_static( + "group_start", []() { distributed::ProcessGroupNCCL::GroupStart(); }); + processGroupNCCL.def_static( + "group_end", []() { distributed::ProcessGroupNCCL::GroupEnd(); }); + #endif #if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 7f54f472bdcd5..03aace9b78e38 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #include "paddle/fluid/framework/python_headers.h" -#include "paddle/fluid/pybind/eager_op_function_impl.h" +#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/string_tensor.h" @@ -98,10 +98,11 @@ void EmptyTensorInitializer(TensorObject* self, } if (!autograd_meta->GetMutableGradNode()) { - VLOG(3) << "Tensor(" << name - << ") have not GradNode, add GradNodeAccumulation for it."; autograd_meta->SetGradNode( std::make_shared(autograd_meta)); + VLOG(3) << "Tensor(" << name + << ") have not GradNode, add GradNodeAccumulation" + << autograd_meta->GradNode() << " for it."; } } diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h index db2b438c3bd94..5560744ae1d49 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -40,6 +40,7 @@ void BindEager(pybind11::module* m); void BindEagerStringTensor(pybind11::module* module); void BindFunctions(PyObject* module); void BindEagerPyLayer(PyObject* module); - +void BindEagerOpFunctions(pybind11::module* module); +void BindFinalStateEagerOpFunctions(pybind11::module* module); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index 86586123ee46c..7ed58a1e956f6 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -15,8 +15,12 @@ #include +#include "paddle/fluid/eager/to_static/run_program_op_func.h" #include "paddle/phi/core/enforce.h" +namespace paddle { +namespace pybind { + static PyObject *eager_api_run_program(PyObject *self, PyObject *args, PyObject *kwargs) { @@ -57,55 +61,12 @@ static PyObject *eager_api_run_program(PyObject *self, } } -static PyObject *eager_api_final_state_linear(PyObject *self, - PyObject *args, - PyObject *kwargs) { - PyThreadState *tstate = nullptr; - try { - auto x = GetTensorFromArgs("linear", "X", args, 0, false); - auto weight = GetTensorFromArgs("linear", "weight", args, 1, false); - auto 
bias = GetTensorFromArgs("linear", "Bias", args, 2, true); - tstate = PyEval_SaveThread(); - if (bias.initialized()) { - auto mm_out = - matmul_final_state_dygraph_function(x, weight, false, false); - auto out = add_final_state_dygraph_function(mm_out, bias); - PyEval_RestoreThread(tstate); - tstate = nullptr; - return ToPyObject(out); - } else { - auto mm_out = - matmul_final_state_dygraph_function(x, weight, false, false); - PyEval_RestoreThread(tstate); - tstate = nullptr; - return ToPyObject(mm_out); - } - } catch (paddle::platform::EnforceNotMet &exception) { - if (tstate) { - PyEval_RestoreThread(tstate); - } - std::ostringstream sout; - sout << exception.what(); - sout << " [operator < linear > error]"; - exception.set_error_str(sout.str()); - ThrowExceptionToPython(std::current_exception()); - return nullptr; - } catch (...) { - if (tstate) { - PyEval_RestoreThread(tstate); - } - ThrowExceptionToPython(std::current_exception()); - return nullptr; - } -} - -static PyMethodDef CustomEagerFinalStateMethods[] = { +static PyMethodDef CustomEagerMethods[] = { {"run_program", (PyCFunction)(void (*)(void))eager_api_run_program, METH_VARARGS | METH_KEYWORDS, "C++ interface function for run_program in dygraph."}, - {"final_state_linear", - (PyCFunction)(void (*)(void))eager_api_final_state_linear, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for run_program in dygraph."}, {nullptr, nullptr, 0, nullptr}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_final_state_custom_python_api.h b/paddle/fluid/pybind/eager_final_state_custom_python_api.h new file mode 100644 index 0000000000000..4774b33a722d5 --- /dev/null +++ b/paddle/fluid/pybind/eager_final_state_custom_python_api.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
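[Editor's note] The handwritten final_state_linear binding removed above reappears unchanged in the new eager_final_state_custom_python_api.h created just below. For orientation, its behaviour is simply a matmul followed by an optional bias add; the sketch below expresses the same computation with public paddle ops (the helper name linear_equivalent is illustrative, not part of the patch):

    import paddle

    def linear_equivalent(x, weight, bias=None):
        # matmul(x, weight) with no transposes, then an optional bias add,
        # mirroring the two branches of eager_api_final_state_linear.
        mm_out = paddle.matmul(x, weight, transpose_x=False, transpose_y=False)
        if bias is not None:
            return paddle.add(mm_out, bias)
        return mm_out

    x = paddle.randn([4, 8])
    w = paddle.randn([8, 16])
    b = paddle.randn([16])
    print(linear_equivalent(x, w, b).shape)  # [4, 16]
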
+#pragma once + +#include + +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace pybind { + +static PyObject *eager_api_final_state_linear(PyObject *self, + PyObject *args, + PyObject *kwargs) { + PyThreadState *tstate = nullptr; + try { + auto x = GetTensorFromArgs("linear", "X", args, 0, false); + auto weight = GetTensorFromArgs("linear", "weight", args, 1, false); + auto bias = GetTensorFromArgs("linear", "Bias", args, 2, true); + tstate = PyEval_SaveThread(); + if (bias.initialized()) { + auto mm_out = + matmul_final_state_dygraph_function(x, weight, false, false); + auto out = add_final_state_dygraph_function(mm_out, bias); + PyEval_RestoreThread(tstate); + tstate = nullptr; + return ToPyObject(out); + } else { + auto mm_out = + matmul_final_state_dygraph_function(x, weight, false, false); + PyEval_RestoreThread(tstate); + tstate = nullptr; + return ToPyObject(mm_out); + } + } catch (paddle::platform::EnforceNotMet &exception) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + std::ostringstream sout; + sout << exception.what(); + sout << " [operator < linear > error]"; + exception.set_error_str(sout.str()); + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } catch (...) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + +static PyMethodDef CustomEagerFinalStateMethods[] = { + {"final_state_linear", + (PyCFunction)(void (*)(void))eager_api_final_state_linear, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for run_program in dygraph."}, + {nullptr, nullptr, 0, nullptr}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index f256787805a0f..3fe2cb170d796 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -357,6 +357,19 @@ static std::vector CastAttrsToTragetType( return res; } +static PyObject* eager_api_jit_function_call(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + std::shared_ptr function = + CastPyArg2BaseFunction(PyTuple_GET_ITEM(args, 0), 0); + std::vector ins = + CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); + std::vector outs = (*function)(ins); + return ToPyObject(outs); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, PyObject* kwargs) { @@ -911,6 +924,10 @@ PyMethodDef variable_functions[] = { (PyCFunction)(void (*)(void))eager_api_read_next_tensor_list, METH_VARARGS | METH_KEYWORDS, NULL}, + {"jit_function_call", + (PyCFunction)(void (*)(void))eager_api_jit_function_call, + METH_VARARGS | METH_KEYWORDS, + NULL}, /**sparse functions**/ {"sparse_coo_tensor", (PyCFunction)(void (*)(void))eager_api_sparse_coo_tensor, diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 77e196291143c..086c15dafdf22 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1473,6 +1473,27 @@ static PyObject* tensor_method_get_map_tensor(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_get_non_zero_nums(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE( + self->tensor.is_sparse_coo_tensor() || + self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal("this method is only effective for " + "SparseCooTensor or SparseCsrTensor")); + if 
(self->tensor.is_sparse_coo_tensor()) { + auto sparse_coo_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + return ToPyObject(sparse_coo_tensor->nnz()); + } else { + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + return ToPyObject(sparse_csr_tensor->nnz()); + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -1962,6 +1983,10 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, /***the method of sparse tensor****/ + {"nnz", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_nums, + METH_VARARGS | METH_KEYWORDS, + NULL}, {"indices", (PyCFunction)(void (*)(void))tensor_method_get_non_zero_indices, METH_VARARGS | METH_KEYWORDS, diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 7d84124a264a0..72c12b267d1c9 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -138,8 +138,6 @@ const char* PYBIND_ITEM_TEMPLATE = R"( {"%s", (PyCFunction)(void(*)(void))%s, M // These operators will skip automatical code generatrion and // need to be handwritten in CUSTOM_HANDWRITE_OP_FUNC_FILE std::unordered_set CUSTOM_HANDWRITE_OPS_SET = {"run_program"}; -const char* CUSTOM_HANDWRITE_OP_FUNC_FILE = - "#include \"paddle/fluid/pybind/eager_custom_python_api.h\"\n"; // clang-format on static inline bool FindInsMap(const std::string& op_type, @@ -413,7 +411,6 @@ GenerateOpFunctions() { std::vector op_function_list, bind_function_list; auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); - bool append_custom_head_file = false; for (auto& pair : op_info_map) { auto& op_info = pair.second; auto op_proto = op_info.proto_; @@ -423,7 +420,6 @@ GenerateOpFunctions() { auto& op_type = op_proto->type(); // Skip operators that will be handwriten in CUSTOM_HANDWRITE_OP_FUNC_FILE. 
if (CUSTOM_HANDWRITE_OPS_SET.count(op_type)) { - append_custom_head_file = true; continue; } // Skip operator which is not inherit form OperatorWithKernel, like while, @@ -480,9 +476,7 @@ GenerateOpFunctions() { bind_function_list.emplace_back(std::move(inplace_bind_function_str)); } } - if (append_custom_head_file) { - op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE); - } + return std::make_tuple(op_function_list, bind_function_list); } @@ -498,18 +492,19 @@ int main(int argc, char* argv[]) { #endif std::vector headers{ - "\"pybind11/detail/common.h\"", - "\"paddle/fluid/pybind/eager_final_state_op_function_impl.h\"", - "\"paddle/fluid/pybind/op_function_common.h\"", + "", + "\"paddle/fluid/platform/enforce.h\"", "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"", + "\"paddle/fluid/pybind/eager_utils.h\"", + "\"paddle/fluid/platform/profiler/event_tracing.h\"", "\"paddle/fluid/pybind/exception.h\"", - ""}; + "\"paddle/fluid/pybind/op_function_common.h\"", + "\"paddle/fluid/pybind/eager_custom_python_api.h\"", + "\"paddle/fluid/pybind/eager.h\""}; std::ofstream out(argv[1], std::ios::out); - out << "#pragma once\n\n"; - for (auto& header : headers) { out << "#include " + header + "\n"; } @@ -542,22 +537,20 @@ int main(int argc, char* argv[]) { << core_ops_infos_registry << "\n {nullptr,nullptr,0,nullptr}" << "};\n\n"; - out << "inline void BindEagerOpFunctions(pybind11::module *module) {\n" + out << "void BindEagerOpFunctions(pybind11::module *module) {\n" << " InitOpsAttrTypeMap();\n" << " auto m = module->def_submodule(\"ops\");\n" << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " "core.eager.ops failed!\"));\n" << " }\n\n" - << " if (PyModule_AddFunctions(m.ptr(), EagerFinalStateMethods) < 0) {\n" - << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " - "core.eager.ops failed!\"));\n" - << " }\n\n" - << " if (PyModule_AddFunctions(m.ptr(), CustomEagerFinalStateMethods) < " + << " if (PyModule_AddFunctions(m.ptr(), CustomEagerMethods) < " "0) {\n" << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " "core.eager.ops failed!\"));\n" << " }\n\n" + + << " BindFinalStateEagerOpFunctions(&m);\n\n" << "}\n\n" << "} // namespace pybind\n" << "} // namespace paddle\n"; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index a7f11fc963ebe..12e262b3f7cb5 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -95,6 +95,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY VLOG(6) << "Get grad for tensor: " << self->tensor.name(); auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); + VLOG(6) << meta << " initialized: " << meta->Grad().initialized(); if (meta && meta->Grad().initialized()) { return ToPyObject(meta->Grad()); } else { diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9e8065a6a438a..185b81677125d 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -51,6 +51,7 @@ extern PyTypeObject* g_customplace_pytype; extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_lodtensorarray_pytype; extern PyTypeObject* g_custom_op_kernel_ctx_pytype; +extern PyTypeObject* g_executor_function_pytype; int TensorDtype2NumpyDtype(phi::DataType dtype) { switch (dtype) { @@ -227,6 +228,21 @@ std::shared_ptr 
CastPyArg2VarBase(PyObject* obj, return py::cast>(obj); } +std::shared_ptr CastPyArg2BaseFunction(PyObject* obj, + ssize_t arg_pos) { + if (PyObject_IsInstance( + obj, reinterpret_cast(g_executor_function_pytype))) { + return ::pybind11::handle(obj) + .cast>(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "BaseFunction, but got %s", + arg_pos + 1, + reinterpret_cast(obj->ob_type)->tp_name)); + } +} + std::vector CastPyArg2VectorOfTensor( PyObject* obj, ssize_t arg_pos) { std::vector result; diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 25dcd91bed0d1..b97dcb9cddbec 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -19,6 +19,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/jit/executor_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -72,6 +73,8 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, std::unordered_map CastPyArg2Vocab(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos); +std::shared_ptr CastPyArg2BaseFunction(PyObject* obj, + ssize_t arg_pos); PyObject* ToPyObject(int value); PyObject* ToPyObject(uint32_t value); diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index 05028a9b70efb..8626659d8633a 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/pybind/fleet_wrapper_py.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/generate_file_structures.py b/paddle/fluid/pybind/generate_file_structures.py new file mode 100644 index 0000000000000..bc61ecdcc96f5 --- /dev/null +++ b/paddle/fluid/pybind/generate_file_structures.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
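[Editor's note] CastPyArg2BaseFunction above is what lets the new eager_api_jit_function_call entry accept a jit ExecutorFunction from Python. A rough sketch of how the pieces are meant to be driven together is shown below; the module paths (core.Load, core.eager.jit_function_call), the saved-model path, and the "forward" function name are assumptions inferred from the bindings in this patch, not a documented public API:

    import paddle
    from paddle.fluid import core

    # Load a jit Layer saved earlier; path and function name are illustrative.
    layer = core.Load("/path/to/inference_model", paddle.CPUPlace())
    func = layer.function_dict()["forward"]          # a jit ExecutorFunction object

    x = paddle.randn([2, 4])
    outs = core.eager.jit_function_call(func, [x])   # returns a list of output Tensors
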
+ +import sys +import os + +if __name__ == "__main__": + assert len(sys.argv) == 3 + pybind_dir = sys.argv[1] + split_count = int(sys.argv[2]) + + empty_files = [os.path.join(pybind_dir, "eager_final_state_op_function.cc")] + empty_files.append(os.path.join(pybind_dir, "eager_op_function.cc")) + + for i in range(split_count): + empty_files.append( + os.path.join(pybind_dir, "op_function" + str(i + 1) + ".cc")) + + for path in empty_files: + if not os.path.exists(path): + open(path, 'a').close() diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 2871d1de56780..c45566ba35673 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/macros.h" #include "pybind11/stl.h" // FIXME(zengjinle): these 2 flags may be removed by the linker when compiling @@ -217,7 +218,7 @@ void BindGlobalValueGetterSetter(pybind11::module *module) { GlobalVarGetterSetterRegistry::CreateSetter(&var)); \ } while (0) -struct RegisterGetterSetterVisitor : public boost::static_visitor { +struct RegisterGetterSetterVisitor { RegisterGetterSetterVisitor(const std::string &name, bool is_writable, void *value_ptr) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index ab9fb236dbbcc..8a21271db409f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -64,6 +64,7 @@ limitations under the License. */ namespace paddle { namespace pybind { +std::atomic VarBaseUniqueNameID{0}; PyTypeObject *g_varbase_pytype = nullptr; namespace py = ::pybind11; @@ -142,6 +143,8 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance(place_obj)) { + return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { @@ -151,8 +154,8 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace/" - "CustomPlace")); + "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/" + "MLUPlace/CustomPlace")); } } @@ -198,6 +201,8 @@ static void InitVarBaseAndTensor(imperative::VarBase *self, tensor, array, place, zero_copy); } else if (platform::is_npu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); + } else if (platform::is_ipu_place(place)) { + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_mlu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_custom_place(place)) { @@ -206,7 +211,8 @@ static void InitVarBaseAndTensor(imperative::VarBase *self, } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace")); + "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/" + "MLUPlace")); } self->SetDataType(framework::TransToProtoVarType(tensor->dtype())); } @@ -492,7 +498,14 @@ static void VarBaseCopy(std::shared_ptr &src, // NOLINT void BindImperative(py::module *m_ptr) { auto &m = *m_ptr; - BindOpFunctions(&m); + 
BindOpFunctions1(&m); + BindOpFunctions2(&m); + BindOpFunctions3(&m); + BindOpFunctions4(&m); + BindOpFunctions5(&m); + BindOpFunctions6(&m); + BindOpFunctions7(&m); + BindOpFunctions8(&m); #ifndef _WIN32 // Dygraph DataLoader signal handler @@ -1856,6 +1869,18 @@ void BindImperative(py::module *m_ptr) { return new_var; }, py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::IPUPlace &place, + bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) .def( "_copy_to", [](const std::shared_ptr &self, @@ -2140,6 +2165,11 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); VLOG(4) << "Tracer(" << &self << ")" << " set expected place " << *p; + } else if (py::isinstance(obj)) { + auto p = obj.cast(); + self.SetExpectedPlace(*p); + VLOG(4) << "Tracer(" << &self << ")" + << " set expected place " << *p; } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -2158,7 +2188,7 @@ void BindImperative(py::module *m_ptr) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "Incompatible Place Type: supports XPUPlace, CUDAPlace, " - "CPUPlace, NPUPlace, MLUPlace" + "CPUPlace, NPUPlace, IPUPlace, MLUPlace" "and CUDAPinnedPlace, " "but got Unknown Type!")); } @@ -2313,6 +2343,28 @@ void BindImperative(py::module *m_ptr) { inplace_map); } }) + .def("trace", + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::IPUPlace &place, + bool trace_backward, + const std::map &inplace_map = {}) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc new file mode 100644 index 0000000000000..be2ad50400c77 --- /dev/null +++ b/paddle/fluid/pybind/jit.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/pybind/jit.h" + +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/jit/executor_function.h" +#include "paddle/fluid/jit/function_schema.h" +#include "paddle/fluid/jit/layer.h" +#include "paddle/fluid/jit/serializer.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +PyTypeObject *g_executor_function_pytype = nullptr; +using Variable = paddle::framework::Variable; + +void BindJit(pybind11::module *m) { + py::class_(*m, "Layer", R"DOC(Layer Class.)DOC") + .def("function_dict", + &jit::Layer::FunctionMap, + py::return_value_policy::reference); + + py::class_> + executor_function( + *m, "ExectorFunction", R"DOC(ExectorFunction Class.)DOC"); + g_executor_function_pytype = + reinterpret_cast(executor_function.ptr()); + executor_function.def("info", &jit::ExecutorFunction::Info); + + py::class_>( + *m, "FunctionInfo", R"DOC(FunctionInfo Class.)DOC") + .def("name", &jit::FunctionInfo::FunctionName) + .def("input_names", &jit::FunctionInfo::InputArgNames) + .def("output_names", &jit::FunctionInfo::OutputArgNames); + + m->def("Load", + [](const std::string &path, const platform::CPUPlace &cpu_place) { + return paddle::jit::Load(path, cpu_place); + }); + + m->def("Load", + [](const std::string &path, const platform::CUDAPlace &cuda_place) { + return paddle::jit::Load(path, cuda_place); + }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/operators/digamma_op.h b/paddle/fluid/pybind/jit.h similarity index 67% rename from paddle/fluid/operators/digamma_op.h rename to paddle/fluid/pybind/jit.h index 85f9094e6a0bc..897e22e8b8594 100644 --- a/paddle/fluid/operators/digamma_op.h +++ b/paddle/fluid/pybind/jit.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,8 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" +#include + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +void BindJit(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/metrics_py.cc b/paddle/fluid/pybind/metrics_py.cc index 50318cf9e6fc4..78e6d528b1af3 100644 --- a/paddle/fluid/pybind/metrics_py.cc +++ b/paddle/fluid/pybind/metrics_py.cc @@ -27,7 +27,7 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/pybind/metrics_py.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/nccl_wrapper_py.cc b/paddle/fluid/pybind/nccl_wrapper_py.cc index bbba03f6660fe..827bcaf39704d 100644 --- a/paddle/fluid/pybind/nccl_wrapper_py.cc +++ b/paddle/fluid/pybind/nccl_wrapper_py.cc @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/pybind/nccl_wrapper_py.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 16c902cadf9a1..884136ec0d37b 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -257,8 +257,14 @@ PyObject* MakeReturnPyObject(const std::tuple& out) { return result; } +void BindOpFunctions1(pybind11::module* module); +void BindOpFunctions2(pybind11::module* module); +void BindOpFunctions3(pybind11::module* module); +void BindOpFunctions4(pybind11::module* module); +void BindOpFunctions5(pybind11::module* module); +void BindOpFunctions6(pybind11::module* module); +void BindOpFunctions7(pybind11::module* module); +void BindOpFunctions8(pybind11::module* module); + } // namespace pybind } // namespace paddle - -// This include must be the last line -#include "paddle/fluid/pybind/op_function_impl.h" diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index b25ed3b5c5894..f659a671c3947 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -422,13 +422,17 @@ std::string GenerateOpFunctionsBody( return op_function_str; } -static std::tuple, std::vector> -GenerateOpFunctions() { +static std::vector< + std::tuple, std::vector>> +GenerateOpFunctions(int split_count) { auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); - + std::vector, std::vector>> + result; std::vector op_function_list, bind_function_list; auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); + paddle::flat_hash_map + op_info_map_need_gen; for (auto& pair : op_info_map) { auto& op_info = pair.second; auto op_proto = op_info.proto_; @@ -444,6 +448,22 @@ GenerateOpFunctions() { continue; } + op_info_map_need_gen.emplace(pair); + } + + int cc_file_api_size = op_info_map_need_gen.size() / split_count; + if (op_info_map_need_gen.size() % split_count != 0) { + cc_file_api_size++; + } + int api_index = 0; + int file_index = 0; + + for (auto& pair : op_info_map_need_gen) { + auto& op_info = pair.second; + auto op_proto = op_info.proto_; + + auto& op_type = op_proto->type(); + // NOTE(pangyoki): Inplace Strategy. // In this case, output will reuse input varbase. 
// Dygraph mode needs to be aligned with the in-place strategy in static @@ -489,13 +509,24 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(inplace_op_function_str)); bind_function_list.emplace_back(std::move(inplace_bind_function_str)); } + + api_index++; + if (api_index / cc_file_api_size > file_index) { + file_index++; + result.push_back(std::make_tuple(op_function_list, bind_function_list)); + op_function_list.clear(); + bind_function_list.clear(); + } } - return std::make_tuple(op_function_list, bind_function_list); + + result.push_back(std::make_tuple(op_function_list, bind_function_list)); + + return result; } int main(int argc, char* argv[]) { - if (argc != 2) { - std::cerr << "argc must be 2" << std::endl; + if (argc != 3) { + std::cerr << "argc must be 3" << std::endl; return -1; } @@ -506,44 +537,57 @@ int main(int argc, char* argv[]) { std::vector headers{"\"paddle/fluid/imperative/tracer.h\"", "\"paddle/fluid/platform/profiler.h\"", + "\"pybind11/numpy.h\"", + "\"pybind11/pybind11.h\"", "\"pybind11/detail/common.h\"", + "\"paddle/fluid/pybind/eager_utils.h\"", + "\"paddle/fluid/pybind/op_function.h\"", ""}; - std::ofstream out(argv[1], std::ios::out); + std::string path = argv[1]; + int split_count = atoi(argv[2]); - out << "#pragma once\n\n"; + auto op_funcs = GenerateOpFunctions(split_count); - for (auto& header : headers) { - out << "#include " + header + "\n"; - } + for (size_t i = 0; i < op_funcs.size(); i++) { + std::ofstream out(path + "op_function" + std::to_string(i + 1) + ".cc.tmp", + std::ios::out); - out << "\n\n"; - - auto op_funcs = GenerateOpFunctions(); - - out << "namespace paddle {\n" - << "namespace pybind {\n\n"; - out << "std::atomic VarBaseUniqueNameID{0};\n"; - out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); - out << "\n\n"; - - out << "static PyMethodDef ExtestMethods[] = {\n" - << paddle::string::join_strings(std::get<1>(op_funcs), '\n') - << "\n {nullptr,nullptr,0,nullptr}" - << "};\n\n"; - - out << "inline void BindOpFunctions(pybind11::module *module) {\n" - << " auto m = module->def_submodule(\"ops\");\n" - << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" - << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " - "core.ops failed!\"));\n" - << " }\n\n" - << " InitOpsAttrTypeMap();" - << "}\n\n" - << "} // namespace pybind\n" - << "} // namespace paddle\n"; - - out.close(); + out << "#if defined(_MSC_VER)\n" + << "#include \n" + << "typedef SSIZE_T ssize_t;\n" + << "#endif\n"; + + for (auto& header : headers) { + out << "#include " + header + "\n"; + } + + out << "\n\n"; + + out << "namespace paddle {\n" + << "namespace pybind {\n\n"; + out << "extern std::atomic VarBaseUniqueNameID;\n"; + out << paddle::string::join_strings(std::get<0>(op_funcs[i]), '\n'); + out << "\n\n"; + + out << "static PyMethodDef ExtestMethods[] = {\n" + << paddle::string::join_strings(std::get<1>(op_funcs[i]), '\n') + << "\n {nullptr,nullptr,0,nullptr}" + << "};\n\n"; + + out << "void BindOpFunctions" << i + 1 << "(pybind11::module *module) {\n" + << " auto m = module->def_submodule(\"ops\");\n" + << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" + << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " + "core.ops failed!\"));\n" + << " }\n\n" + << " InitOpsAttrTypeMap();" + << "}\n\n" + << "} // namespace pybind\n" + << "} // namespace paddle\n"; + + out.close(); + } #ifdef PADDLE_WITH_ASCEND_CL ge::GEFinalize(); diff --git a/paddle/fluid/pybind/op_function_generator.h 
b/paddle/fluid/pybind/op_function_generator.h index 4441c06bca2cf..590d9d2f83e8b 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -208,6 +208,23 @@ std::map> op_ins_map = { {"trilinear_interp", {"X", "OutSize"}}, {"nearest_interp", {"X", "OutSize"}}, {"bicubic_interp", {"X", "OutSize"}}, + {"resnet_basic_block", + {"X", + "Filter1", + "Scale1", + "Bias1", + "Mean1", + "Var1", + "Filter2", + "Scale2", + "Bias2", + "Mean2", + "Var2", + "Filter3", + "Scale3", + "Bias3", + "Mean3", + "Var3"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -309,6 +326,12 @@ std::map> op_outs_map = { "Beta2PowOut", "MasterParamOut"}}, {"fused_multi_transformer", {"CacheKVOut", "Out"}}, + {"resnet_basic_block", + {"Y", "Conv1", "SavedMean1", "SavedInvstd1", "Mean1Out", + "Var1Out", "Conv2", "SavedMean2", "SavedInvstd2", "Mean2Out", + "Var2Out", "Conv3", "SavedMean3", "SavedInvstd3", "Mean3Out", + "Var3Out", "MaxInput1", "MaxFilter1", "MaxInput2", "MaxFilter2", + "MaxInput3", "MaxFilter3"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -408,6 +431,8 @@ std::map> op_passing_outs_map = { {"concat", {"Out"}}, {"fused_multi_transformer", {"CacheKVOut"}}, {"group_norm", {"Mean", "Variance"}}, + {"resnet_basic_block", + {"Mean1Out", "Var1Out", "Mean2Out", "Var2Out", "Mean3Out", "Var3Out"}}, }; // NOTE(pangyoki): Tensor View Strategy. diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc new file mode 100644 index 0000000000000..f1d2f456a28d9 --- /dev/null +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -0,0 +1,1118 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include 
"paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/parallel_executor.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); + +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { +using namespace paddle::framework; // NOLINT +void BindParallelExecutor(pybind11::module &m) { // NOLINT + // -- python binds for parallel executor. 
+ py::class_ pe(m, "ParallelExecutor"); + py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( + ExecutionStrategy allows the user to more preciously control how to run + the program in ParallelExecutor by setting the property. + + Returns: + ExecutionStrategy: An ExecutionStrategy object. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + import paddle.nn.functional as F + + paddle.enable_static() + + x = static.data(name='x', shape=[None, 13], dtype='float32') + y = static.data(name='y', shape=[None, 1], dtype='float32') + y_predict = static.nn.fc(input=x, size=1, act=None) + + cost = F.square_error_cost(input=y_predict, label=y) + avg_loss = paddle.mean(cost) + + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_loss) + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_threads = 4 + + train_exe = static.ParallelExecutor(use_cuda=False, + loss_name=avg_loss.name, + exec_strategy=exec_strategy) + )DOC"); + + py::enum_(m, "DeviceType", py::arithmetic()) + .value("CPU", paddle::platform::DeviceType::CPU) + .value("CUDA", paddle::platform::DeviceType::CUDA) + .value("XPU", paddle::platform::DeviceType::XPU); + + exec_strategy.def(py::init()) + .def_property( + "num_threads", + [](const ExecutionStrategy &self) { return self.num_threads_; }, + [](ExecutionStrategy &self, size_t num_threads) { + self.num_threads_ = num_threads; + }, + R"DOC( + The type is INT, num_threads represents the size of thread pool that + used to run the operators of the current program in ParallelExecutor. + If :math:`num\_threads=1`, all the operators will execute one by one, + but the order maybe difference between iterations. + If it is not set, it will be set in ParallelExecutor according to the + device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, + :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. + if it is not set, ParallelExecutor will get the cpu count by calling + `multiprocessing.cpu_count()`. Default 0. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_threads = 4 + )DOC") + .def_property( + "_use_device", + [](const ExecutionStrategy &self) { return self.use_device_; }, + [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { + self.use_device_ = use_device; + }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because + // use_device isn‘t exposed to users. + .def_property( + "allow_op_delay", + [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, + [](ExecutionStrategy &self, bool allow_op_delay) { + self.allow_op_delay_ = allow_op_delay; + }, + R"DOC(The type is BOOL, allow_op_delay represents whether to delay the + communication operators to run, it may make the execution faster. + Note that this option is invalid now, and it will be removed in + next version. Default False.)DOC") + .def_property( + "num_iteration_per_drop_scope", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_drop_scope_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { + self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; + }, + R"DOC(The type is INT, num_iteration_per_drop_scope indicates how + many iterations to clean up the temp variables which + is generated during execution. 
It may make the execution faster, + because the temp variable's shape maybe the same between two iterations. + Default 100. + + .. note:: + 1. If you fetch data when calling the 'run', the ParallelExecutor + will clean up the temp variables at the end of the current iteration. + 2. In some NLP model, it may cause the GPU memory is insufficient, + in this case, you should reduce `num_iteration_per_drop_scope`. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_iteration_per_drop_scope = 10 + )DOC") + .def_property( + "num_iteration_per_run", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_run_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_run) { + self.num_iteration_per_run_ = num_iteration_per_run; + }, + R"DOC(This config that how many iteration the executor will run when + user call exe.run() in python。Default: 1. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_iteration_per_run = 10 + )DOC") + .def_property( + "use_thread_barrier", + [](const ExecutionStrategy &self) { return self.thread_barrier_; }, + [](ExecutionStrategy &self, bool use_thread_barrier) { + self.thread_barrier_ = use_thread_barrier; + }, + R"DOC(This config that the this is distributed training with parameter server + )DOC") + .def_property( + "_dry_run", + [](const ExecutionStrategy &self) { return self.dry_run_; }, + [](ExecutionStrategy &self, bool dry_run) { + self.dry_run_ = dry_run; + }); + + exec_strategy.def_property( + "use_experimental_executor", + [](const ExecutionStrategy &self) { + return self.type_ == ExecutionStrategy::kExperimental; + }, + [](ExecutionStrategy &self, bool experimental) { + self.type_ = experimental ? ExecutionStrategy::kExperimental + : ExecutionStrategy::kDefault; + }); + + py::class_ build_strategy(pe, "BuildStrategy", R"DOC( + BuildStrategy allows the user to more preciously control how to + build the SSA Graph in ParallelExecutor by setting the property. + + Returns: + BuildStrategy: An BuildStrategy object. + + Examples: + .. 
code-block:: python + + import os + import paddle + import paddle.static as static + + paddle.enable_static() + + os.environ['CPU_NUM'] = str(2) + places = static.cpu_places() + + data = static.data(name="x", shape=[None, 1], dtype="float32") + hidden = static.nn.fc(input=data, size=10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + build_strategy = static.BuildStrategy() + build_strategy.enable_inplace = True + build_strategy.memory_optimize = True + build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce + program = static.CompiledProgram(static.default_main_program()) + program = program.with_data_parallel(loss_name=loss.name, + build_strategy=build_strategy, + places=places) +)DOC"); + + py::enum_(build_strategy, "ReduceStrategy") + .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) + .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) + .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); + py::enum_(build_strategy, + "GradientScaleStrategy") + .value("CoeffNumDevice", + BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) + .value("One", BuildStrategy::GradientScaleStrategy::kOne) + .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); + + build_strategy.def(py::init()) + .def("_clear_finalized", &BuildStrategy::ClearFinalized) + .def_property( + "reduce_strategy", + [](const BuildStrategy &self) { return self.reduce_; }, + [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.reduce_ = strategy; + }, + R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce + strategies in ParallelExecutor, AllReduce and Reduce. If you want + that all the parameters' optimization are done on all devices independently, + you should choose AllReduce; otherwise, if you choose Reduce, all the parameters' + optimization will be evenly distributed to different devices, and then + broadcast the optimized parameter to other devices. + Default is 'AllReduce'. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce + )DOC") + .def_property( + "gradient_scale_strategy", + [](const BuildStrategy &self) { return self.gradient_scale_; }, + [](BuildStrategy &self, + BuildStrategy::GradientScaleStrategy strategy) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.gradient_scale_ = strategy; + }, + R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three + ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice, + One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` + according to the number of devices. If you want to customize :math:`loss@grad`, + you can choose Customized. Default is 'CoeffNumDevice'. + + Examples: + .. 
code-block:: python + + import numpy + import os + import paddle + import paddle.static as static + + paddle.enable_static() + + use_cuda = True + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = static.Executor(place) + + # NOTE: If you use CPU to run the program, you need + # to specify the CPU_NUM, otherwise, paddle will use + # all the number of the logic core as the CPU_NUM, + # in that case, the batch size of the input should be + # greater than CPU_NUM, if not, the process will be + # failed by an exception. + if not use_cuda: + os.environ['CPU_NUM'] = str(2) + places = static.cpu_places() + else: + places = static.cuda_places() + + data = static.data(name='X', shape=[None, 1], dtype='float32') + hidden = static.nn.fc(input=data, size=10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + exe.run(static.default_startup_program()) + + build_strategy = static.BuildStrategy() + build_strategy.gradient_scale_strategy = \ + static.BuildStrategy.GradientScaleStrategy.Customized + compiled_prog = static.CompiledProgram( + static.default_main_program()).with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy, + places=places) + + dev_count = len(places) + x = numpy.random.random(size=(10, 1)).astype('float32') + loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 + loss_grad_name = loss.name+"@GRAD" + loss_data = exe.run(compiled_prog, + feed={"X": x, loss_grad_name : loss_grad}, + fetch_list=[loss.name, loss_grad_name]) + )DOC") + .def_property( + "debug_graphviz_path", + [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, + [](BuildStrategy &self, const std::string &path) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.debug_graphviz_path_ = path; + }, + R"DOC((str, optional): debug_graphviz_path indicates the path that + writing the SSA Graph to file in the form of graphviz. + It is useful for debugging. Default is empty string, that is, "" + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.debug_graphviz_path = "./graph" + )DOC") + .def_property( + "enable_sequential_execution", + [](const BuildStrategy &self) { + return self.enable_sequential_execution_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.enable_sequential_execution_ = b; + }, + R"DOC((bool, optional): If set True, the execution order of ops would + be the same as what is in the program. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.enable_sequential_execution = True + )DOC") + .def_property( + "remove_unnecessary_lock", + [](const BuildStrategy &self) { + return self.remove_unnecessary_lock_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.remove_unnecessary_lock_ = b; + }, + R"DOC((bool, optional): If set True, some locks in GPU ops would be + released and ParallelExecutor would run faster. Default is True. 
+ + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.remove_unnecessary_lock = True + )DOC") + .def_property( + "num_trainers", + [](const BuildStrategy &self) { return self.num_trainers_; }, + [](BuildStrategy &self, int num_trainers) { +#ifdef WIN32 + PADDLE_THROW(platform::errors::Unavailable( + "Distribution mode is not supported on Windows platform.")); +#endif + self.num_trainers_ = num_trainers; + }) + .def_property( + "trainers_endpoints", + [](const BuildStrategy &self) { return self.trainers_endpoints_; }, + [](BuildStrategy &self, + const std::vector &trainers_endpoints) { + self.trainers_endpoints_ = trainers_endpoints; + }) + .def_property( + "trainer_id", + [](const BuildStrategy &self) { return self.trainer_id_; }, + [](BuildStrategy &self, int trainer_id) { + self.trainer_id_ = trainer_id; + }) + .def_property( + "nccl_comm_num", + [](const BuildStrategy &self) { return self.nccl_comm_num_; }, + [](BuildStrategy &self, int nccl_comm_num) { + self.nccl_comm_num_ = nccl_comm_num; + }) + .def_property( + "bkcl_comm_num", + [](const BuildStrategy &self) { return self.bkcl_comm_num_; }, + [](BuildStrategy &self, int bkcl_comm_num) { + self.bkcl_comm_num_ = bkcl_comm_num; + }) + .def_property( + "use_hierarchical_allreduce", + [](const BuildStrategy &self) { + return self.use_hierarchical_allreduce_; + }, + [](BuildStrategy &self, bool use) { + self.use_hierarchical_allreduce_ = use; + }) + .def_property( + "hierarchical_allreduce_inter_nranks", + [](const BuildStrategy &self) { + return self.hierarchical_allreduce_inter_nranks_; + }, + [](BuildStrategy &self, int nranks) { + self.hierarchical_allreduce_inter_nranks_ = nranks; + }) + + .def_property( + "fuse_elewise_add_act_ops", + [](const BuildStrategy &self) { + return self.fuse_elewise_add_act_ops_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_elewise_add_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether + to fuse elementwise_add_op and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_elewise_add_act_ops = True + )DOC") + .def_property( + "fuse_gemm_epilogue", + [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_gemm_epilogue_ = b; + }, + R"DOC((bool, optional): fuse_gemm_epilogue indicate whether + to fuse matmul_op, elemenewist_add_op and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_gemm_epilogue = True + )DOC") + .def_property( + "fuse_bn_act_ops", + [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_bn_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_bn_act_ops indicate whether + to fuse batch_norm and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_bn_act_ops = True + )DOC") + .def_property( + "fuse_bn_add_act_ops", + [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_bn_add_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether + to fuse batch_norm, elementwise_add and activation_op, + it may make the execution faster. Default is True + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_bn_add_act_ops = True + )DOC") + .def_property( + "enable_auto_fusion", + [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.enable_auto_fusion_ = b; + }, + R"DOC((bool, optional): Whether to enable fusing subgraph to a + fusion_group. Now we only support fusing subgraph that composed + of elementwise-like operators, such as elementwise_add/mul + without broadcast and activations. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.enable_auto_fusion = True + )DOC") + .def_property( + "fuse_relu_depthwise_conv", + [](const BuildStrategy &self) { + return self.fuse_relu_depthwise_conv_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_relu_depthwise_conv_ = b; + }, + R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether + to fuse relu and depthwise_conv2d, + it will save GPU memory and may make the execution faster. + This options is only available in GPU devices. + Default is False. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_relu_depthwise_conv = True + )DOC") + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { + return self.fuse_broadcast_ops_ == true || + self.fuse_broadcast_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_broadcast_ops_ = b; + }, + R"DOC((bool, optional): fuse_broadcast_op indicates whether + to fuse the broadcast ops. Note that, in Reduce mode, + fusing broadcast ops may make the program faster. Because + fusing broadcast OP equals delaying the execution of all + broadcast Ops, in this case, all nccl streams are used only + for NCCLReduce operations for a period of time. Default False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_broadcast_ops = True + )DOC") + .def_property( + "fuse_all_optimizer_ops", + [](const BuildStrategy &self) { + return self.fuse_all_optimizer_ops_ == true || + self.fuse_all_optimizer_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_all_optimizer_ops_ = b; + }) + .def_property( + "sync_batch_norm", + [](const BuildStrategy &self) { return self.sync_batch_norm_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.sync_batch_norm_ = b; + }, + R"DOC((bool, optional): sync_batch_norm indicates whether to use + synchronous batch normalization which synchronizes the mean + and variance through multi-devices in training phase. + Current implementation doesn't support FP16 training and CPU. + And only synchronous on one machine, not all machines. + Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.sync_batch_norm = True + )DOC") + .def_property( + "memory_optimize", + [](const BuildStrategy &self) -> py::object { + if (self.memory_optimize_) { + return py::cast(self.memory_optimize_.get()); + } else { + return py::cast(nullptr); + } + }, + [](BuildStrategy &self, const py::handle &value) { + auto *py_obj = value.ptr(); + if (py_obj == nullptr || py_obj == Py_None) { + self.memory_optimize_ = paddle::none; + } else if (PyBool_Check(py_obj)) { + self.memory_optimize_ = (py_obj == Py_True); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "BuildStrategy.memory_optimize must be set to None, False " + "or True")); + } + }, + R"DOC((bool, optional): memory opitimize aims to save total memory + consumption, set to True to enable it. + + Default None. None means framework would choose to use or not use + this strategy automatically. Currently, None means that it is + enabled when GC is disabled, and disabled when GC is enabled. + True means enabling and False means disabling. Default is None. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.memory_optimize = True + + )DOC") + .def_property( + "is_distribution", + [](const BuildStrategy &self) { return self.is_distribution_; }, + [](BuildStrategy &self, bool b) { +#ifdef WIN32 + if (b) { + PADDLE_THROW(platform::errors::Unavailable( + "Distribution mode is not supported on Windows platform.")); + } +#else + self.is_distribution_ = b; +#endif + }) + .def_property( + "async_mode", + [](const BuildStrategy &self) { return self.async_mode_; }, + [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) + .def_property( + "enable_inplace", + [](const BuildStrategy &self) { return self.enable_inplace_; }, + [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) + .def_property( + "enable_addto", + [](const BuildStrategy &self) { return self.enable_addto_; }, + [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) + .def_property( + "fuse_all_reduce_ops", + [](const BuildStrategy &self) { + return self.fuse_all_reduce_ops_ == true || + self.fuse_all_reduce_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) + .def_property( + "enable_backward_optimizer_op_deps", + [](const BuildStrategy &self) { + return self.enable_backward_optimizer_op_deps_; + }, + [](BuildStrategy &self, bool b) { + self.enable_backward_optimizer_op_deps_ = b; + }) + .def_property( + "cache_runtime_context", + [](const BuildStrategy &self) { return self.cache_runtime_context_; }, + [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) + .def_property( + "mkldnn_enabled_op_types", + [](const BuildStrategy &self) { + return self.mkldnn_enabled_op_types_; + }, + [](BuildStrategy &self, + const std::unordered_set &mkldnn_enabled_op_types) { + self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; + }) + .def_property( + "fix_op_run_order", + [](const BuildStrategy &self) { return self.fix_op_run_order_; }, + [](BuildStrategy &self, bool fix_op_run_order) { + self.fix_op_run_order_ = fix_op_run_order; + }) + .def_property( + "allow_cuda_graph_capture", + [](const BuildStrategy &self) { + return self.allow_cuda_graph_capture_; + }, + [](BuildStrategy &self, bool allow_cuda_graph_capture) { + self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; + }) + .def("_copy", + [](const BuildStrategy &self) { + auto new_bs = self; + new_bs.ClearFinalized(); + return new_bs; + }) + .def( + "_finalize_strategy_and_create_passes", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. Normally model-specific + optimization passes should be defined in this way. BuildStrategy + cannot be updated after being finalized.)DOC"); + + m.def("_set_cached_executor_build_strategy", + [](int64_t program_id, const BuildStrategy &build_strategy) { + auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); + cached_exe_info.SetBuildStrategy(program_id, build_strategy); + }); + + pe.def(py::init &, + const std::vector &, + const std::string &, + Scope *, + std::vector &, + const ExecutionStrategy &, + const BuildStrategy &, + ir::Graph *>()) + // NOTE: even we return a vec* to Python use reference policy. + // We still cannot get local_scope from this vector, since the element + // of vec will be freed by Python GC. We can only return Scope* + // one by one and mark them as reference. 
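The ExecutionStrategy and BuildStrategy bindings above are normally used together: both objects are passed to CompiledProgram.with_data_parallel on the legacy static-graph path, alongside the places to run on. A minimal sketch that combines the two, assuming the same toy network used in the docstring examples above (CPU_NUM and cpu_places are only needed when running on CPU):

.. code-block:: python

    import os
    import paddle
    import paddle.static as static

    paddle.enable_static()

    os.environ['CPU_NUM'] = str(2)
    places = static.cpu_places()

    data = static.data(name="x", shape=[None, 1], dtype="float32")
    hidden = static.nn.fc(input=data, size=10)
    loss = paddle.mean(hidden)
    paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

    # Drop local scopes every 10 iterations; fuse elementwise_add + activation.
    exec_strategy = static.ExecutionStrategy()
    exec_strategy.num_iteration_per_drop_scope = 10

    build_strategy = static.BuildStrategy()
    build_strategy.fuse_elewise_add_act_ops = True
    build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.AllReduce

    program = static.CompiledProgram(static.default_main_program())
    program = program.with_data_parallel(loss_name=loss.name,
                                         build_strategy=build_strategy,
                                         exec_strategy=exec_strategy,
                                         places=places)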
+ .def( + "local_scopes", + [](ParallelExecutor &self) -> std::vector * { + return &self.GetLocalScopes(); + }, + py::return_value_policy::reference) + .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) + .def("_need_create_local_exe_scopes", + &ParallelExecutor::NeedCreateLocalExeScope) + .def("feed_tensors_into_local_scopes", + &ParallelExecutor::FeedTensorsIntoLocalScopes) + .def("feed_and_split_tensor_into_local_scopes", + &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) + .def("run", + [](ParallelExecutor &self, + const std::vector &fetch_tensors, + bool return_merged) -> py::object { + if (return_merged) { + paddle::framework::FetchList ret; + /*gil_scoped_release*/ { + pybind11::gil_scoped_release release; + ret = self.RunAndMerge(fetch_tensors); + } + return py::cast(std::move(ret)); + } else { + paddle::framework::FetchUnmergedList ret; + /*gil_scoped_release*/ { + pybind11::gil_scoped_release release; + ret = self.Run(fetch_tensors); + } + return py::cast(std::move(ret)); + } + }) + .def("device_count", &ParallelExecutor::DeviceCount); + using VarQuantScale = + std::unordered_map>; + py::class_> pass(m, "Pass"); + pass.def(py::init()) + .def("has", &ir::Pass::Has) + .def("set_not_owned", + [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { + self.SetNotOwned(attr_name, &attr); + }) + .def( + "set", + [](ir::Pass &self, const std::string &name, const std::string &attr) { + self.Set(name, new std::string(attr)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, bool val) { + self.Set(name, new bool(val)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, int val) { + self.Set(name, new int(val)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::vector set) { + self.Set(name, new std::vector(set)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::unordered_set set) { + self.Set(name, new std::unordered_set(set)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::unordered_set set) { + self.Set(name, new std::unordered_set(set)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, VarQuantScale scales) { + self.Set(name, new VarQuantScale(scales)); + }) + .def("type", &ir::Pass::Type) + .def("apply", [](ir::Pass &self, std::shared_ptr graph) { + self.Apply(graph.get()); + }); + + py::class_> pb( + m, "PassBuilder"); + pb.def(py::init()) + .def("append_pass", + [](ir::PassBuilder &self, + const std::string &pass_type) -> std::shared_ptr { + return self.AppendPass(pass_type); + }) + .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) + .def("insert_pass", + [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { + return self.InsertPass(idx, pass_type); + }) + .def("remove_pass", + [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/parallel_executor.h b/paddle/fluid/pybind/parallel_executor.h new file mode 100644 index 0000000000000..3c3acace033a7 --- /dev/null +++ b/paddle/fluid/pybind/parallel_executor.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindParallelExecutor(pybind11::module& m); // NOLINT + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc new file mode 100644 index 0000000000000..84dca60c210f2 --- /dev/null +++ b/paddle/fluid/pybind/place.cc @@ -0,0 +1,816 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef 
PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include "paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" 
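The includes here are wrapped in PADDLE_WITH_* guards, so each Place class bound in this file is only usable when the corresponding backend was compiled in; otherwise its constructor logs an error and exits. A hedged sketch of the usual defensive pattern on the Python side, assuming the public paddle.is_compiled_with_cuda() and paddle.device.cuda.device_count() helpers:

.. code-block:: python

    import paddle

    # Fall back to the CPU when this build has no CUDA support or no visible GPU.
    if paddle.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0:
        place = paddle.CUDAPlace(0)
    else:
        place = paddle.CPUPlace()

    x = paddle.to_tensor([1.0, 2.0, 3.0], place=place)
    print(x.place)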
+ +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/place.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); + +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { +PyTypeObject *g_place_pytype = nullptr; +PyTypeObject *g_customplace_pytype = nullptr; +PyTypeObject *g_cudaplace_pytype = nullptr; +PyTypeObject *g_cpuplace_pytype = nullptr; +PyTypeObject *g_xpuplace_pytype = nullptr; +PyTypeObject *g_npuplace_pytype = nullptr; +PyTypeObject *g_cudapinnedplace_pytype = nullptr; +PyTypeObject *g_mluplace_pytype = nullptr; + +template +static inline int PlaceIndex(const PlaceType &p) { // NOLINT + return static_cast(paddle::platform::Place(p).GetType()); +} + +template +static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { + return paddle::platform::Place(p1) == paddle::platform::Place(p2); +} + +void BindPlace(pybind11::module &m) { // NOLINT + using namespace paddle::framework; // NOLINT + py::class_ customplace(m, + "CustomPlace", + R"DOC( + CustomPlace is a descriptor of a device. + It represents a custom device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + + import paddle + fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) + )DOC"); + g_customplace_pytype = reinterpret_cast(customplace.ptr()); + customplace + .def("__init__", + [](platform::CustomPlace &self, + const std::string &device_type, + int dev_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), device id must be 0 " + "or " + "positive integer", + device_type, + dev_id); + std::exit(-1); + } + + if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) && + phi::DeviceManager::IsCustom(device_type))) { + int dev_count = static_cast( + phi::DeviceManager::GetDeviceCount(device_type)); + if (UNLIKELY(dev_id >= dev_count)) { + if (dev_count == 0) { + LOG(ERROR) << "Cannot use " << device_type + << " because there is no " << device_type + << " detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), dev_id must " + "inside " + "[0, %d), because %s " + "number on your machine is %d", + device_type, + dev_id, + dev_count, + device_type, + dev_count); + std::exit(-1); + } + } + new (&self) platform::CustomPlace(device_type, dev_id); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), the device type is " + "not registered " + "as a custom device.", + device_type, + dev_id); + std::exit(-1); + } +#else + LOG(ERROR) << string::Sprintf( + "Cannot use CustomDevice because you have installed CPU/GPU" + "version PaddlePaddle.\n" + "If you want to use CustomDevice, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle\n" + "If you only have CPU, please change " + "CustomPlace(%s, %d) to be CPUPlace().\n", + device_type, dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("get_device_id", + [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) + .def("get_device_type", + [](const platform::CustomPlace &self) { + return self.GetDeviceType(); + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + py::class_ cudaplace(m, "CUDAPlace", R"DOC( + + CUDAPlace is a descriptor of a device. + It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. + Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace, + staring from 0. + The memory of CUDAPlace with different dev_id is not accessible. + Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card. + You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable. + When the program starts, visible GPU devices will be numbered from 0. + If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default, + and the logical ID is the same as the actual ID. + + Parameters: + id (int): GPU device ID. + + Examples: + .. 
code-block:: python + + import paddle + + place = paddle.CUDAPlace(0) + + )DOC"); + g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); + cudaplace + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CUDAPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + + if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { + if (platform::GetGPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use GPU because there is no GPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " + "number on your machine is %d", + dev_id, + platform::GetGPUDeviceCount(), + platform::GetGPUDeviceCount()); + std::exit(-1); + } + } + + new (&self) platform::CUDAPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use GPU because you have installed CPU version " + "PaddlePaddle.\n" + "If you want to use GPU, please try to install GPU version " + "PaddlePaddle by: pip install paddlepaddle-gpu\n" + "If you only have CPU, please change CUDAPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("get_device_id", + [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_get_device_id", + [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); }) +#endif + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + py::class_ xpuplace(m, "XPUPlace", R"DOC( + **Note**: + Examples: + .. 
code-block:: python + import paddle.fluid as fluid + xpu_place = fluid.XPUPlace(0) + )DOC"); + g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); + xpuplace + .def("__init__", + [](platform::XPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_XPU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) { + if (platform::GetXPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use XPU because there is no XPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), must inside [0, %d), because XPU " + "number on your machine is %d", + dev_id, + platform::GetXPUDeviceCount(), + platform::GetXPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::XPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use XPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use XPU, please try to install XPU version " + "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "If you only have CPU, please change XPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) +#ifdef PADDLE_WITH_XPU + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::XPUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__repr__", string::to_string) + .def("__str__", string::to_string); +#ifdef PADDLE_WITH_XPU + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", phi::backends::xpu::XPUVersion::XPU1) + .value("XPU2", phi::backends::xpu::XPUVersion::XPU2) + .export_values(); + m.def("get_xpu_device_count", platform::GetXPUDeviceCount); + m.def("get_xpu_device_version", + [](int device_id) { return platform::get_xpu_version(device_id); }); +#ifdef PADDLE_WITH_XPU_KP + m.def("get_xpu_device_op_support_types", + [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_kp_op_support_type(op_name, version); + }); +#else + m.def("get_xpu_device_op_support_types", + [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_support_type(op_name, version); + }); +#endif + m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_list(version); + }); + m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { + // XPUs with Compute Capability > xpu2 support float16 and bfloat16 + return platform::get_xpu_version(place.device) > + phi::backends::xpu::XPUVersion::XPU1; + }); + m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { + // XPUs with Compute Capability > xpu2 support float16 and bfloat16 + return platform::get_xpu_version(place.device) > + phi::backends::xpu::XPUVersion::XPU1; + }); +#endif + + py::class_ cpuplace(m, "CPUPlace", R"DOC( + CPUPlace is a descriptor of a device. + It represents a CPU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + + import paddle + cpu_place = paddle.CPUPlace() + + )DOC"); + g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); + cpuplace.def(py::init<>()) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + py::class_ cudapinnedplace( + m, "CUDAPinnedPlace", R"DOC( + CUDAPinnedPlace is a descriptor of a device. + It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. + The host operating system will not paging and exchanging the memory. + It can be accessed through direct memory access technology to speed up the copy of data between the host and GPU. + For more information on CUDA data transfer and `pinned memory`, + please refer to `official document `_ . + + Examples: + .. code-block:: python + + import paddle + place = paddle.CUDAPinnedPlace() + + )DOC"); + g_cudapinnedplace_pytype = + reinterpret_cast(cudapinnedplace.ptr()); + cudapinnedplace + .def("__init__", + [](platform::CUDAPinnedPlace &self) { +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CUDAPinnedPlace in CPU only version, " + "Please recompile or reinstall Paddle with CUDA support.")); +#endif + new (&self) platform::CUDAPinnedPlace(); + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + // NPUPlace + py::class_ npuplace(m, "NPUPlace", R"DOC( + NPUPlace is a descriptor of a device. + It represents a NPU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + import paddle + npu_place = paddle.NPUPlace(0) + + )DOC"); + g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); + npuplace + .def("__init__", + [](platform::NPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_ASCEND_CL + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { + if (platform::GetNPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use NPU because there is no NPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), must inside [0, %d), because NPU " + "number on your machine is %d", + dev_id, + platform::GetNPUDeviceCount(), + platform::GetNPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::NPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use NPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use NPU, please try to install NPU version " + "PaddlePaddle by: pip install paddlepaddle-npu\n" + "If you only have CPU, please change NPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) + .def("__str__", string::to_string); + + // IPUPlace + py::class_(m, "IPUPlace", R"DOC( + IPUPlace is a descriptor of a device. + It represents a IPU device on which a tensor will be allocated and a model will run. + + Examples: + .. code-block:: python + import paddle + + # required: ipu + + ipu_place = paddle.IPUPlace() + + )DOC") + .def("__init__", + [](platform::IPUPlace &self) { +#ifdef PADDLE_WITH_IPU + if (platform::GetIPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use IPU because there is no IPU " + "detected on your " + "machine."; + std::exit(-1); + } + // use ipu(0) to comile, while run with the number user configure + // in sharding and pipline. + new (&self) platform::IPUPlace(0); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use IPU because you didn't install IPU version " + "PaddlePaddle.\n" + "If you want to use IPU, please try to install IPU version " + "PaddlePaddle by: pip install paddlepaddle*\n" + "If you only have CPU, please change IPUPlace to be " + "CPUPlace().\n"); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) +#ifdef PADDLE_WITH_IPU + .def("get_device_id", + [](const platform::IPUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__str__", string::to_string); + + // MLUPlace + py::class_ mluplace(m, "MLUPlace", R"DOC( + MLUPlace is a descriptor of a device. + It represents a MLU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + import paddle + # required: mlu + mlu_place = paddle.MLUPlace(0) + + )DOC"); + g_mluplace_pytype = reinterpret_cast(mluplace.ptr()); + mluplace + .def("__init__", + [](platform::MLUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_MLU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid MLUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) { + if (platform::GetMLUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use MLU because there is no MLU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid MLUPlace(%d), must inside [0, %d), because MLU " + "number on your machine is %d", + dev_id, + platform::GetMLUDeviceCount(), + platform::GetMLUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::MLUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use MLU because you have installed CPU/GPU/... " + "version " + "PaddlePaddle.\n" + "If you want to use MLU, please try to install MLU version " + "PaddlePaddle by: pip install paddlepaddle-mlu\n" + "If you only have CPU, please change MLUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) +#ifdef PADDLE_WITH_MLU + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::MLUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__str__", string::to_string); + + py::class_ platformplace(m, "Place"); + g_place_pytype = reinterpret_cast(platformplace.ptr()); + platformplace.def(py::init<>()) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("is_gpu_place", + [](platform::Place &self) { return platform::is_gpu_place(self); }) + .def("is_cpu_place", + [](platform::Place &self) { return platform::is_cpu_place(self); }) + .def("is_xpu_place", + [](platform::Place &self) { return platform::is_xpu_place(self); }) + .def("is_npu_place", + [](platform::Place &self) { return platform::is_npu_place(self); }) + .def("is_ipu_place", + [](platform::Place &self) { return platform::is_ipu_place(self); }) + .def("is_cuda_pinned_place", + [](platform::Place &self) { + return platform::is_cuda_pinned_place(self); + }) + .def("is_mlu_place", + [](platform::Place &self) { return platform::is_mlu_place(self); }) + .def( + "is_custom_place", + [](platform::Place &self) { return platform::is_custom_place(self); }) + .def("gpu_device_id", [](platform::Place &self) { return self.device; }) + .def("xpu_device_id", [](platform::Place &self) { return self.device; }) + .def("npu_device_id", [](platform::Place &self) { return self.device; }) + .def("ipu_device_id", [](platform::Place &self) { return self.device; }) + .def("mlu_device_id", [](platform::Place &self) { return self.device; }) + .def("custom_device_id", + [](platform::Place &self) { return self.device; }) + .def("set_place", + [](platform::Place &self, const platform::Place &other) { + self = other; + }) + .def("set_place", + [](platform::Place 
&self, const platform::CPUPlace &cpu_place) { + self = cpu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::XPUPlace &xpu_place) { + self = xpu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::CUDAPlace &gpu_place) { + self = gpu_place; + }) + .def("set_place", + [](platform::Place &self, + const platform::CUDAPinnedPlace &cuda_pinned_place) { + self = cuda_pinned_place; + }) + .def("set_place", + [](platform::Place &self, const platform::NPUPlace &npu_place) { + self = npu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::IPUPlace &ipu_place) { + self = ipu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::MLUPlace &mlu_place) { + self = mlu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::CustomPlace &plug_place) { + self = plug_place; + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/place.h b/paddle/fluid/pybind/place.h new file mode 100644 index 0000000000000..40fb8d4c7f472 --- /dev/null +++ b/paddle/fluid/pybind/place.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindPlace(pybind11::module& m); // NOLINT + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/protobuf.h b/paddle/fluid/pybind/protobuf.h index 54b788cccba5b..79f174b5eb607 100644 --- a/paddle/fluid/pybind/protobuf.h +++ b/paddle/fluid/pybind/protobuf.h @@ -22,7 +22,6 @@ typedef SSIZE_T ssize_t; #include #include -#include "paddle/fluid/platform/variant.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c294c8eb4a7c9..40a03248cd22d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -90,6 +90,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/io.h" +#include "paddle/fluid/pybind/jit.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" @@ -121,9 +122,12 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/parallel_executor.h" +#include "paddle/fluid/pybind/place.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -193,16 +197,7 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); namespace paddle { namespace pybind { -PyTypeObject *g_place_pytype = nullptr; PyTypeObject *g_framework_scope_pytype = nullptr; -PyTypeObject *g_cudaplace_pytype = nullptr; -PyTypeObject *g_cpuplace_pytype = nullptr; -PyTypeObject *g_xpuplace_pytype = nullptr; -PyTypeObject *g_npuplace_pytype = nullptr; -PyTypeObject *g_cudapinnedplace_pytype = nullptr; -PyTypeObject *g_mluplace_pytype = nullptr; -PyTypeObject *g_customplace_pytype = nullptr; -PyTypeObject *g_framework_tensor_pytype = nullptr; PyTypeObject *g_framework_lodtensorarray_pytype = nullptr; PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr; @@ -348,16 +343,6 @@ bool IsCompiledWithDIST() { #endif } -template -static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { - return paddle::platform::Place(p1) == paddle::platform::Place(p2); -} - -template -static inline int PlaceIndex(const PlaceType &p) { - return static_cast(paddle::platform::Place(p).GetType()); -} - static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) { // NOTE(zjl): PyObject_GetAttrString would return nullptr when attr_name // is not inside obj, but it would also set the error flag of Python. @@ -540,19 +525,6 @@ static int GetNCCLVersion() { } #endif -template -static void TensorCopyFrom(framework::Tensor *dst, - const framework::Tensor &src, - const PlaceType &place, - int64_t batch_size) { - if (batch_size < 0) { - framework::TensorCopy(src, place, dst); - } else { - auto sliced = src.Slice(0, batch_size); - framework::TensorCopy(sliced, place, dst); - } -} - #ifdef PADDLE_WITH_AVX PYBIND11_MODULE(core_avx, m) { #else @@ -563,6 +535,7 @@ PYBIND11_MODULE(core_noavx, m) { BindEager(&m); BindEagerStringTensor(&m); BindCudaStream(&m); + BindJit(&m); // Not used, just make sure cpu_info.cc is linked. 
paddle::platform::CpuTotalPhysicalMemory(); @@ -852,897 +825,6 @@ PYBIND11_MODULE(core_noavx, m) { self.EmplaceBackAttr(attr); }); - py::class_ framework_tensor( - m, "Tensor", py::buffer_protocol()); - g_framework_tensor_pytype = - reinterpret_cast(framework_tensor.ptr()); - framework_tensor - .def("__array__", - [](framework::Tensor &self) { return TensorToPyArray(self); }) - .def("_ptr", - [](const framework::Tensor &self) { - return reinterpret_cast(self.data()); - }) - .def("_slice", &framework::Tensor::Slice) - .def("_numel", &framework::Tensor::numel) - .def("_is_initialized", - [](const framework::Tensor &self) { return self.IsInitialized(); }) - .def("_get_dims", - [](const framework::Tensor &self) { return vectorize(self.dims()); }) - .def("_set_dims", - [](framework::Tensor &self, const std::vector &dim) { - self.Resize(phi::make_ddim(dim)); - }) - .def("_set_layout", - [](framework::Tensor &self, const std::string &layout) { - self.set_layout(StringToDataLayout(layout)); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CustomPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::XPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::NPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::MLUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_double", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CustomPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::XPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::MLUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CustomPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::XPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CUDAPlace &place, - 
paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::MLUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_clear", &framework::Tensor::clear) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::NPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false, - R"DOC( - Set the data of Tensor on place with given numpy array. - - Args: - lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the - Tensor is to be set. - zero_copy (bool, optional): Whether to share memory with the input numpy array. - This parameter only works with CPUPlace. Default: False. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - )DOC") - - .def( - "shape", - [](framework::Tensor &self) { return vectorize(self.dims()); }, - R"DOC( - Return the shape of Tensor. - - Returns: - list[int]: The shape of Tensor. - - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - print(t.shape()) # [5, 30] - )DOC") - .def("_to_dlpack", - [](framework::Tensor &self) { - DLPackTensor dlpack_tensor(self, 1); - DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); - auto capsule = py::capsule( - static_cast(dmt), "dltensor", [](PyObject *ptr) { - if (ptr) { - auto dltensor = new DLManagedTensor; - try { - dltensor = reinterpret_cast( - PyCapsule_GetPointer(ptr, "used_dltensor")); - return; - } catch (...) { - dltensor = reinterpret_cast( - PyCapsule_GetPointer(ptr, "dltensor")); - } - dltensor->deleter(dltensor); - } - }); - return capsule; - }) - .def("_set_float_element", TensorSetElement) - .def("_get_float_element", TensorGetElement) - .def("_set_double_element", TensorSetElement) - .def("_get_double_element", TensorGetElement) - .def("_place", [](framework::Tensor &self) { return self.place(); }) - .def("_dtype", - [](framework::Tensor &self) { - return framework::TransToProtoVarType(self.type()); - }) - .def("_layout", - [](framework::Tensor &self) { - return DataLayoutToString(self.layout()); - }) - .def("_share_data_with", &framework::Tensor::ShareDataWith) - .def("__getitem__", PySliceTensor, py::return_value_policy::reference) - .def("__str__", - [](const framework::Tensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }) /* ------ End of original Tensor ------ */ - .def("__init__", - [](framework::Tensor &instance, - const std::vector> - &recursive_sequence_lengths) { - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, -1), - true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is %s", - new_lod)); - new (&instance) framework::Tensor(new_offset_lod); - }) - .def("__init__", - [](framework::Tensor &instance) { - new (&instance) framework::Tensor(); - }) - // We implement offset based LOD in C++ while we use length based with - // Python API. So we changed set_lod to set_recursive_sequence_lengths - // to - // avoid misuse. - // The discussion is here: - // https://github.com/PaddlePaddle/Paddle/issues/10855 - .def( - "set_lod", - [](framework::Tensor &self, - const std::vector> &lod) { - // the input lod is offset-based level-of-detail info - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), - true, - platform::errors::InvalidArgument( - "The provided LoD is invalid, the LoD is %s", new_lod)); - self.set_lod(new_lod); - }, - py::arg("lod"), - R"DOC( - Set LoD of the Tensor. - - Args: - lod (list[list[int]]): The lod to set. - - Returns: - None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] - )DOC") - .def( - "set_recursive_sequence_lengths", - [](framework::Tensor &self, - const std::vector> - &recursive_sequence_lengths) { - // the input recursive_sequence_lengths is length-based - // level-of-detail info - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), - true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is " - "%s", - new_lod)); - self.set_lod(new_offset_lod); - }, - py::arg("recursive_sequence_lengths"), - R"DOC( - Set LoD of the Tensor according to recursive sequence lengths. - - For example, if recursive_sequence_lengths=[[2, 3]], which means - there are two sequences with length 2 and 3 respectively, the - corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]]. - - Args: - recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] - print(t.lod()) # [[0, 2, 5]] - )DOC") - .def( - "lod", - [](framework::Tensor &self) -> std::vector> { - // output the offset-based lod info - LoD lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( - Return the LoD of the Tensor. - - Returns: - list[list[int]]: The lod of the Tensor. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] - )DOC") - // Set above comments of set_lod. - .def( - "recursive_sequence_lengths", - [](framework::Tensor &self) -> std::vector> { - // output the length-based lod info - LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( - Return the recursive sequence lengths corresponding to of the LodD - of the Tensor. - - Returns: - list[list[int]]: The recursive sequence lengths. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] - )DOC") - .def( - "has_valid_recursive_sequence_lengths", - [](framework::Tensor &self) -> bool { - // Check that the lod info is valid and match the outermost - // dimension of the Tensor data - return CheckLoD(self.lod(), vectorize(self.dims()).front()); - }, - R"DOC( - Check whether the LoD of the Tensor is valid. - - Returns: - bool: Whether the LoD is valid. - - Examples: - .. 
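The `ConvertToOffsetBasedLoD` call used by `set_recursive_sequence_lengths` above is a running sum: the length-based level `[2, 3]` becomes the offset-based level `[0, 2, 2+3]`, i.e. `[0, 2, 5]`, which is exactly what `lod()` reports afterwards. A small standalone sketch of that arithmetic (plain Python, no Paddle needed):

.. code-block:: python

    from itertools import accumulate

    def lengths_to_offsets(lengths):
        # One length-based LoD level -> its offset-based form.
        return [0] + list(accumulate(lengths))

    print(lengths_to_offsets([2, 3]))     # [0, 2, 5]
    print(lengths_to_offsets([3, 1, 2]))  # [0, 3, 4, 6]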
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.has_valid_recursive_sequence_lengths()) # True - )DOC") - .def("_as_type", - [](const framework::Tensor &self, - paddle::framework::proto::VarType::Type type) { - framework::Tensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TransDataType(self, type, &dst); - } - return dst; - }) - .def("_copy", - [](const framework::Tensor &self, const platform::Place &place) { - // follow fetch_op's inplementation - framework::Tensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TensorCopySync(self, place, &dst); - } else { - // Not copy, if the src tensor is empty. - dst.clear(); - dst.Resize({0}); - } - dst.set_lod(self.lod()); - return dst; -#ifdef _WIN32 - }); -#else - }) -#ifdef PADDLE_WITH_CUDA - .def("_share_buffer_with", - [](framework::Tensor &self, const framework::Tensor src, - py::tuple t) { - auto *cuda_ipc_allocation = - dynamic_cast( - src.Holder().get()); - - PADDLE_ENFORCE_NOT_NULL( - cuda_ipc_allocation, - platform::errors::PreconditionNotMet( - "Tensor is not Cuda IPC shared tensor. " - "Now only Tensor shared by cuda ipc could use this " - "api.")); - - size_t size = t[0].cast(); - auto dtype = - static_cast(t[1].cast()); - auto dims = phi::make_ddim(t[2].cast>()); - auto lod_info = t[3].cast(); - auto device_id = t[4].cast(); - - auto shared_reader_holder = - std::make_shared( - cuda_ipc_allocation->ptr(), - cuda_ipc_allocation->base_ptr(), size, - platform::CUDAPlace(device_id)); - - self.ResetHolderWithType(shared_reader_holder, dtype); - self.Resize(dims); - self.set_lod(lod_info); - - VLOG(6) << "Reconstructed tensor with buffer shared!"; - }, - R"DOC( - Deserialize GPU Tensor for existed shared Cuda IPC tensor. - - Params: - tensor: Shared Cuda IPC tensor. - tuple: contrains data size, data type, - tensor dims, lod information, device index. - - )DOC") - .def("_share_cuda", - [](framework::Tensor self) { - if (!self.IsInitialized() || self.numel() == 0) - throw std::runtime_error( - "Tensor not initialized or numel is 0. could not pass " - "to shared memory. "); - - auto *holder = dynamic_cast( - self.Holder().get()); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(holder->place()), true, - platform::errors::InvalidArgument( - "Tensor is not on GPU. share_cuda only support GPU " - "Tensor, share_filename is for CPU tensor.")); - - void *base_ptr = holder->base_ptr(); - ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - - reinterpret_cast(base_ptr); - - cudaIpcMemHandle_t handle; - PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); - - auto _handle = py::bytes(reinterpret_cast(&handle), - (py::ssize_t)CUDA_IPC_HANDLE_SIZE); - - // TODO(ZHUI): use cuda event, to avoid sync. - const auto &device_id = paddle::platform::GetCurrentDeviceId(); - auto stream = - paddle::platform::stream::get_current_stream(device_id); - stream->Synchronize(); - - int type_idx = static_cast(self.type()); - size_t data_size = - self.numel() * - framework::SizeOfType( - framework::TransToProtoVarType(self.type())); - - return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, - type_idx, vectorize(self.dims()), self.lod(), - device_id); - }, - R"DOC( - Serialize GPU Tensor by cudaIpcMemHandle. - - Returns: - tuple: contrains handle, data size, data type, - tensor dims, lod information, device index. - - Examples: - .. 
code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_cuda() - - )DOC") - .def("_new_shared_cuda", - [](py::tuple t) { - if (t.size() != 7) - throw std::runtime_error( - "Invalid Tensor meta info for shared cuda tensor!"); - - // 1. Create a new C++ instance - framework::Tensor tensor; - - // 2. Rebuild Allocation from handle - const std::string &handle = t[0].cast(); - ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); - auto device_id = t[6].cast(); - auto base_ptr = memory::allocation::GetIpcBasePtr(handle); - size_t size = t[2].cast(); - void *dev = base_ptr.get(); - dev = reinterpret_cast(dev) + offset_bytes; - - auto shared_reader_holder = - std::make_shared( - dev, size, device_id, std::move(base_ptr)); - - // 3. Rebuild Tensor - tensor.ResetHolderWithType( - shared_reader_holder, - static_cast(t[3].cast())); - tensor.Resize(phi::make_ddim(t[4].cast>())); - tensor.set_lod(t[5].cast()); - - return tensor; - }, - R"DOC( - Deserialize GPU lod tensor from cudaIpcMemHandle. - - Params: - tuple: contrains handle, data size, data type, - tensor dims, lod information, device index. - - Examples: - .. code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_cuda() - tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) - - )DOC") -#endif - .def("_share_filename", - [](framework::Tensor &self) { - if (!self.IsInitialized() || self.numel() == 0) - throw std::runtime_error( - "Tensor not initialized or numel is 0. could not pass to " - "shared memory. "); - - auto holder = self.Holder(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(holder->place()) || - platform::is_cuda_pinned_place(holder->place()), - true, platform::errors::InvalidArgument( - "Tensor is not on CPU. share_filename only " - "support CPU Tensor.")); - - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - holder.get()); - // If the tensor is not shared, allocate memory map allocation. - if (mmap_allocation == nullptr) { - void *data_ptr = self.data(); - size_t data_size = - self.numel() * - framework::SizeOfType( - framework::TransToProtoVarType(self.type())); - - int flags = memory::allocation::MAPPED_SHAREDMEM | - memory::allocation::MAPPED_EXCLUSIVE; - std::string handle = memory::allocation::GetIPCName(); - auto shared_holder = - memory::allocation::AllocateRefcountedMemoryMapAllocation( - handle, flags, data_size); - - // copy data & reset holder - if (platform::is_cuda_pinned_place(holder->place())) { -#ifdef PADDLE_WITH_CUDA - memory::Copy(platform::CPUPlace(), shared_holder->ptr(), - platform::CUDAPinnedPlace(), data_ptr, data_size); -#endif - } else { - memory::Copy(platform::CPUPlace(), shared_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); - } - self.ResetHolder(shared_holder); - mmap_allocation = shared_holder.get(); - } - int type_idx = static_cast(self.type()); - - return py::make_tuple(mmap_allocation->ipc_name(), - mmap_allocation->size(), type_idx, - vectorize(self.dims()), self.lod()); - }, - R"DOC( - Serialize CPU lod tensor in shared memory to tuple. - If the tensor is not in shared memory, we will copy it first. - - Returns: - tuple: contrains ipc name, data size, data type, - tensor dims and lod imformation. - - Examples: - .. 
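`_share_cuda()` serializes a GPU tensor into a 7-element tuple `(handle, offset_bytes, data_size, type_idx, dims, lod, device_id)`, and `_new_shared_cuda()` rebuilds a tensor from exactly that tuple. A hedged sketch of the round trip inside a single process (normally the metainfo is sent to another process; requires a GPU build):

.. code-block:: python

    import paddle
    from paddle.fluid import core

    tensor = paddle.ones([3, 3])
    metainfo = tensor.value().get_tensor()._share_cuda()
    # metainfo layout: (handle, offset_bytes, data_size, type_idx, dims, lod, device_id)
    print(metainfo[4])                    # tensor dims, e.g. [3, 3]

    rebuilt = core.LoDTensor._new_shared_cuda(metainfo)
    shared = paddle.to_tensor(rebuilt)
    print(shared.shape)                   # [3, 3]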
code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_filename() - - )DOC") - .def("_new_shared_filename", - [](py::tuple t) { // __setstate__ - if (t.size() != 5) - throw std::runtime_error("Invalid Tensor meta info state!"); - - framework::Tensor tensor; - - // 2. Rebuild Allocation - const std::string &ipc_name = t[0].cast(); - size_t size = t[1].cast(); - int flags = memory::allocation::MAPPED_SHAREDMEM | - memory::allocation::MAPPED_NOCREATE; - - auto shared_holder = - memory::allocation::AllocateRefcountedMemoryMapAllocation( - ipc_name, flags, size); - - // 3. Rebuild Tensor - tensor.ResetHolderWithType( - shared_holder, - static_cast(t[2].cast())); - tensor.Resize(phi::make_ddim(t[3].cast>())); - tensor.set_lod(t[4].cast()); - - return tensor; - }, - R"DOC( - Deserialize CPU lod tensor from shared memory. - - Params: - tuple: contrains ipc file name, data size, data type, - tensor dims and lod information. - - Examples: - .. code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_filename() - tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) - - )DOC") - .def("_shared_incref", - [](framework::Tensor &self) { - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - self.Holder().get()); - if (mmap_allocation) { - mmap_allocation->incref(); - } - }, - R"DOC( - Increase reference count of share_filename tensor. - )DOC") - .def("_shared_decref", - [](framework::Tensor &self) { - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - self.Holder().get()); - if (mmap_allocation) { - mmap_allocation->decref(); - } - }, - R"DOC( - Decrease reference count of share_filename tensor. - )DOC") - .def(py::pickle( - [](const framework::Tensor &t) { // __getstate__ - auto holder = t.Holder(); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, - platform::errors::PreconditionNotMet( - "Tensor is not on CPU." - "Now only Tensor on CPU can be serialized.")); - auto *mmap_writer_allocation = - dynamic_cast( - holder.get()); - PADDLE_ENFORCE_NOT_NULL( - mmap_writer_allocation, - platform::errors::PreconditionNotMet( - "Tensor is not in shared memory." - "Now only Tensor on shared memory can be serialized.")); - int type_idx = static_cast(t.type()); - - return py::make_tuple(mmap_writer_allocation->ipc_name(), - mmap_writer_allocation->size(), type_idx, - vectorize(t.dims()), t.lod()); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 5) - throw std::runtime_error("Invalid Tensor state!"); - - // 1. Create a new C++ instance - framework::Tensor tensor; - - // 2. Rebuild Allocation - const std::string &ipc_name = t[0].cast(); - size_t size = t[1].cast(); - auto shared_reader_holder = - memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name, - size); - - // 3. Maintain global fd set - VLOG(3) << "Tensor ipc name: " << ipc_name; - memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - - // 4. 
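The CPU path mirrors this: `_share_filename()` copies the data into a shared-memory mapping if necessary and returns `(ipc_name, size, type_idx, dims, lod)`, `_new_shared_filename()` attaches a new tensor to that mapping, and `_shared_incref()` / `_shared_decref()` adjust the mapping's reference count. A hedged sketch modeled on the docstring examples above:

.. code-block:: python

    import paddle
    from paddle.fluid import core

    paddle.set_device('cpu')              # _share_filename requires a CPU or CUDA-pinned tensor
    tensor = paddle.ones([3, 3])
    dense = tensor.value().get_tensor()

    metainfo = dense._share_filename()    # (ipc_name, size, type_idx, dims, lod)
    rebuilt = core.LoDTensor._new_shared_filename(metainfo)
    shared = paddle.to_tensor(rebuilt)

    dense._shared_incref()                # manual refcounting on the shared mapping
    dense._shared_decref()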
Rebuild Tensor - tensor.ResetHolderWithType( - shared_reader_holder, - static_cast(t[2].cast())); - tensor.Resize(phi::make_ddim(t[3].cast>())); - tensor.set_lod(t[4].cast()); - - return tensor; - })); -#endif - - py::class_(m, "SelectedRows") - .def("__init__", - [](phi::SelectedRows &instance) { - new (&instance) phi::SelectedRows(); - }) - .def("__init__", - [](phi::SelectedRows &instance, - const std::vector rows, - const int64_t &height) { - new (&instance) phi::SelectedRows(rows, height); - }) - .def( - "get_tensor", - [](phi::SelectedRows &self) { return self.mutable_value(); }, - py::return_value_policy::reference) - .def("numel", - [](phi::SelectedRows &self) -> int64_t { - return self.value().numel(); - }) - .def("set_height", &phi::SelectedRows::set_height) - .def("height", &phi::SelectedRows::height) - .def("set_rows", - [](phi::SelectedRows &self, std::vector rows) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - self.set_rows(rows); -#else - Vector new_rows(rows); - self.set_rows(new_rows); -#endif - }) - .def("sync_index", - [](phi::SelectedRows &instance) { instance.SyncIndex(); }) - .def("rows", [](phi::SelectedRows &self) { - auto rows = self.rows(); - std::vector new_rows; - new_rows.reserve(rows.size()); - std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); - return new_rows; - }); - py::class_(m, "Variable", R"DOC(Variable Class. All parameter, weight, gradient are variables in Paddle. @@ -2089,7 +1171,7 @@ All parameter, weight, gradient are variables in Paddle. .def_static("create", [](paddle::platform::CPUPlace& place) -> paddle::platform::DeviceContext* { - auto* context = new paddle::platform::CPUDeviceContext(); + auto* context = new phi::CPUContext(); context->SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place) @@ -2270,603 +1352,6 @@ All parameter, weight, gradient are variables in Paddle. #endif return devices; }); - py::class_ customplace(m, - "CustomPlace", - R"DOC( - CustomPlace is a descriptor of a device. - It represents a custom device on which a tensor will be allocated and a model will run. - - Examples: - .. 
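The `SelectedRows` binding in the block above exposes the sparse row container to Python: `set_height`/`height` describe the logical dense shape, `set_rows`/`rows` list which rows are present, and `get_tensor` returns the dense value tensor. A minimal sketch, assuming the class is reachable as `paddle.fluid.core.SelectedRows` in the module built here:

.. code-block:: python

    import numpy as np
    from paddle.fluid import core

    sr = core.SelectedRows()              # also constructible as SelectedRows(rows, height)
    sr.set_height(10)                     # logical first dimension of the dense variable
    sr.set_rows([0, 4, 7])                # rows that actually carry data

    value = sr.get_tensor()
    value.set(np.ones((3, 8), dtype='float32'), core.CPUPlace())

    print(sr.height())                    # 10
    print(sr.rows())                      # [0, 4, 7]
    print(sr.numel())                     # 24, the element count of the value tensor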
code-block:: python - - import paddle - fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) - )DOC"); - g_customplace_pytype = reinterpret_cast(customplace.ptr()); - customplace - .def("__init__", - [](platform::CustomPlace &self, - const std::string &device_type, - int dev_id) { -#ifdef PADDLE_WITH_CUSTOM_DEVICE - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), device id must be 0 " - "or " - "positive integer", - device_type, - dev_id); - std::exit(-1); - } - - if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) && - phi::DeviceManager::IsCustom(device_type))) { - int dev_count = static_cast( - phi::DeviceManager::GetDeviceCount(device_type)); - if (UNLIKELY(dev_id >= dev_count)) { - if (dev_count == 0) { - LOG(ERROR) << "Cannot use " << device_type - << " because there is no " << device_type - << " detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), dev_id must " - "inside " - "[0, %d), because %s " - "number on your machine is %d", - device_type, - dev_id, - dev_count, - device_type, - dev_count); - std::exit(-1); - } - } - new (&self) platform::CustomPlace(device_type, dev_id); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), the device type is " - "not registered " - "as a custom device.", - device_type, - dev_id); - std::exit(-1); - } -#else - LOG(ERROR) << string::Sprintf( - "Cannot use CustomDevice because you have installed CPU/GPU" - "version PaddlePaddle.\n" - "If you want to use CustomDevice, please try to install" - "CustomDevice version " - "PaddlePaddle by: pip install paddlepaddle\n" - "If you only have CPU, please change " - "CustomPlace(%s, %d) to be CPUPlace().\n", - device_type, dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("get_device_id", - [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) - .def("get_device_type", - [](const platform::CustomPlace &self) { - return self.GetDeviceType(); - }) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - py::class_ cudaplace(m, "CUDAPlace", R"DOC( - - CUDAPlace is a descriptor of a device. - It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. - Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace, - staring from 0. - The memory of CUDAPlace with different dev_id is not accessible. - Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card. - You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable. - When the program starts, visible GPU devices will be numbered from 0. - If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default, - and the logical ID is the same as the actual ID. - - Parameters: - id (int): GPU device ID. - - Examples: - .. 
code-block:: python - - import paddle - - place = paddle.CUDAPlace(0) - - )DOC"); - g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); - cudaplace - .def("__init__", - [](platform::CUDAPlace &self, int dev_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid CUDAPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - - if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { - if (platform::GetGPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use GPU because there is no GPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " - "number on your machine is %d", - dev_id, - platform::GetGPUDeviceCount(), - platform::GetGPUDeviceCount()); - std::exit(-1); - } - } - - new (&self) platform::CUDAPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use GPU because you have installed CPU version " - "PaddlePaddle.\n" - "If you want to use GPU, please try to install GPU version " - "PaddlePaddle by: pip install paddlepaddle-gpu\n" - "If you only have CPU, please change CUDAPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - .def("get_device_id", - [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_get_device_id", - [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); }) -#endif - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - py::class_ xpuplace(m, "XPUPlace", R"DOC( - **Note**: - Examples: - .. 
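The `CUDAPlace` docstring above stresses that the device id is a logical index into the visible devices, not the physical card number. A short illustrative sketch of that mapping (assumes at least two GPUs; `CUDA_VISIBLE_DEVICES` must be set before the process starts):

.. code-block:: python

    # Launch with: CUDA_VISIBLE_DEVICES=1,0 python this_script.py
    import paddle

    place0 = paddle.CUDAPlace(0)   # logical id 0 -> physical card 1
    place1 = paddle.CUDAPlace(1)   # logical id 1 -> physical card 0
    print(place0)                  # e.g. Place(gpu:0)
    print(place1)                  # e.g. Place(gpu:1)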
code-block:: python - import paddle.fluid as fluid - xpu_place = fluid.XPUPlace(0) - )DOC"); - g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); - xpuplace - .def("__init__", - [](platform::XPUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_XPU - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid XPUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) { - if (platform::GetXPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use XPU because there is no XPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid XPUPlace(%d), must inside [0, %d), because XPU " - "number on your machine is %d", - dev_id, - platform::GetXPUDeviceCount(), - platform::GetXPUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::XPUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use XPU because you have installed CPU/GPU version " - "PaddlePaddle.\n" - "If you want to use XPU, please try to install XPU version " - "PaddlePaddle by: pip install paddlepaddle-xpu\n" - "If you only have CPU, please change XPUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) -#ifdef PADDLE_WITH_XPU - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::XPUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__repr__", string::to_string) - .def("__str__", string::to_string); -#ifdef PADDLE_WITH_XPU - py::enum_(m, "XPUVersion", py::arithmetic()) - .value("XPU1", phi::backends::xpu::XPUVersion::XPU1) - .value("XPU2", phi::backends::xpu::XPUVersion::XPU2) - .export_values(); - m.def("get_xpu_device_count", platform::GetXPUDeviceCount); - m.def("get_xpu_device_version", - [](int device_id) { return platform::get_xpu_version(device_id); }); -#ifdef PADDLE_WITH_XPU_KP - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_kp_op_support_type(op_name, version); - }); -#else - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_op_support_type(op_name, version); - }); -#endif - m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_op_list(version); - }); - m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { - // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > - phi::backends::xpu::XPUVersion::XPU1; - }); - m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { - // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > - phi::backends::xpu::XPUVersion::XPU1; - }); -#endif - - py::class_ cpuplace(m, "CPUPlace", R"DOC( - CPUPlace is a descriptor of a device. - It represents a CPU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
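The XPU helpers registered above (`get_xpu_device_count`, `get_xpu_device_version`, `get_xpu_device_op_list`, `is_float16_supported`, ...) are module-level functions on `core` and only exist in an XPU build. A hedged sketch of querying them:

.. code-block:: python

    import paddle
    from paddle.fluid import core

    # These symbols are compiled in only when PADDLE_WITH_XPU is defined.
    if hasattr(core, 'get_xpu_device_count') and core.get_xpu_device_count() > 0:
        version = core.get_xpu_device_version(0)               # XPU1 or XPU2
        ops = core.get_xpu_device_op_list(version)             # ops registered for that generation
        print(len(ops))
        print(core.is_float16_supported(paddle.XPUPlace(0)))   # True only beyond XPU1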
code-block:: python - - import paddle - cpu_place = paddle.CPUPlace() - - )DOC"); - g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); - cpuplace.def(py::init<>()) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - py::class_ cudapinnedplace( - m, "CUDAPinnedPlace", R"DOC( - CUDAPinnedPlace is a descriptor of a device. - It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. - The host operating system will not paging and exchanging the memory. - It can be accessed through direct memory access technology to speed up the copy of data between the host and GPU. - For more information on CUDA data transfer and `pinned memory`, - please refer to `official document `_ . - - Examples: - .. code-block:: python - - import paddle - place = paddle.CUDAPinnedPlace() - - )DOC"); - g_cudapinnedplace_pytype = - reinterpret_cast(cudapinnedplace.ptr()); - cudapinnedplace - .def("__init__", - [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use CUDAPinnedPlace in CPU only version, " - "Please recompile or reinstall Paddle with CUDA support.")); -#endif - new (&self) platform::CUDAPinnedPlace(); - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - // NPUPlace - py::class_ npuplace(m, "NPUPlace", R"DOC( - NPUPlace is a descriptor of a device. - It represents a NPU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
code-block:: python - import paddle - npu_place = paddle.NPUPlace(0) - - )DOC"); - g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); - npuplace - .def("__init__", - [](platform::NPUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_ASCEND_CL - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid NPUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { - if (platform::GetNPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use NPU because there is no NPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid NPUPlace(%d), must inside [0, %d), because NPU " - "number on your machine is %d", - dev_id, - platform::GetNPUDeviceCount(), - platform::GetNPUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::NPUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use NPU because you have installed CPU/GPU version " - "PaddlePaddle.\n" - "If you want to use NPU, please try to install NPU version " - "PaddlePaddle by: pip install paddlepaddle-npu\n" - "If you only have CPU, please change NPUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) - .def("__str__", string::to_string); - - // IPUPlace - py::class_(m, "IPUPlace", R"DOC( - IPUPlace is a descriptor of a device. - It represents a IPU device on which a tensor will be allocated and a model will run. - - Examples: - .. code-block:: python - import paddle - - # required: ipu - - ipu_place = paddle.IPUPlace() - - )DOC") - .def("__init__", - [](platform::IPUPlace &self) { -#ifdef PADDLE_WITH_IPU - if (platform::GetIPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use IPU because there is no IPU " - "detected on your " - "machine."; - std::exit(-1); - } - // use ipu(0) to comile, while run with the number user configure - // in sharding and pipline. - new (&self) platform::IPUPlace(0); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use IPU because you didn't install IPU version " - "PaddlePaddle.\n" - "If you want to use IPU, please try to install IPU version " - "PaddlePaddle by: pip install paddlepaddle*\n" - "If you only have CPU, please change IPUPlace to be " - "CPUPlace().\n"); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) -#ifdef PADDLE_WITH_IPU - .def("get_device_id", - [](const platform::IPUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__str__", string::to_string); - - // MLUPlace - py::class_ mluplace(m, "MLUPlace", R"DOC( - MLUPlace is a descriptor of a device. - It represents a MLU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
code-block:: python - import paddle - # required: mlu - mlu_place = paddle.MLUPlace(0) - - )DOC"); - g_mluplace_pytype = reinterpret_cast(mluplace.ptr()); - mluplace - .def("__init__", - [](platform::MLUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_MLU - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid MLUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) { - if (platform::GetMLUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use MLU because there is no MLU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid MLUPlace(%d), must inside [0, %d), because MLU " - "number on your machine is %d", - dev_id, - platform::GetMLUDeviceCount(), - platform::GetMLUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::MLUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use MLU because you have installed CPU/GPU/... " - "version " - "PaddlePaddle.\n" - "If you want to use MLU, please try to install MLU version " - "PaddlePaddle by: pip install paddlepaddle-mlu\n" - "If you only have CPU, please change MLUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) -#ifdef PADDLE_WITH_MLU - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::MLUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__str__", string::to_string); - - py::class_ platformplace(m, "Place"); - g_place_pytype = reinterpret_cast(platformplace.ptr()); - platformplace.def(py::init<>()) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("is_gpu_place", - [](platform::Place &self) { return platform::is_gpu_place(self); }) - .def("is_cpu_place", - [](platform::Place &self) { return platform::is_cpu_place(self); }) - .def("is_xpu_place", - [](platform::Place &self) { return platform::is_xpu_place(self); }) - .def("is_npu_place", - [](platform::Place &self) { return platform::is_npu_place(self); }) - .def("is_ipu_place", - [](platform::Place &self) { return platform::is_ipu_place(self); }) - .def("is_cuda_pinned_place", - [](platform::Place &self) { - return platform::is_cuda_pinned_place(self); - }) - .def("is_mlu_place", - [](platform::Place &self) { return platform::is_mlu_place(self); }) - .def( - "is_custom_place", - [](platform::Place &self) { return platform::is_custom_place(self); }) - .def("gpu_device_id", [](platform::Place &self) { return self.device; }) - .def("xpu_device_id", [](platform::Place &self) { return self.device; }) - .def("npu_device_id", [](platform::Place &self) { return self.device; }) - .def("ipu_device_id", [](platform::Place &self) { return self.device; }) - .def("mlu_device_id", [](platform::Place &self) { return self.device; }) - .def("custom_device_id", - [](platform::Place &self) { return self.device; }) - .def("set_place", - [](platform::Place &self, const platform::Place &other) { - self = other; - }) - .def("set_place", - [](platform::Place 
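`Place` is the type-erased wrapper over all the concrete places bound above: `set_place()` accepts any of them and the `is_*_place()` predicates report which one is currently held. A minimal sketch against the `core` module built here:

.. code-block:: python

    import paddle
    from paddle.fluid import core

    p = core.Place()
    p.set_place(paddle.CPUPlace())
    print(p.is_cpu_place())        # True
    print(p.is_gpu_place())        # False

    if paddle.is_compiled_with_cuda():
        p.set_place(paddle.CUDAPlace(0))
        print(p.is_gpu_place())    # True
        print(p.gpu_device_id())   # 0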
&self, const platform::CPUPlace &cpu_place) { - self = cpu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::XPUPlace &xpu_place) { - self = xpu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::CUDAPlace &gpu_place) { - self = gpu_place; - }) - .def("set_place", - [](platform::Place &self, - const platform::CUDAPinnedPlace &cuda_pinned_place) { - self = cuda_pinned_place; - }) - .def("set_place", - [](platform::Place &self, const platform::NPUPlace &npu_place) { - self = npu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::IPUPlace &ipu_place) { - self = ipu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::MLUPlace &mlu_place) { - self = mlu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::CustomPlace &plug_place) { - self = plug_place; - }) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); py::class_(m, "Operator") .def_static("create", @@ -3057,65 +1542,22 @@ All parameter, weight, gradient are variables in Paddle. }); py::class_(m, "StandaloneExecutor") - .def(py::init()) - .def("run", - [](StandaloneExecutor &self, - const std::unordered_map &input_dict, - std::vector fetch_names) { - std::vector feed_tensors; - std::vector feed_names; - - for (auto &item : input_dict) { - framework::LoDTensor t; - SetTensorFromPyArray( - &t, item.second, platform::CPUPlace(), false); - feed_names.push_back(item.first); - feed_tensors.push_back(t); - } - - paddle::framework::FetchList ret; - { - pybind11::gil_scoped_release release; - ret = self.Run(feed_names, feed_tensors, fetch_names); - } - return py::cast(std::move(ret)); - }) - .def("run", - [](StandaloneExecutor &self, - const std::unordered_map - &input_dict, - std::vector fetch_names) { - std::vector feed_tensors; - std::vector feed_names; - - for (auto &item : input_dict) { - feed_names.push_back(item.first); - feed_tensors.push_back(item.second); - } - - paddle::framework::FetchList ret; - { - pybind11::gil_scoped_release release; - ret = self.Run(feed_names, feed_tensors, fetch_names); - } - return py::cast(std::move(ret)); - }) + .def(py::init()) .def("run", [](StandaloneExecutor &self, + Scope *scope, std::vector feed_names, std::vector fetch_names) { paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; - ret = self.Run(feed_names, fetch_names); + ret = self.Run(scope, feed_names, fetch_names); } return py::cast(std::move(ret)); }) .def("dry_run", [](StandaloneExecutor &self, + Scope *scope, const std::unordered_map &input_dict) { std::vector feed_tensors; std::vector feed_names; @@ -3131,7 +1573,7 @@ All parameter, weight, gradient are variables in Paddle. framework::interpreter::CostInfo cost_info; { pybind11::gil_scoped_release release; - cost_info = self.DryRun(feed_names, feed_tensors); + cost_info = self.DryRun(scope, feed_names, feed_tensors); } return cost_info; }); @@ -3225,13 +1667,17 @@ All parameter, weight, gradient are variables in Paddle. #endif m.def("set_feed_variable", - static_cast( - &framework::SetFeedVariable)); + static_cast(&framework::SetFeedVariable)); m.def("set_feed_variable", - static_cast( - &framework::SetFeedVariable)); + static_cast(&framework::SetFeedVariable)); m.def("get_fetch_variable", [](const Scope &scope, const std::string &var_name, @@ -3698,927 +2144,6 @@ All parameter, weight, gradient are variables in Paddle. 
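The executor-related hunks above also rebind `set_feed_variable` so that callers stage feed data into an explicit `Scope`, which matches the new `StandaloneExecutor.run(scope, feed_names, fetch_names)` calling convention. A heavily hedged sketch of the feed side only, assuming the bound signature is `(scope, tensor, var_name, index)` as used by fluid's Python executor:

.. code-block:: python

    import numpy as np
    import paddle
    from paddle.fluid import core

    paddle.enable_static()
    scope = core.Scope()

    t = core.LoDTensor()
    t.set(np.ones((2, 3), dtype='float32'), core.CPUPlace())

    # Stage the tensor as element 0 of the 'feed' variable inside `scope`;
    # an executor later reads it back by name and index.
    core.set_feed_variable(scope, t, 'feed', 0)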
m.def("clear_executor_cache", []() { framework::ExecutorInfoCache::Instance().Finalize(); }); - using VarQuantScale = - std::unordered_map>; - - py::class_> pass(m, "Pass"); - pass.def(py::init()) - .def("has", &ir::Pass::Has) - .def("set_not_owned", - [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { - self.SetNotOwned(attr_name, &attr); - }) - .def( - "set", - [](ir::Pass &self, const std::string &name, const std::string &attr) { - self.Set(name, new std::string(attr)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, bool val) { - self.Set(name, new bool(val)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::vector set) { - self.Set(name, new std::vector(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, VarQuantScale scales) { - self.Set(name, new VarQuantScale(scales)); - }) - .def("type", &ir::Pass::Type) - .def("apply", [](ir::Pass &self, std::shared_ptr graph) { - self.Apply(graph.get()); - }); - - py::class_> pb( - m, "PassBuilder"); - pb.def(py::init()) - .def("append_pass", - [](ir::PassBuilder &self, - const std::string &pass_type) -> std::shared_ptr { - return self.AppendPass(pass_type); - }) - .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) - .def("insert_pass", - [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { - return self.InsertPass(idx, pass_type); - }) - .def("remove_pass", - [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); - - // -- python binds for parallel executor. - py::class_ pe(m, "ParallelExecutor"); - py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( - ExecutionStrategy allows the user to more preciously control how to run - the program in ParallelExecutor by setting the property. - - Returns: - ExecutionStrategy: An ExecutionStrategy object. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - import paddle.nn.functional as F - - paddle.enable_static() - - x = static.data(name='x', shape=[None, 13], dtype='float32') - y = static.data(name='y', shape=[None, 1], dtype='float32') - y_predict = static.nn.fc(input=x, size=1, act=None) - - cost = F.square_error_cost(input=y_predict, label=y) - avg_loss = paddle.mean(cost) - - sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - - train_exe = static.ParallelExecutor(use_cuda=False, - loss_name=avg_loss.name, - exec_strategy=exec_strategy) - )DOC"); - - py::enum_(m, "DeviceType", py::arithmetic()) - .value("CPU", paddle::platform::DeviceType::CPU) - .value("CUDA", paddle::platform::DeviceType::CUDA) - .value("XPU", paddle::platform::DeviceType::XPU); - - exec_strategy.def(py::init()) - .def_property( - "num_threads", - [](const ExecutionStrategy &self) { return self.num_threads_; }, - [](ExecutionStrategy &self, size_t num_threads) { - self.num_threads_ = num_threads; - }, - R"DOC( - The type is INT, num_threads represents the size of thread pool that - used to run the operators of the current program in ParallelExecutor. 
- If :math:`num\_threads=1`, all the operators will execute one by one, - but the order maybe difference between iterations. - If it is not set, it will be set in ParallelExecutor according to the - device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, - :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. - if it is not set, ParallelExecutor will get the cpu count by calling - `multiprocessing.cpu_count()`. Default 0. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - )DOC") - .def_property( - "_use_device", - [](const ExecutionStrategy &self) { return self.use_device_; }, - [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { - self.use_device_ = use_device; - }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because - // use_device isn‘t exposed to users. - .def_property( - "allow_op_delay", - [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, - [](ExecutionStrategy &self, bool allow_op_delay) { - self.allow_op_delay_ = allow_op_delay; - }, - R"DOC(The type is BOOL, allow_op_delay represents whether to delay the - communication operators to run, it may make the execution faster. - Note that this option is invalid now, and it will be removed in - next version. Default False.)DOC") - .def_property( - "num_iteration_per_drop_scope", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_drop_scope_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { - self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }, - R"DOC(The type is INT, num_iteration_per_drop_scope indicates how - many iterations to clean up the temp variables which - is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. - Default 100. - - .. note:: - 1. If you fetch data when calling the 'run', the ParallelExecutor - will clean up the temp variables at the end of the current iteration. - 2. In some NLP model, it may cause the GPU memory is insufficient, - in this case, you should reduce `num_iteration_per_drop_scope`. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_drop_scope = 10 - )DOC") - .def_property( - "num_iteration_per_run", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_run_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_run) { - self.num_iteration_per_run_ = num_iteration_per_run; - }, - R"DOC(This config that how many iteration the executor will run when - user call exe.run() in python。Default: 1. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_run = 10 - )DOC") - .def_property( - "use_thread_barrier", - [](const ExecutionStrategy &self) { return self.thread_barrier_; }, - [](ExecutionStrategy &self, bool use_thread_barrier) { - self.thread_barrier_ = use_thread_barrier; - }, - R"DOC(This config that the this is distributed training with parameter server - )DOC") - .def_property( - "_dry_run", - [](const ExecutionStrategy &self) { return self.dry_run_; }, - [](ExecutionStrategy &self, bool dry_run) { - self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? ExecutionStrategy::kExperimental - : ExecutionStrategy::kDefault; - }); - - py::class_ build_strategy(pe, "BuildStrategy", R"DOC( - BuildStrategy allows the user to more preciously control how to - build the SSA Graph in ParallelExecutor by setting the property. - - Returns: - BuildStrategy: An BuildStrategy object. - - Examples: - .. code-block:: python - - import os - import paddle - import paddle.static as static - - paddle.enable_static() - - os.environ['CPU_NUM'] = str(2) - places = static.cpu_places() - - data = static.data(name="x", shape=[None, 1], dtype="float32") - hidden = static.nn.fc(input=data, size=10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - build_strategy = static.BuildStrategy() - build_strategy.enable_inplace = True - build_strategy.memory_optimize = True - build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - program = static.CompiledProgram(static.default_main_program()) - program = program.with_data_parallel(loss_name=loss.name, - build_strategy=build_strategy, - places=places) -)DOC"); - - py::enum_(build_strategy, "ReduceStrategy") - .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) - .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) - .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); - py::enum_(build_strategy, - "GradientScaleStrategy") - .value("CoeffNumDevice", - BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) - .value("One", BuildStrategy::GradientScaleStrategy::kOne) - .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); - - build_strategy.def(py::init()) - .def("_clear_finalized", &BuildStrategy::ClearFinalized) - .def_property( - "reduce_strategy", - [](const BuildStrategy &self) { return self.reduce_; }, - [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.reduce_ = strategy; - }, - R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce - strategies in ParallelExecutor, AllReduce and Reduce. If you want - that all the parameters' optimization are done on all devices independently, - you should choose AllReduce; otherwise, if you choose Reduce, all the parameters' - optimization will be evenly distributed to different devices, and then - broadcast the optimized parameter to other devices. - Default is 'AllReduce'. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - )DOC") - .def_property( - "gradient_scale_strategy", - [](const BuildStrategy &self) { return self.gradient_scale_; }, - [](BuildStrategy &self, - BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.gradient_scale_ = strategy; - }, - R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three - ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice, - One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` - according to the number of devices. If you want to customize :math:`loss@grad`, - you can choose Customized. Default is 'CoeffNumDevice'. - - Examples: - .. code-block:: python - - import numpy - import os - import paddle - import paddle.static as static - - paddle.enable_static() - - use_cuda = True - place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - exe = static.Executor(place) - - # NOTE: If you use CPU to run the program, you need - # to specify the CPU_NUM, otherwise, paddle will use - # all the number of the logic core as the CPU_NUM, - # in that case, the batch size of the input should be - # greater than CPU_NUM, if not, the process will be - # failed by an exception. - if not use_cuda: - os.environ['CPU_NUM'] = str(2) - places = static.cpu_places() - else: - places = static.cuda_places() - - data = static.data(name='X', shape=[None, 1], dtype='float32') - hidden = static.nn.fc(input=data, size=10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - exe.run(static.default_startup_program()) - - build_strategy = static.BuildStrategy() - build_strategy.gradient_scale_strategy = \ - static.BuildStrategy.GradientScaleStrategy.Customized - compiled_prog = static.CompiledProgram( - static.default_main_program()).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy, - places=places) - - dev_count = len(places) - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 - loss_grad_name = loss.name+"@GRAD" - loss_data = exe.run(compiled_prog, - feed={"X": x, loss_grad_name : loss_grad}, - fetch_list=[loss.name, loss_grad_name]) - )DOC") - .def_property( - "debug_graphviz_path", - [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, - [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.debug_graphviz_path_ = path; - }, - R"DOC((str, optional): debug_graphviz_path indicates the path that - writing the SSA Graph to file in the form of graphviz. - It is useful for debugging. Default is empty string, that is, "" - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.debug_graphviz_path = "./graph" - )DOC") - .def_property( - "enable_sequential_execution", - [](const BuildStrategy &self) { - return self.enable_sequential_execution_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.enable_sequential_execution_ = b; - }, - R"DOC((bool, optional): If set True, the execution order of ops would - be the same as what is in the program. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.enable_sequential_execution = True - )DOC") - .def_property( - "remove_unnecessary_lock", - [](const BuildStrategy &self) { - return self.remove_unnecessary_lock_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.remove_unnecessary_lock_ = b; - }, - R"DOC((bool, optional): If set True, some locks in GPU ops would be - released and ParallelExecutor would run faster. Default is True. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.remove_unnecessary_lock = True - )DOC") - .def_property( - "num_trainers", - [](const BuildStrategy &self) { return self.num_trainers_; }, - [](BuildStrategy &self, int num_trainers) { -#ifdef WIN32 - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); -#endif - self.num_trainers_ = num_trainers; - }) - .def_property( - "trainers_endpoints", - [](const BuildStrategy &self) { return self.trainers_endpoints_; }, - [](BuildStrategy &self, - const std::vector &trainers_endpoints) { - self.trainers_endpoints_ = trainers_endpoints; - }) - .def_property( - "trainer_id", - [](const BuildStrategy &self) { return self.trainer_id_; }, - [](BuildStrategy &self, int trainer_id) { - self.trainer_id_ = trainer_id; - }) - .def_property( - "nccl_comm_num", - [](const BuildStrategy &self) { return self.nccl_comm_num_; }, - [](BuildStrategy &self, int nccl_comm_num) { - self.nccl_comm_num_ = nccl_comm_num; - }) - .def_property( - "bkcl_comm_num", - [](const BuildStrategy &self) { return self.bkcl_comm_num_; }, - [](BuildStrategy &self, int bkcl_comm_num) { - self.bkcl_comm_num_ = bkcl_comm_num; - }) - .def_property( - "use_hierarchical_allreduce", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property( - "hierarchical_allreduce_inter_nranks", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) - - .def_property( - "fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); 
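The properties just above (`num_trainers`, `trainer_id`, `trainers_endpoints`, `nccl_comm_num`, `use_hierarchical_allreduce`, ...) configure multi-trainer runs; setting `num_trainers` raises on Windows, as the binding enforces. A short sketch of setting them together (only meaningful inside a distributed launch):

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    build_strategy = static.BuildStrategy()
    build_strategy.num_trainers = 2
    build_strategy.trainer_id = 0
    build_strategy.trainers_endpoints = ["127.0.0.1:6170", "127.0.0.1:6171"]
    build_strategy.nccl_comm_num = 1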
- self.fuse_elewise_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_elewise_add_act_ops = True - )DOC") - .def_property( - "fuse_gemm_epilogue", - [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_gemm_epilogue_ = b; - }, - R"DOC((bool, optional): fuse_gemm_epilogue indicate whether - to fuse matmul_op, elemenewist_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_gemm_epilogue = True - )DOC") - .def_property( - "fuse_bn_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_bn_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_act_ops indicate whether - to fuse batch_norm and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_bn_act_ops = True - )DOC") - .def_property( - "fuse_bn_add_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_bn_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether - to fuse batch_norm, elementwise_add and activation_op, - it may make the execution faster. Default is True - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_bn_add_act_ops = True - )DOC") - .def_property( - "enable_auto_fusion", - [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.enable_auto_fusion_ = b; - }, - R"DOC((bool, optional): Whether to enable fusing subgraph to a - fusion_group. Now we only support fusing subgraph that composed - of elementwise-like operators, such as elementwise_add/mul - without broadcast and activations. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.enable_auto_fusion = True - )DOC") - .def_property( - "fuse_relu_depthwise_conv", - [](const BuildStrategy &self) { - return self.fuse_relu_depthwise_conv_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_relu_depthwise_conv_ = b; - }, - R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether - to fuse relu and depthwise_conv2d, - it will save GPU memory and may make the execution faster. - This options is only available in GPU devices. - Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_relu_depthwise_conv = True - )DOC") - .def_property( - "fuse_broadcast_ops", - [](const BuildStrategy &self) { - return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_broadcast_ops_ = b; - }, - R"DOC((bool, optional): fuse_broadcast_op indicates whether - to fuse the broadcast ops. Note that, in Reduce mode, - fusing broadcast ops may make the program faster. Because - fusing broadcast OP equals delaying the execution of all - broadcast Ops, in this case, all nccl streams are used only - for NCCLReduce operations for a period of time. Default False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_broadcast_ops = True - )DOC") - .def_property( - "fuse_all_optimizer_ops", - [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_all_optimizer_ops_ = b; - }) - .def_property( - "sync_batch_norm", - [](const BuildStrategy &self) { return self.sync_batch_norm_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.sync_batch_norm_ = b; - }, - R"DOC((bool, optional): sync_batch_norm indicates whether to use - synchronous batch normalization which synchronizes the mean - and variance through multi-devices in training phase. - Current implementation doesn't support FP16 training and CPU. - And only synchronous on one machine, not all machines. - Default is False. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.sync_batch_norm = True - )DOC") - .def_property( - "memory_optimize", - [](const BuildStrategy &self) -> py::object { - if (self.memory_optimize_) { - return py::cast(self.memory_optimize_.get()); - } else { - return py::cast(nullptr); - } - }, - [](BuildStrategy &self, const py::handle &value) { - auto *py_obj = value.ptr(); - if (py_obj == nullptr || py_obj == Py_None) { - self.memory_optimize_ = paddle::none; - } else if (PyBool_Check(py_obj)) { - self.memory_optimize_ = (py_obj == Py_True); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "BuildStrategy.memory_optimize must be set to None, False " - "or True")); - } - }, - R"DOC((bool, optional): memory opitimize aims to save total memory - consumption, set to True to enable it. - - Default None. None means framework would choose to use or not use - this strategy automatically. Currently, None means that it is - enabled when GC is disabled, and disabled when GC is enabled. - True means enabling and False means disabling. Default is None. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.memory_optimize = True - - )DOC") - .def_property( - "is_distribution", - [](const BuildStrategy &self) { return self.is_distribution_; }, - [](BuildStrategy &self, bool b) { -#ifdef WIN32 - if (b) { - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); - } -#else - self.is_distribution_ = b; -#endif - }) - .def_property( - "async_mode", - [](const BuildStrategy &self) { return self.async_mode_; }, - [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) - .def_property( - "enable_inplace", - [](const BuildStrategy &self) { return self.enable_inplace_; }, - [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) - .def_property( - "enable_addto", - [](const BuildStrategy &self) { return self.enable_addto_; }, - [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) - .def_property( - "fuse_all_reduce_ops", - [](const BuildStrategy &self) { - return self.fuse_all_reduce_ops_ == true || - self.fuse_all_reduce_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property( - "enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) - .def_property( - "cache_runtime_context", - [](const BuildStrategy &self) { return self.cache_runtime_context_; }, - [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) - .def_property( - "mkldnn_enabled_op_types", - [](const BuildStrategy &self) { - return self.mkldnn_enabled_op_types_; - }, - [](BuildStrategy &self, - const std::unordered_set &mkldnn_enabled_op_types) { - self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; - }) - .def_property( - "fix_op_run_order", - [](const BuildStrategy &self) { return self.fix_op_run_order_; }, - [](BuildStrategy &self, bool fix_op_run_order) { - self.fix_op_run_order_ = fix_op_run_order; - }) - .def_property( - "allow_cuda_graph_capture", - [](const BuildStrategy &self) { - return self.allow_cuda_graph_capture_; - }, - [](BuildStrategy &self, bool allow_cuda_graph_capture) { - 
self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; - }) - .def("_copy", - [](const BuildStrategy &self) { - auto new_bs = self; - new_bs.ClearFinalized(); - return new_bs; - }) - .def( - "_finalize_strategy_and_create_passes", - [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(true); - }, - R"DOC(Allow user to customized passes. Normally model-specific - optimization passes should be defined in this way. BuildStrategy - cannot be updated after being finalized.)DOC"); - - m.def("_set_cached_executor_build_strategy", - [](int64_t program_id, const BuildStrategy &build_strategy) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - cached_exe_info.SetBuildStrategy(program_id, build_strategy); - }); - - pe.def(py::init &, - const std::vector &, - const std::string &, - Scope *, - std::vector &, - const ExecutionStrategy &, - const BuildStrategy &, - ir::Graph *>()) - // NOTE: even we return a vec* to Python use reference policy. - // We still cannot get local_scope from this vector, since the element - // of vec will be freed by Python GC. We can only return Scope* - // one by one and mark them as reference. - .def( - "local_scopes", - [](ParallelExecutor &self) -> std::vector * { - return &self.GetLocalScopes(); - }, - py::return_value_policy::reference) - .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) - .def("_need_create_local_exe_scopes", - &ParallelExecutor::NeedCreateLocalExeScope) - .def("feed_tensors_into_local_scopes", - &ParallelExecutor::FeedTensorsIntoLocalScopes) - .def("feed_and_split_tensor_into_local_scopes", - &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", - [](ParallelExecutor &self, - const std::vector &fetch_tensors, - bool return_merged) -> py::object { - paddle::framework::FetchResultType ret; - { - pybind11::gil_scoped_release release; - ret = self.Run(fetch_tensors, return_merged); - } - - // TODO(Ruibiao): Refactor the run interface of PE to avoid use - // boost::get here - if (return_merged) { - return py::cast( - std::move(boost::get(ret))); - } else { - return py::cast(std::move( - boost::get(ret))); - } - }) - .def("device_count", &ParallelExecutor::DeviceCount); - #ifdef PADDLE_WITH_IPU py::class_>( @@ -4827,6 +2352,9 @@ All parameter, weight, gradient are variables in Paddle. BindFleetWrapper(&m); BindIO(&m); + BindParallelExecutor(m); + BindPlace(m); + BindTensor(m); #if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS) BindHeterWrapper(&m); diff --git a/paddle/fluid/pybind/pybind_boost_headers.h b/paddle/fluid/pybind/pybind_boost_headers.h index 623ec84acda6f..2a25990944d14 100644 --- a/paddle/fluid/pybind/pybind_boost_headers.h +++ b/paddle/fluid/pybind/pybind_boost_headers.h @@ -18,14 +18,12 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/fluid/platform/variant.h" #include "paddle/utils/variant.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" // Cast paddle::variant for PyBind. 
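The BuildStrategy flags documented in the bindings above are normally combined on a single strategy object and handed to a compiled program before execution. A minimal static-graph sketch follows; the small fc/loss network and the `with_data_parallel` wiring are illustrative assumptions rather than anything this diff introduces.

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    # a tiny network, only so the program has a loss to optimize
    x = static.data(name="x", shape=[None, 13], dtype="float32")
    y = static.data(name="y", shape=[None, 1], dtype="float32")
    pred = static.nn.fc(x, size=1)
    loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

    # combine several of the flags documented above on one strategy object
    build_strategy = static.BuildStrategy()
    build_strategy.fuse_elewise_add_act_ops = True
    build_strategy.fuse_bn_act_ops = True
    build_strategy.memory_optimize = True

    # assumed wiring: pass the strategy to a CompiledProgram before running
    compiled = static.CompiledProgram(
        static.default_main_program()).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)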
// Copy from - // https://github.com/pybind/pybind11/issues/576#issuecomment-269563199 namespace pybind11 { namespace detail { @@ -78,10 +76,7 @@ struct paddle_variant_caster> { using Type = V; template - typename std::enable_if< - !std::is_same::value, - bool>::type - try_load(handle src, bool convert) { + bool try_load(handle src, bool convert) { auto caster = make_caster(); if (!load_success_ && caster.load(src, convert)) { load_success_ = true; @@ -112,13 +107,6 @@ struct paddle_variant_caster> { return false; } - template - typename std::enable_if::value, - bool>::type - try_load(handle src, bool convert) { - return false; - } - bool load(handle src, bool convert) { auto unused = {false, try_load(src, convert)...}; (void)(unused); @@ -128,11 +116,6 @@ struct paddle_variant_caster> { static handle cast(Type const& src, return_value_policy policy, handle parent) { - /* - auto paddle_variant_caster_visitor = [&](Type const& src)->handle { - return make_caster::cast(src, policy, parent); - } - */ paddle_variant_caster_visitor visitor(policy, parent); return paddle::visit(visitor, src); } diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 9c80bb8a67e63..36c09f543a6c2 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -22,7 +22,7 @@ #include #include "Python.h" -#include "boost/optional.hpp" + #include "gflags/gflags.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc new file mode 100644 index 0000000000000..6ee72e0c1630b --- /dev/null +++ b/paddle/fluid/pybind/tensor.cc @@ -0,0 +1,1106 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include 
"paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/tensor.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); + +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { + +PyTypeObject *g_framework_tensor_pytype = nullptr; + +template +static void TensorCopyFrom(framework::Tensor *dst, + const framework::Tensor &src, + const PlaceType &place, + int64_t batch_size) { + if (batch_size < 0) { + 
framework::TensorCopy(src, place, dst); + } else { + auto sliced = src.Slice(0, batch_size); + framework::TensorCopy(sliced, place, dst); + } +} + +void BindTensor(pybind11::module &m) { // NOLINT + using namespace paddle::framework; // NOLINT + py::class_ framework_tensor( + m, "Tensor", py::buffer_protocol()); + g_framework_tensor_pytype = + reinterpret_cast(framework_tensor.ptr()); + framework_tensor + .def("__array__", + [](framework::Tensor &self) { return TensorToPyArray(self); }) + .def("_ptr", + [](const framework::Tensor &self) { + return reinterpret_cast(self.data()); + }) + .def("_slice", &framework::Tensor::Slice) + .def("_numel", &framework::Tensor::numel) + .def("_is_initialized", + [](const framework::Tensor &self) { return self.IsInitialized(); }) + .def("_get_dims", + [](const framework::Tensor &self) { return vectorize(self.dims()); }) + .def("_set_dims", + [](framework::Tensor &self, const std::vector &dim) { + self.Resize(phi::make_ddim(dim)); + }) + .def("_set_layout", + [](framework::Tensor &self, const std::string &layout) { + self.set_layout(StringToDataLayout(layout)); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::NPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_double", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CustomPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::XPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, 
framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CUDAPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::MLUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_clear", &framework::Tensor::clear) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::NPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false, + R"DOC( + Set the data of Tensor on place with given numpy array. + + Args: + lod (numpy.ndarray): The data to set. + place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the + Tensor is to be set. + zero_copy (bool, optional): Whether to share memory with the input numpy array. + This parameter only works with CPUPlace. Default: False. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + )DOC") + + .def( + "shape", + [](framework::Tensor &self) { return vectorize(self.dims()); }, + R"DOC( + Return the shape of Tensor. 
+ + Returns: + list[int]: The shape of Tensor. + + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + print(t.shape()) # [5, 30] + )DOC") + .def("_to_dlpack", + [](framework::Tensor &self) { + DLPackTensor dlpack_tensor(self, 1); + DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); + auto capsule = py::capsule( + static_cast(dmt), "dltensor", [](PyObject *ptr) { + if (ptr) { + auto dltensor = new DLManagedTensor; + try { + dltensor = reinterpret_cast( + PyCapsule_GetPointer(ptr, "used_dltensor")); + return; + } catch (...) { + dltensor = reinterpret_cast( + PyCapsule_GetPointer(ptr, "dltensor")); + } + dltensor->deleter(dltensor); + } + }); + return capsule; + }) + .def("_set_float_element", TensorSetElement) + .def("_get_float_element", TensorGetElement) + .def("_set_double_element", TensorSetElement) + .def("_get_double_element", TensorGetElement) + .def("_place", [](framework::Tensor &self) { return self.place(); }) + .def("_dtype", + [](framework::Tensor &self) { + return framework::TransToProtoVarType(self.type()); + }) + .def("_layout", + [](framework::Tensor &self) { + return DataLayoutToString(self.layout()); + }) + .def("_share_data_with", &framework::Tensor::ShareDataWith) + .def("__getitem__", PySliceTensor, py::return_value_policy::reference) + .def("__str__", + [](const framework::Tensor &self) { + std::stringstream ostr; + ostr << self; + return ostr.str(); + }) /* ------ End of original Tensor ------ */ + .def("__init__", + [](framework::Tensor &instance, + const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, -1), + true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is %s", + new_lod)); + new (&instance) framework::Tensor(new_offset_lod); + }) + .def("__init__", + [](framework::Tensor &instance) { + new (&instance) framework::Tensor(); + }) + // We implement offset based LOD in C++ while we use length based with + // Python API. So we changed set_lod to set_recursive_sequence_lengths + // to + // avoid misuse. + // The discussion is here: + // https://github.com/PaddlePaddle/Paddle/issues/10855 + .def( + "set_lod", + [](framework::Tensor &self, + const std::vector> &lod) { + // the input lod is offset-based level-of-detail info + LoD new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + PADDLE_ENFORCE_EQ( + CheckLoD(new_lod, vectorize(self.dims()).front()), + true, + platform::errors::InvalidArgument( + "The provided LoD is invalid, the LoD is %s", new_lod)); + self.set_lod(new_lod); + }, + py::arg("lod"), + R"DOC( + Set LoD of the Tensor. + + Args: + lod (list[list[int]]): The lod to set. + + Returns: + None. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_lod([[0, 2, 5]]) + print(t.lod()) # [[0, 2, 5]] + )DOC") + .def( + "set_recursive_sequence_lengths", + [](framework::Tensor &self, + const std::vector> + &recursive_sequence_lengths) { + // the input recursive_sequence_lengths is length-based + // level-of-detail info + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is " + "%s", + new_lod)); + self.set_lod(new_offset_lod); + }, + py::arg("recursive_sequence_lengths"), + R"DOC( + Set LoD of the Tensor according to recursive sequence lengths. + + For example, if recursive_sequence_lengths=[[2, 3]], which means + there are two sequences with length 2 and 3 respectively, the + corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]]. + + Args: + recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.recursive_sequence_lengths()) # [[2, 3]] + print(t.lod()) # [[0, 2, 5]] + )DOC") + .def( + "lod", + [](framework::Tensor &self) -> std::vector> { + // output the offset-based lod info + LoD lod = self.lod(); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( + Return the LoD of the Tensor. + + Returns: + list[list[int]]: The lod of the Tensor. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_lod([[0, 2, 5]]) + print(t.lod()) # [[0, 2, 5]] + )DOC") + // Set above comments of set_lod. + .def( + "recursive_sequence_lengths", + [](framework::Tensor &self) -> std::vector> { + // output the length-based lod info + LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( + Return the recursive sequence lengths corresponding to of the LodD + of the Tensor. + + Returns: + list[list[int]]: The recursive sequence lengths. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.recursive_sequence_lengths()) # [[2, 3]] + )DOC") + .def( + "has_valid_recursive_sequence_lengths", + [](framework::Tensor &self) -> bool { + // Check that the lod info is valid and match the outermost + // dimension of the Tensor data + return CheckLoD(self.lod(), vectorize(self.dims()).front()); + }, + R"DOC( + Check whether the LoD of the Tensor is valid. + + Returns: + bool: Whether the LoD is valid. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.has_valid_recursive_sequence_lengths()) # True + )DOC") + .def("_as_type", + [](const framework::Tensor &self, + paddle::framework::proto::VarType::Type type) { + framework::Tensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TransDataType(self, type, &dst); + } + return dst; + }) + .def("_copy", + [](const framework::Tensor &self, const platform::Place &place) { + // follow fetch_op's inplementation + framework::Tensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TensorCopySync(self, place, &dst); + } else { + // Not copy, if the src tensor is empty. + dst.clear(); + dst.Resize({0}); + } + dst.set_lod(self.lod()); + return dst; +#ifdef _WIN32 + }); +#else + }) +#ifdef PADDLE_WITH_CUDA + .def("_share_buffer_with", + [](framework::Tensor &self, const framework::Tensor src, + py::tuple t) { + auto *cuda_ipc_allocation = + dynamic_cast( + src.Holder().get()); + + PADDLE_ENFORCE_NOT_NULL( + cuda_ipc_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not Cuda IPC shared tensor. " + "Now only Tensor shared by cuda ipc could use this " + "api.")); + + size_t size = t[0].cast(); + auto dtype = + static_cast(t[1].cast()); + auto dims = phi::make_ddim(t[2].cast>()); + auto lod_info = t[3].cast(); + auto device_id = t[4].cast(); + + auto shared_reader_holder = + std::make_shared( + cuda_ipc_allocation->ptr(), + cuda_ipc_allocation->base_ptr(), size, + platform::CUDAPlace(device_id)); + + self.ResetHolderWithType(shared_reader_holder, dtype); + self.Resize(dims); + self.set_lod(lod_info); + + VLOG(6) << "Reconstructed tensor with buffer shared!"; + }, + R"DOC( + Deserialize GPU Tensor for existed shared Cuda IPC tensor. + + Params: + tensor: Shared Cuda IPC tensor. + tuple: contrains data size, data type, + tensor dims, lod information, device index. + + )DOC") + .def("_share_cuda", + [](framework::Tensor self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass " + "to shared memory. "); + + auto *holder = dynamic_cast( + self.Holder().get()); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(holder->place()), true, + platform::errors::InvalidArgument( + "Tensor is not on GPU. share_cuda only support GPU " + "Tensor, share_filename is for CPU tensor.")); + + void *base_ptr = holder->base_ptr(); + ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - + reinterpret_cast(base_ptr); + + cudaIpcMemHandle_t handle; + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); + + auto _handle = py::bytes(reinterpret_cast(&handle), + (py::ssize_t)CUDA_IPC_HANDLE_SIZE); + + // TODO(ZHUI): use cuda event, to avoid sync. + const auto &device_id = paddle::platform::GetCurrentDeviceId(); + auto stream = + paddle::platform::stream::get_current_stream(device_id); + stream->Synchronize(); + + int type_idx = static_cast(self.type()); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, + type_idx, vectorize(self.dims()), self.lod(), + device_id); + }, + R"DOC( + Serialize GPU Tensor by cudaIpcMemHandle. + + Returns: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. 
code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + + )DOC") + .def("_new_shared_cuda", + [](py::tuple t) { + if (t.size() != 7) + throw std::runtime_error( + "Invalid Tensor meta info for shared cuda tensor!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation from handle + const std::string &handle = t[0].cast(); + ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); + auto device_id = t[6].cast(); + auto base_ptr = memory::allocation::GetIpcBasePtr(handle); + size_t size = t[2].cast(); + void *dev = base_ptr.get(); + dev = reinterpret_cast(dev) + offset_bytes; + + auto shared_reader_holder = + std::make_shared( + dev, size, device_id, std::move(base_ptr)); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[3].cast())); + tensor.Resize(phi::make_ddim(t[4].cast>())); + tensor.set_lod(t[5].cast()); + + return tensor; + }, + R"DOC( + Deserialize GPU lod tensor from cudaIpcMemHandle. + + Params: + tuple: contains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) + + )DOC") +#endif + .def("_share_filename", + [](framework::Tensor &self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass to " + "shared memory. "); + + auto holder = self.Holder(); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(holder->place()) || + platform::is_cuda_pinned_place(holder->place()), + true, platform::errors::InvalidArgument( + "Tensor is not on CPU. share_filename only " + "supports CPU Tensor.")); + + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + holder.get()); + // If the tensor is not shared, allocate memory map allocation. + if (mmap_allocation == nullptr) { + void *data_ptr = self.data(); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_EXCLUSIVE; + std::string handle = memory::allocation::GetIPCName(); + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + handle, flags, data_size); + + // copy data & reset holder + if (platform::is_cuda_pinned_place(holder->place())) { +#ifdef PADDLE_WITH_CUDA + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CUDAPinnedPlace(), data_ptr, data_size); +#endif + } else { + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + } + self.ResetHolder(shared_holder); + mmap_allocation = shared_holder.get(); + } + int type_idx = static_cast(self.type()); + + return py::make_tuple(mmap_allocation->ipc_name(), + mmap_allocation->size(), type_idx, + vectorize(self.dims()), self.lod()); + }, + R"DOC( + Serialize CPU lod tensor in shared memory to tuple. + If the tensor is not in shared memory, we will copy it first. + + Returns: + tuple: contains ipc name, data size, data type, + tensor dims and lod information. + + Examples: + ..
code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + + )DOC") + .def("_new_shared_filename", + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor meta info state!"); + + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_NOCREATE; + + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + ipc_name, flags, size); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_holder, + static_cast(t[2].cast())); + tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.set_lod(t[4].cast()); + + return tensor; + }, + R"DOC( + Deserialize CPU lod tensor from shared memory. + + Params: + tuple: contrains ipc file name, data size, data type, + tensor dims and lod information. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) + + )DOC") + .def("_shared_incref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->incref(); + } + }, + R"DOC( + Increase reference count of share_filename tensor. + )DOC") + .def("_shared_decref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->decref(); + } + }, + R"DOC( + Decrease reference count of share_filename tensor. + )DOC") + .def(py::pickle( + [](const framework::Tensor &t) { // __getstate__ + auto holder = t.Holder(); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, + platform::errors::PreconditionNotMet( + "Tensor is not on CPU." + "Now only Tensor on CPU can be serialized.")); + auto *mmap_writer_allocation = + dynamic_cast( + holder.get()); + PADDLE_ENFORCE_NOT_NULL( + mmap_writer_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not in shared memory." + "Now only Tensor on shared memory can be serialized.")); + int type_idx = static_cast(t.type()); + + return py::make_tuple(mmap_writer_allocation->ipc_name(), + mmap_writer_allocation->size(), type_idx, + vectorize(t.dims()), t.lod()); + }, + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor state!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + auto shared_reader_holder = + memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name, + size); + + // 3. Maintain global fd set + VLOG(3) << "Tensor ipc name: " << ipc_name; + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + + // 4. 
Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[2].cast())); + tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.set_lod(t[4].cast()); + + return tensor; + })); +#endif + + py::class_(m, "SelectedRows") + .def("__init__", + [](phi::SelectedRows &instance) { + new (&instance) phi::SelectedRows(); + }) + .def("__init__", + [](phi::SelectedRows &instance, + const std::vector rows, + const int64_t &height) { + new (&instance) phi::SelectedRows(rows, height); + }) + .def( + "get_tensor", + [](phi::SelectedRows &self) { return self.mutable_value(); }, + py::return_value_policy::reference) + .def("numel", + [](phi::SelectedRows &self) -> int64_t { + return self.value().numel(); + }) + .def("set_height", &phi::SelectedRows::set_height) + .def("height", &phi::SelectedRows::height) + .def("set_rows", + [](phi::SelectedRows &self, std::vector rows) { +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + self.set_rows(rows); +#else + Vector new_rows(rows); + self.set_rows(new_rows); +#endif + }) + .def("sync_index", + [](phi::SelectedRows &instance) { instance.SyncIndex(); }) + .def("rows", [](phi::SelectedRows &self) { + auto rows = self.rows(); + std::vector new_rows; + new_rows.reserve(rows.size()); + std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); + return new_rows; + }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/tensor.h b/paddle/fluid/pybind/tensor.h new file mode 100644 index 0000000000000..a21236724b885 --- /dev/null +++ b/paddle/fluid/pybind/tensor.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
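The set_lod / set_recursive_sequence_lengths bindings above accept length-based sequence information from Python and store it internally as offset-based LoD. A short round-trip sketch, assuming only the fluid.Tensor API shown in the docstrings:

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    t = fluid.Tensor()
    t.set(np.random.rand(5, 30).astype("float32"), fluid.CPUPlace())

    # length-based form: two sequences of lengths 2 and 3 ...
    t.set_recursive_sequence_lengths([[2, 3]])
    print(t.recursive_sequence_lengths())            # [[2, 3]]
    # ... is stored internally as the offset-based LoD [[0, 2, 2 + 3]]
    print(t.lod())                                   # [[0, 2, 5]]
    print(t.has_valid_recursive_sequence_lengths())  # True
    print(t.shape())                                 # [5, 30]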
+ +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindTensor(pybind11::module& m); // NOLINT + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index bba8526abd7f9..ccec0c060a3a4 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -676,7 +676,7 @@ void SetUVATensorFromPyArray( template void _sliceCompute(const framework::Tensor *in, framework::Tensor *out, - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, const std::vector &axes, const std::vector &starts) { auto &eigen_place = *ctx.eigen_device(); @@ -711,7 +711,7 @@ void _sliceCompute(const framework::Tensor *in, template void _concatCompute(const std::vector &ins, paddle::framework::Tensor *out, - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, int64_t axis) { if (axis == 0 && ins.size() < 10) { size_t output_offset = 0; @@ -729,8 +729,7 @@ void _concatCompute(const std::vector &ins, output_offset += in_stride[axis]; } } else { - paddle::operators::math::ConcatFunctor - concat_functor; + paddle::operators::math::ConcatFunctor concat_functor; concat_functor(ctx, ins, static_cast(axis), out); } } @@ -817,7 +816,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, template void _sliceDapper(const framework::Tensor *in, framework::Tensor *out, - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, const std::vector &axes, const std::vector &starts, int size) { @@ -858,7 +857,7 @@ void _sliceDapper(const framework::Tensor *in, template inline framework::Tensor *_sliceWrapper(const framework::Tensor &self, - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, py::object obj, int dim, int64_t start, @@ -876,7 +875,7 @@ template inline framework::Tensor *_sliceAndConcat(const framework::Tensor &self, py::object obj, int dim) { - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; int64_t start, stop, step, slicelength; _getSliceinfo(self, obj, dim, &start, &stop, &step, &slicelength); if (step == 1 || slicelength == 1) { diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 3846acbde4819..a19fb2d0a8ed9 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -128,11 +128,11 @@ endif() cc_library( infrt SHARED SRCS ${infrt_src} - DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) + DEPS glog ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) cc_library( infrt_static SRCS ${infrt_src} - DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) + DEPS glog ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 9292e593a708f..81b41d61ded3e 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -31,7 +31,6 @@ #include #include -#include "boost/optional.hpp" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/mlir_loader.h" @@ -124,118 +123,118 @@ bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional 
MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); if (val.getType().isInteger(32)) { return val.getValue().getSExtValue(); } } - return boost::none; + return paddle::none; } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); if (val.getType().isInteger(64)) { return val.getValue().getSExtValue(); } } - return boost::none; + return paddle::none; } // TODO(Superjomn) Make double and float parsing share some thing. template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); if (val.getType().isF32()) return val.getValueAsDouble(); } - return boost::none; + return paddle::none; } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); return val.getValue(); } - return boost::none; + return paddle::none; } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); if (val.getType().isF64()) return val.getValueAsDouble(); } - return boost::none; + return paddle::none; } template <> -boost::optional<::infrt::TargetType> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional<::infrt::TargetType> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa<::infrt::TargetAttr>()) return boost::none; + if (!attr.isa<::infrt::TargetAttr>()) return paddle::none; if (attr.isa<::infrt::TargetAttr>()) { return attr.cast<::infrt::TargetAttr>().getTarget(); } - return boost::none; + return paddle::none; } template <> -boost::optional<::infrt::LayoutType> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional<::infrt::LayoutType> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa<::infrt::LayoutAttr>()) return boost::none; + if (!attr.isa<::infrt::LayoutAttr>()) return paddle::none; if (attr.isa<::infrt::LayoutAttr>()) { return attr.cast<::infrt::LayoutAttr>().getLayout(); } - return boost::none; + return paddle::none; } template <> -boost::optional<::infrt::PrecisionType> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional<::infrt::PrecisionType> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa<::infrt::PrecisionAttr>()) return boost::none; + if (!attr.isa<::infrt::PrecisionAttr>()) return paddle::none; if (attr.isa<::infrt::PrecisionAttr>()) { return attr.cast<::infrt::PrecisionAttr>().getPrecision(); } - return boost::none; + return paddle::none; } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; 
return attr.cast().getValue().str(); } -#define PROCESS_ARRAY_INT(type__, bits__) \ - template <> \ - boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ - const mlir::Attribute& attr) { \ - if (!attr.isa()) return boost::none; \ - auto array = attr.cast(); \ - CHECK(!array.empty()); \ - \ - if (!array[0].getType().isInteger(bits__)) { \ - return boost::none; \ - } \ - \ - std::vector res; \ - for (auto& v : array) { \ - res.push_back(v.cast().getValue().getSExtValue()); \ - } \ - return res; \ +#define PROCESS_ARRAY_INT(type__, bits__) \ + template <> \ + paddle::optional> \ + MlirToRuntimeTranslator::EmitAttribute(const mlir::Attribute& attr) { \ + if (!attr.isa()) return paddle::none; \ + auto array = attr.cast(); \ + CHECK(!array.empty()); \ + \ + if (!array[0].getType().isInteger(bits__)) { \ + return paddle::none; \ + } \ + \ + std::vector res; \ + for (auto& v : array) { \ + res.push_back(v.cast().getValue().getSExtValue()); \ + } \ + return res; \ } PROCESS_ARRAY_INT(bool, 1); @@ -244,13 +243,13 @@ PROCESS_ARRAY_INT(int32_t, 32); PROCESS_ARRAY_INT(int64_t, 64); template <> -boost::optional> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; auto array = attr.cast(); CHECK(!array.empty()); - if (!array[0].getType().isF32()) return boost::none; + if (!array[0].getType().isF32()) return paddle::none; std::vector res; for (auto& v : array) { @@ -260,13 +259,13 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( } template <> -boost::optional> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; auto array = attr.cast(); CHECK(!array.empty()); - if (!array[0].getType().isF64()) return boost::none; + if (!array[0].getType().isF64()) return paddle::none; std::vector res; for (auto& v : array) { diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 27a7f20168667..64dc770489c4d 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -75,7 +75,7 @@ class MlirToRuntimeTranslator { bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); template - boost::optional EmitAttribute(const mlir::Attribute& attr); + paddle::optional EmitAttribute(const mlir::Attribute& attr); Value* GetOpResult(mlir::Operation* op); diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 23d96aeb8d5e5..9c5ab13d17b52 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -20,7 +20,6 @@ namespace phi { ::phi::CPUContext CreateCPUContext() { ::phi::CPUContext ctx{}; - ctx.Init(); auto allocator = new backends::CpuPhiAllocator{}; ctx.SetAllocator(allocator); ctx.SetHostAllocator(allocator); diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 5a314817c2420..aa577da60c3ae 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -81,7 +81,6 @@ TEST(ElementwiseAdd, launcher_registry) { ::phi::CPUContext context; context.SetAllocator(alloc); - 
context.Init(); host_context::KernelFrameBuilder kernel_frame_builder; kernel_frame_builder.AddArgument(new host_context::Value(std::move(context))); diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index fa19714dde7db..546b0accf8ba7 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -213,7 +213,7 @@ struct KernelFuncImpl { PD_SPECIALIZE_ComputeCallHelper(const std::vector&); // TODO(chenweihang): support other attribute type if needed. // Why not support other attribute type here? - // - boost::blank, std::vector and std::vector + // - paddle::blank, std::vector and std::vector // are not used in op // - BlockDesc* and std::vector are used in framework diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 750614561c520..5a5aa9638a3be 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -17,13 +17,13 @@ else() DEPS tensor_base dense_tensor phi_api_utils phi_enforce) endif() -set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) +set(api_gen_base ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_base.py) # forward api file -set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py) -set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) +set(api_gen_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_gen.py) +set(api_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/api.yaml) set(legacy_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/legacy_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_api.yaml) set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/api.h) set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/api.cc) set(api_header_file_tmp ${api_header_file}.tmp) @@ -31,11 +31,10 @@ set(api_source_file_tmp ${api_source_file}.tmp) # backward api file set(bw_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py) -set(bw_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/backward_api_gen.py) +set(bw_api_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml) set(legacy_bw_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/legacy_backward.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml) set(bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/backward_api.h) set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc) @@ -44,7 +43,7 @@ set(bw_api_source_file_tmp ${bw_api_source_file}.tmp) # dygraph(intermediate) api file set(im_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/intermediate_api_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/intermediate_api_gen.py) set(dygraph_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.h) set(dygraph_api_source_file @@ -54,9 +53,9 @@ set(dygraph_api_source_file_tmp ${dygraph_api_source_file}.tmp) # sparse api file set(sparse_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/sparse_api_gen.py) set(sparse_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_api.yaml) set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h) set(sparse_api_source_file 
${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) @@ -65,9 +64,9 @@ set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp) # sparse bw api file set(sparse_bw_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py) set(sparse_bw_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_bw_api.yaml) set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h) set(sparse_bw_api_source_file @@ -77,9 +76,9 @@ set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp) # strings api file set(strings_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/strings_api_gen.py) set(strings_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/strings_api.yaml) set(strings_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h) set(strings_api_source_file @@ -89,12 +88,20 @@ set(strings_api_source_file_tmp ${strings_api_source_file}.tmp) # wrapped infermeta file set(wrapped_infermeta_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py) set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) +# op extra info file +set(ops_extra_info_gen_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/ops_extra_info_gen.py) +set(api_compat_yaml_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/api_compat.yaml) +set(ops_extra_info_file + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/ops_extra_info.h) + if(NOT PYTHONINTERP_FOUND) find_package(PythonInterp REQUIRED) endif() @@ -109,7 +116,7 @@ else() endif() # parse apis -set(parsed_api_dir ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/parsed_apis) +set(parsed_api_dir ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/parsed_apis) set(generated_op_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc) set(generated_argument_mapping_path @@ -121,18 +128,20 @@ message( - ${bw_api_yaml_file} - ${legacy_bw_api_yaml_file}") execute_process( - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_api_dir} - COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./api.yaml + COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path ./api.yaml --output_path ./parsed_apis/api.parsed.yaml - COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./legacy_api.yaml - --output_path ./parsed_apis/legacy_api.parsed.yaml - COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./backward.yaml - --output_path ./parsed_apis/backward_api.parsed.yaml --backward + COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path + ./legacy_api.yaml --output_path ./parsed_apis/legacy_api.parsed.yaml + COMMAND + ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path ./backward.yaml + --output_path ./parsed_apis/backward_api.parsed.yaml --backward COMMAND - ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./legacy_backward.yaml - --output_path ./parsed_apis/legacy_backward_api.parsed.yaml --backward - RESULTS_VARIABLE 
_results) + ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path + ./legacy_backward.yaml --output_path + ./parsed_apis/legacy_backward_api.parsed.yaml --backward RESULTS_VARIABLE + _results) foreach(_result in ${_results}) if(${_result}) message(FATAL_ERROR "api yaml parsing failed, exiting.") @@ -144,9 +153,9 @@ message("validate api yaml: - ${parsed_api_dir}/api.parsed.yaml - ${parsed_api_dir}/backward_api.parsed.yaml") execute_process( - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND - ${PYTHON_EXECUTABLE} cross_validate.py --forward_yaml_paths + ${PYTHON_EXECUTABLE} generator/cross_validate.py --forward_yaml_paths ./parsed_apis/api.parsed.yaml ./parsed_apis/legacy_api.parsed.yaml --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml ./parsed_apis/legacy_backward_api.parsed.yaml @@ -161,13 +170,13 @@ message( create or remove auto-geneated argument mappings: ${generated_argument_mapping_path}.tmp" ) execute_process( - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND - ${PYTHON_EXECUTABLE} generate_op.py --api_yaml_path + ${PYTHON_EXECUTABLE} generator/generate_op.py --api_yaml_path ./parsed_apis/api.parsed.yaml --backward_api_yaml_path ./parsed_apis/backward_api.parsed.yaml --api_version_yaml_path - api_version.yaml --api_args_compat_yaml_path args_compat.yaml - --output_op_path "${generated_op_path}.tmp" --output_arg_map_path + api_version.yaml --api_compat_yaml_path api_compat.yaml --output_op_path + "${generated_op_path}.tmp" --output_arg_map_path "${generated_argument_mapping_path}.tmp" RESULT_VARIABLE _result) if(${_result}) @@ -210,6 +219,13 @@ else() message("remove ${generated_argument_mapping_path}") endif() +# generate ops extra info +execute_process( + COMMAND + ${PYTHON_EXECUTABLE} ${ops_extra_info_gen_file} --api_compat_yaml_path + ${api_compat_yaml_file} --ops_extra_info_path ${ops_extra_info_file}) +message("generate ${ops_extra_info_file}") + # generate forward api add_custom_command( OUTPUT ${api_header_file} ${api_source_file} @@ -241,8 +257,8 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp} ${bw_api_source_file} COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}" - DEPENDS ${bw_api_yaml_file} ${legacy_bw_api_yaml_file} ${bw_api_gen_file} - ${api_gen_base} + DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base} + ${legacy_bw_api_yaml_file} VERBATIM) # generate sparse api diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index b68418885ca21..362c9606ebadf 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -871,49 +871,6 @@ std::tuple momentum_impl( ////////////////// Backward(grad) api impls ////////////////////// -// TODO(chenweihang): the original sum grad op can support higher-level -// differentiation, -// but if we use this impl, it will not support. 
We need to be able to reuse -// the autograd API here, which is not yet implemented -// TODO(chenweihang): we should support call generated api in custom api impl -void add_n_grad_impl(const std::vector& x, - const Tensor& out_grad, - std::vector x_grad) { - auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - - Backend kernel_backend = kernel_key.backend(); - DataLayout kernel_layout = kernel_key.layout(); - DataType kernel_data_type = kernel_key.dtype(); - - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "scale", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "add_n_grad API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - VLOG(6) << "add_n_grad API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {}); - - auto dense_x_grad = SetKernelOutput(&x_grad); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - const phi::Scalar&, - float, - bool, - phi::DenseTensor*); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - - for (auto* dense_x_grad_t : dense_x_grad) { - phi::MetaTensor meta_out(dense_x_grad_t); - phi::UnchangedInferMeta(MakeMetaTensor(*dense_out_grad), &meta_out); - (*kernel_fn)( - *dev_ctx, *dense_out_grad, phi::Scalar(1.0), 0.0, true, dense_x_grad_t); - } -} - std::tuple batch_norm_impl( const Tensor& x, const Tensor& scale, diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 627ff2aabf11c..ef695580a0773 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -116,10 +116,6 @@ std::tuple momentum_impl( ////////////////// Backward(grad) api impls ////////////////////// -void add_n_grad_impl(const std::vector& x, - const Tensor& out_grad, - std::vector x_grad); - void conv2d_grad_impl(const Tensor& input, const Tensor& filter, const Tensor& out_grad, diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 4dafc7a7ee579..58795c0f06381 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -284,5 +284,15 @@ std::unique_ptr> PrepareData( return pt_tensors; } +paddle::optional> PrepareData( + const paddle::optional>& inputs, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + if (inputs) { + return {*PrepareData(*inputs, target_args_def, transform_flag)}; + } + return paddle::none; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index 4d70078ef3444..3feba2465f61b 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -76,5 +76,10 @@ std::unique_ptr> PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); +paddle::optional> PrepareData( + const paddle::optional>& inputs, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index a534f02663dff..27e88d217971b 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -57,7 +57,7 @@ BackendSet GetTensorBackendSet(const phi::TensorBase& t) { BackendSet 
backend_set(phi::TransToPhiBackend(t.place())); switch (t.layout()) { case DataLayout::MKLDNN: - backend_set = backend_set | BackendSet(Backend::MKLDNN); + backend_set = backend_set | BackendSet(Backend::ONEDNN); break; default: // do nothing diff --git a/python/paddle/utils/code_gen/api.yaml b/paddle/phi/api/yaml/api.yaml similarity index 73% rename from python/paddle/utils/code_gen/api.yaml rename to paddle/phi/api/yaml/api.yaml index 500ea7b7adc6d..b5703aa57f9da 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/paddle/phi/api/yaml/api.yaml @@ -43,6 +43,15 @@ data_type : x backward : cross_grad +- api : diag + args : (Tensor x, int offset = 0, float padding_value = 0.0) + output : Tensor + infer_meta : + func : DiagInferMeta + kernel : + func : diag + backward : diag_grad + - api : diagonal args : (Tensor x, int offset = 0, int axis1 = 0, int axis2 = 1) output : Tensor @@ -52,6 +61,34 @@ func : diagonal backward : diagonal_grad +- api : digamma + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : digamma + backward : digamma_grad + +- api : dist + args : (Tensor x, Tensor y, float p = 2.0) + output : Tensor + infer_meta : + func : DistInferMeta + kernel : + func : dist + backward : dist_grad + +- api : dot + args : (Tensor x, Tensor y) + output : Tensor + infer_meta : + func : DotInferMeta + kernel : + func : dot + data_type : x + backward : dot_grad + - api : erf args : (Tensor x) output : Tensor diff --git a/paddle/phi/api/yaml/api_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml new file mode 100644 index 0000000000000..a68de3a0f106b --- /dev/null +++ b/paddle/phi/api/yaml/api_compat.yaml @@ -0,0 +1,102 @@ +- api : atan2 + inputs : + {x : X1, y : X2} + outputs : + out : Out + +- api : bernoulli + inputs : + x : X + outputs : + out : Out + +- api : cholesky + inputs : + x : X + outputs : + out : Out + +- api : cholesky_solve + inputs : + {x : X, y : Y} + outputs : + out : Out + +- api : conv2d + extra : + attrs : [bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", bool fuse_alpha = false, bool fuse_beta = false, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = 512, bool exhaustive_search = false] + +- api : cross + inputs : + {x : X, y : Y} + attrs : + axis : dim + outputs : + out : Out + +- api : diag + op_name : diag_v2 + grad_op_name : diag_v2_grad + inputs : + x : X + outputs : + out : Out + +- api : diagonal + inputs : + x : Input + outputs : + out : Out + +- api : digamma + inputs : + x : X + outputs : + out : Out + +- api : dist + inputs : + {x : X, y : Y} + outputs : + out : Out + +- api : dot + inputs : + {x : X, y : Y} + outputs : + out : Out + +- api : erf + inputs : + x : X + outputs : + out : Out + +- api : mv + inputs : + {x : X, vec : Vec} + outputs : + out : Out + +- api : poisson + inputs : + x : X + outputs : + out : Out + +- api : trace + inputs : + x : Input + outputs : + out : Out + +- api : trunc + inputs : + x : X + outputs : + out : Out diff --git a/python/paddle/utils/code_gen/api_version.yaml b/paddle/phi/api/yaml/api_version.yaml similarity index 100% rename from python/paddle/utils/code_gen/api_version.yaml rename to paddle/phi/api/yaml/api_version.yaml diff 
--git a/python/paddle/utils/code_gen/backward.yaml b/paddle/phi/api/yaml/backward.yaml similarity index 71% rename from python/paddle/utils/code_gen/backward.yaml rename to paddle/phi/api/yaml/backward.yaml index 32c6e2c4b63ef..17409f8ae7984 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -39,6 +39,18 @@ func : cross_grad data_type : out_grad +- backward_api : diag_grad + forward : diag (Tensor x, int offset, float padding_value) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int offset) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : diag_grad + data_type : out_grad + no_need_buffer : x + - backward_api : diagonal_grad forward : diagonal (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) args : (Tensor x, Tensor out_grad, int offset = 0, int axis1 = 0, int axis2 = 1) @@ -51,6 +63,37 @@ data_type : out_grad no_need_buffer : x +- backward_api : digamma_grad + forward : digamma (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : digamma_grad + +- backward_api : dist_grad + forward : dist (Tensor x, Tensor y, float p) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, float p) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : dist_grad + +- backward_api : dot_grad + forward : dot (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : dot_grad + data_type : out_grad + - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/python/paddle/utils/code_gen/api_base.py b/paddle/phi/api/yaml/generator/api_base.py similarity index 99% rename from python/paddle/utils/code_gen/api_base.py rename to paddle/phi/api/yaml/generator/api_base.py index aacb4ce55befa..2659d80615f2d 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -131,9 +131,11 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'long': 'long', 'size_t': 'size_t', 'float': 'float', + 'float[]': 'const std::vector&', 'double': 'double', 'bool': 'bool', 'str': 'const std::string&', + 'str[] ': 'const std::vector&', 'Place': 'const Place&', 'DataLayout': 'DataLayout', 'DataType': 'DataType', diff --git a/python/paddle/utils/code_gen/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py similarity index 98% rename from python/paddle/utils/code_gen/api_gen.py rename to paddle/phi/api/yaml/generator/api_gen.py index a0775dd4c0a78..0893d0d5578f9 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/paddle/phi/api/yaml/generator/api_gen.py @@ -154,7 +154,7 @@ def gene_output(self, 0] == 'dense' else 'SetSelectedRowsKernelOutput' if return_type == 'std::vector': assert self.outputs['out_size_expr'][0] is not None, \ - f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
output_create = output_create + f""" {code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr'][0]}, kernel_backend, &api_output);""" @@ -199,7 +199,7 @@ def gene_output(self, if out_dtype_list[i] == 'std::vector': assert self.outputs['out_size_expr'][i] is not None, \ - f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" {code_indent} auto kernel_out_{i} = {set_out_func}({self.outputs['out_size_expr'][i]}, kernel_backend, {get_out_code});""" @@ -313,7 +313,7 @@ def main(): parser.add_argument('--api_yaml_path', help='path to api yaml file', nargs='+', - default='python/paddle/utils/code_gen/api.yaml') + default='paddle/phi/api/yaml/api.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/paddle/phi/api/yaml/generator/backward_api_gen.py similarity index 98% rename from python/paddle/utils/code_gen/backward_api_gen.py rename to paddle/phi/api/yaml/generator/backward_api_gen.py index 2439eff9f63e5..67d47a8ec7432 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/paddle/phi/api/yaml/generator/backward_api_gen.py @@ -133,7 +133,7 @@ def gene_output(self, 0] == 'dense' else 'SetSelectedRowsKernelOutput' if out_dtype_list[0] == 'std::vector': assert self.outputs['out_size_expr'] is not None, \ - f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" {code_indent} auto kernel_out = {set_out_func}(&{self.outputs['names'][0]});""" @@ -164,7 +164,7 @@ def gene_output(self, {code_indent} *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" assert self.outputs['out_size_expr'][i] is not None, \ - f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
output_create = output_create + f""" {code_indent} auto kernel_out_{i} = {set_out_func}(&{self.outputs['names'][i]});""" @@ -279,7 +279,7 @@ def main(): parser.add_argument('--backward_yaml_path', help='path to backward yaml file', nargs='+', - default='python/paddle/utils/code_gen/backward.yaml') + default='paddle/phi/api/yaml/backward.yaml') parser.add_argument('--backward_header_path', help='output of generated backward header code file', default='paddle/phi/api/backward/backward_api.h') diff --git a/python/paddle/utils/code_gen/cross_validate.py b/paddle/phi/api/yaml/generator/cross_validate.py similarity index 100% rename from python/paddle/utils/code_gen/cross_validate.py rename to paddle/phi/api/yaml/generator/cross_validate.py diff --git a/python/paddle/utils/code_gen/filters.py b/paddle/phi/api/yaml/generator/filters.py similarity index 97% rename from python/paddle/utils/code_gen/filters.py rename to paddle/phi/api/yaml/generator/filters.py index d978293fe6f73..cda858ab6e74e 100644 --- a/python/paddle/utils/code_gen/filters.py +++ b/paddle/phi/api/yaml/generator/filters.py @@ -79,9 +79,9 @@ def to_sr_output_type(s): # -------------- transform argument names from yaml to opmaker ------------ def to_opmaker_name(s): if s.endswith("_grad"): - return 'GradVarName("{}")'.format(to_pascal_case(s[:-5])) + return 'GradVarName("{}")'.format(s[:-5]) else: - return '"{}"'.format(to_pascal_case(s)) + return '"{}"'.format(s) def to_opmaker_name_cstr(s): diff --git a/python/paddle/utils/code_gen/generate_op.py b/paddle/phi/api/yaml/generator/generate_op.py similarity index 91% rename from python/paddle/utils/code_gen/generate_op.py rename to paddle/phi/api/yaml/generator/generate_op.py index 469e264812760..bc8b80efb5156 100644 --- a/python/paddle/utils/code_gen/generate_op.py +++ b/paddle/phi/api/yaml/generator/generate_op.py @@ -54,32 +54,21 @@ def restruct_io(api): return api -def main(api_yaml_path, backward_yaml_path, api_args_compat_yaml_path, - api_version_yaml_path, output_op_path, output_arg_map_path): - with open(api_yaml_path, "rt") as f: - apis = yaml.safe_load(f) - apis = [restruct_io(api) for api in apis] - forward_api_dict = to_named_dict(apis) - - with open(backward_yaml_path, "rt") as f: - backward_apis = yaml.safe_load(f) - backward_apis = [restruct_io(api) for api in backward_apis] - backward_api_dict = to_named_dict(backward_apis) - - with open(api_version_yaml_path, "rt") as f: - api_versions = yaml.safe_load(f) - # add api version info into api - for api_version in api_versions: - forward_api_dict[api_version['api']]['version'] = api_version['version'] - - with open(api_args_compat_yaml_path, "rt") as f: - api_args_map = yaml.safe_load(f) - # replace args name for OpMaker - for api_args in api_args_map: +# replace name of op and params for OpMaker +def replace_compat_name(api_op_map, forward_api_dict, backward_api_dict): + for api_args in api_op_map: + if api_args['api'] not in forward_api_dict: + continue forward_api_item = forward_api_dict[api_args['api']] has_backward = True if forward_api_item['backward'] else False if has_backward: backward_api_item = backward_api_dict[forward_api_item['backward']] + if 'op_name' in api_args: + forward_api_item['op_name'] = api_args['op_name'] + if 'grad_op_name' in api_args and has_backward: + forward_api_item['backward'] = api_args['grad_op_name'] + backward_api_item['op_name'] = api_args['grad_op_name'] + key_set = ['inputs', 'attrs', 'outputs'] args_map = {} for key in key_set: @@ -173,6 +162,35 @@ def main(api_yaml_path, 
backward_yaml_path, api_args_compat_yaml_path, for param in backward_api_item['no_need_buffer'] ] + +def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, + api_version_yaml_path, output_op_path, output_arg_map_path): + with open(api_yaml_path, "rt") as f: + apis = yaml.safe_load(f) + apis = [restruct_io(api) for api in apis] + forward_api_dict = to_named_dict(apis) + + with open(backward_yaml_path, "rt") as f: + backward_apis = yaml.safe_load(f) + backward_apis = [restruct_io(api) for api in backward_apis] + backward_api_dict = to_named_dict(backward_apis) + + with open(api_version_yaml_path, "rt") as f: + api_versions = yaml.safe_load(f) + # add api version info into api + for api_version in api_versions: + forward_api_dict[api_version['api']]['version'] = api_version['version'] + + with open(api_compat_yaml_path, "rt") as f: + api_op_map = yaml.safe_load(f) + + for api in apis: + api['op_name'] = api['name'] + for bw_api in backward_apis: + bw_api['op_name'] = bw_api['name'] + + replace_compat_name(api_op_map, forward_api_dict, backward_api_dict) + # fill backward field for an api if another api claims it as forward for name, backward_api in backward_api_dict.items(): forward_name = backward_api["forward"]["name"] @@ -181,11 +199,6 @@ def main(api_yaml_path, backward_yaml_path, api_args_compat_yaml_path, if forward_api["backward"] is None: forward_api["backward"] = name - if forward_name in backward_api_dict: - forward_api = backward_api_dict[forward_name] - if forward_api["backward"] is None: - forward_api["backward"] = name - api_dict = {} api_dict.update(forward_api_dict) api_dict.update(backward_api_dict) @@ -219,7 +232,7 @@ def main(api_yaml_path, backward_yaml_path, api_args_compat_yaml_path, parser.add_argument('--backward_api_yaml_path', type=str, help="parsed backward api yaml file.") - parser.add_argument('--api_args_compat_yaml_path', + parser.add_argument('--api_compat_yaml_path', type=str, help="api args compat yaml file.") parser.add_argument('--api_version_yaml_path', @@ -235,5 +248,5 @@ def main(api_yaml_path, backward_yaml_path, api_args_compat_yaml_path, args = parser.parse_args() main(args.api_yaml_path, args.backward_api_yaml_path, - args.api_args_compat_yaml_path, args.api_version_yaml_path, + args.api_compat_yaml_path, args.api_version_yaml_path, args.output_op_path, args.output_arg_map_path) diff --git a/python/paddle/utils/code_gen/intermediate_api_gen.py b/paddle/phi/api/yaml/generator/intermediate_api_gen.py similarity index 97% rename from python/paddle/utils/code_gen/intermediate_api_gen.py rename to paddle/phi/api/yaml/generator/intermediate_api_gen.py index 017099a64a344..c8ba88d054ac7 100644 --- a/python/paddle/utils/code_gen/intermediate_api_gen.py +++ b/paddle/phi/api/yaml/generator/intermediate_api_gen.py @@ -134,11 +134,11 @@ def main(): parser.add_argument('--api_yaml_path', nargs='+', help='path to api yaml file', - default='python/paddle/utils/code_gen/api.yaml') + default='paddle/phi/api/yaml/api.yaml') parser.add_argument('--sparse_api_yaml_path', help='path to sparse api yaml file', - default='python/paddle/utils/code_gen/sparse_api.yaml') + default='paddle/phi/api/yaml/sparse_api.yaml') parser.add_argument('--dygraph_api_header_path', help='output of generated dygraph api header code file', diff --git a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py new file mode 100644 index 0000000000000..ef5afbf595b96 --- /dev/null +++ b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py 
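The new generator introduced below reads the "extra : attrs" lists from api_compat.yaml (for example the conv2d entry above) and emits a C++ map from op name to its extra attribute defaults. As a rough, self-contained sketch of that transformation (the regex and type map are trimmed copies of the script that follows; the printed formatting is only an approximation of the generated header):

# Illustrative sketch only: turn "extra : attrs" strings from api_compat.yaml
# into the kind of C++ initializer fragments ops_extra_info_gen.py emits.
import re

ATTR_TYPE_STRING_MAP = {'bool': 'bool', 'int': 'int', 'float': 'float',
                        'str': 'std::string', 'float[]': 'std::vector<float>'}

def parse_attr(attr_str):
    # "<type> <name> = <default>" -> (cpp_type, name, default)
    result = re.search(
        r"(?P<attr_type>[a-z0-9_\[\]]+)\s+(?P<name>[a-zA-Z0-9_]+)\s*=\s*(?P<default_val>\S+)",
        attr_str)
    return (ATTR_TYPE_STRING_MAP[result.group('attr_type')],
            result.group('name'), result.group('default_val'))

for attr in ["bool use_mkldnn = false", "int workspace_size_MB = 512"]:
    cpp_type, name, default_val = parse_attr(attr)
    print(f'{{"{name}", {cpp_type}{{{default_val}}}}}')
# -> {"use_mkldnn", bool{false}}
# -> {"workspace_size_MB", int{512}}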
@@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import yaml +import re +import argparse + + +def map_code_template(attrs_str): + return f""" +#include "paddle/fluid/framework/attribute.h" + +namespace paddle {{ +const static std::unordered_map extra_attrs_map = {{ +{attrs_str} +}}; + +}} // namespace paddle + +""" + + +ATTR_TYPE_STRING_MAP = { + 'bool': 'bool', + 'int': 'int', + 'int64_t': 'int64_t', + 'float': 'float', + 'double': 'double', + 'str': 'std::string', + 'int[]': 'std::vector', + 'int64_t[]': 'std::vector', + 'float[]': 'std::vector', + 'double[]': 'std::vector', + 'str[]': 'std::vector' +} + + +def parse_attr(attr_str): + result = re.search( + r"(?P[a-z[\]]+)\s+(?P[a-zA-Z0-9_]+)\s*=\s*(?P\S+)", + attr_str) + return ATTR_TYPE_STRING_MAP[result.group('attr_type')], result.group( + 'name'), result.group('default_val') + + +def generate_extra_info(api_compat_yaml_path, ops_extra_info_path): + compat_apis = [] + with open(api_compat_yaml_path, 'rt') as f: + compat_apis = yaml.safe_load(f) + + extra_map_str_list = [] + + for api_compat_args in compat_apis: + if 'extra' in api_compat_args: + extra_args_map = api_compat_args['extra'] + # TODO(chenweihang): add inputs and outputs + if 'attrs' in extra_args_map: + attr_map_list = [] + for attr in extra_args_map['attrs']: + attr_type, attr_name, default_val = parse_attr(attr) + if attr_type.startswith("std::vector"): + attr_map_list.append( + f"{{\"{attr_name}\", {attr_type}{default_val}}}") + else: + attr_map_list.append( + f"{{\"{attr_name}\", {attr_type}{{{default_val}}}}}" + ) + api_extra_attr_map = ", ".join(attr_map_list) + extra_map_str_list.append( + f"{{\"{api_compat_args['api']}\", {{ {api_extra_attr_map} }}}}" + ) + + ops_extra_info_file = open(ops_extra_info_path, 'w') + ops_extra_info_file.write(map_code_template(",\n".join(extra_map_str_list))) + ops_extra_info_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle Extra Param Info for Op') + parser.add_argument('--api_compat_yaml_path', + help='path to api compat yaml file', + default='paddle/phi/api/yaml/api_compat.yaml') + + parser.add_argument('--ops_extra_info_path', + help='output of generated extra_prama_info code file', + default='paddle/fluid/operators/ops_extra_info.h') + + options = parser.parse_args() + + api_compat_yaml_path = options.api_compat_yaml_path + ops_extra_info_path = options.ops_extra_info_path + + generate_extra_info(api_compat_yaml_path, ops_extra_info_path) + + +if __name__ == '__main__': + main() diff --git a/python/paddle/utils/code_gen/parse_api.py b/paddle/phi/api/yaml/generator/parse_api.py similarity index 100% rename from python/paddle/utils/code_gen/parse_api.py rename to paddle/phi/api/yaml/generator/parse_api.py diff --git a/python/paddle/utils/code_gen/parse_utils.py b/paddle/phi/api/yaml/generator/parse_utils.py similarity index 100% rename from python/paddle/utils/code_gen/parse_utils.py rename 
to paddle/phi/api/yaml/generator/parse_utils.py diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/paddle/phi/api/yaml/generator/sparse_api_gen.py similarity index 94% rename from python/paddle/utils/code_gen/sparse_api_gen.py rename to paddle/phi/api/yaml/generator/sparse_api_gen.py index aa087c9136b13..69bf6950cd822 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_api_gen.py @@ -111,9 +111,8 @@ def gen_sparse_kernel_context(self, kernel_output_names): for param in kernel_param: if param in input_names: if param in self.optional_vars: - raise ValueError( - f"{self.api} : Unsupport optional input({param}) for sparse api." - ) + kernel_context_code = kernel_context_code + f""" + kernel_context.EmplaceBackInput({param} ? {param}->impl().get() : nullptr);""" else: kernel_context_code = kernel_context_code + f""" kernel_context.EmplaceBackInput({param}.impl().get());""" @@ -170,9 +169,14 @@ def get_condition_code(self, kernel_name): condition_list = [] for i, in_type in enumerate(input_types): if in_type == "dense": - condition_list.append( - f"phi::DenseTensor::classof({self.inputs['names'][i]}.impl().get())" - ) + if self.inputs['names'][i] in self.optional_vars: + condition_list.append( + f"(!{self.inputs['names'][i]} || phi::DenseTensor::classof({self.inputs['names'][i]}->impl().get()))" + ) + else: + condition_list.append( + f"phi::DenseTensor::classof({self.inputs['names'][i]}.impl().get())" + ) else: condition_list.append( f"{self.inputs['names'][i]}.layout() == {sparse_type_map[in_type]}" @@ -280,7 +284,7 @@ def main(): description='Generate PaddlePaddle C++ Sparse API files') parser.add_argument('--api_yaml_path', help='path to sparse api yaml file', - default='python/paddle/utils/code_gen/sparse_api.yaml') + default='paddle/phi/api/yaml/sparse_api.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py similarity index 96% rename from python/paddle/utils/code_gen/sparse_bw_api_gen.py rename to paddle/phi/api/yaml/generator/sparse_bw_api_gen.py index 834e3d45d0b85..f3172a23cb991 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py @@ -109,6 +109,7 @@ def source_include(header_file_path): #include "glog/logging.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/sparse_api_custom_impl.h" @@ -162,10 +163,9 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): def main(): parser = argparse.ArgumentParser( description='Generate PaddlePaddle C++ Sparse API files') - parser.add_argument( - '--api_yaml_path', - help='path to sparse api yaml file', - default='python/paddle/utils/code_gen/sparse_bw_api.yaml') + parser.add_argument('--api_yaml_path', + help='path to sparse api yaml file', + default='paddle/phi/api/yaml/sparse_bw_api.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/python/paddle/utils/code_gen/strings_api_gen.py b/paddle/phi/api/yaml/generator/strings_api_gen.py similarity index 99% rename from python/paddle/utils/code_gen/strings_api_gen.py rename to paddle/phi/api/yaml/generator/strings_api_gen.py index 1f3ec587d7413..bb5a7a2413d8e 100644 --- a/python/paddle/utils/code_gen/strings_api_gen.py +++ 
b/paddle/phi/api/yaml/generator/strings_api_gen.py @@ -351,7 +351,7 @@ def main(): description='Generate PaddlePaddle C++ Strings API files') parser.add_argument('--api_yaml_path', help='path to sparse api yaml file', - default='python/paddle/utils/code_gen/strings_api.yaml') + default='paddle/phi/api/yaml/strings_api.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/python/paddle/utils/code_gen/templates/ks.c.j2 b/paddle/phi/api/yaml/generator/templates/ks.c.j2 similarity index 70% rename from python/paddle/utils/code_gen/templates/ks.c.j2 rename to paddle/phi/api/yaml/generator/templates/ks.c.j2 index 54618f0e1e6a1..0ff6d91fc20ca 100644 --- a/python/paddle/utils/code_gen/templates/ks.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/ks.c.j2 @@ -1,5 +1,5 @@ -{% from "operator_utils.c.j2" import name_map, register_name_map %} -// this file is generated by python/paddle/utils/code_gen/generate_op.py, do not edit. +{% from "operator_utils.c.j2" import name_map, register_name_map, register_base_kernel_name %} +// this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. #include "paddle/phi/core/compat/op_utils.h" #include "paddle/utils/small_vector.h" @@ -18,6 +18,9 @@ namespace phi { } // namespace phi {% for api in apis + backward_apis %} + {% if api["name"] != api["op_name"] %} +{{register_base_kernel_name(api)}} + {% endif %} {% if api is base_api %} {{register_name_map(api)}} {% endif %} diff --git a/python/paddle/utils/code_gen/templates/op.c.j2 b/paddle/phi/api/yaml/generator/templates/op.c.j2 similarity index 90% rename from python/paddle/utils/code_gen/templates/op.c.j2 rename to paddle/phi/api/yaml/generator/templates/op.c.j2 index 5c9559d1c89f8..0c2708ce223c7 100644 --- a/python/paddle/utils/code_gen/templates/op.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/op.c.j2 @@ -1,5 +1,5 @@ {% from "operator_utils.c.j2" import op_maker, backward_op_maker, operator, register_op_with_components, register_op_version %} -// this file is generated by python/paddle/utils/code_gen/generate_op.py, do not edit. +// this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. 
#include #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" @@ -11,7 +11,7 @@ #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/infermeta/multiary.h" -#include "paddle/phi/infermeta/backward.cc" +#include "paddle/phi/infermeta/backward.h" namespace paddle { namespace operators { diff --git a/python/paddle/utils/code_gen/templates/operator_utils.c.j2 b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 similarity index 94% rename from python/paddle/utils/code_gen/templates/operator_utils.c.j2 rename to paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 index 9a593a99c1df0..0e684664c4884 100644 --- a/python/paddle/utils/code_gen/templates/operator_utils.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 @@ -1,6 +1,6 @@ {# ----------------------------- op maker ----------------------------------- #} {% macro op_maker(api) %} - {% set api_name = api["name"] %} + {% set api_name = api["op_name"] %} class {{api_name | to_pascal_case}}OpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -124,9 +124,12 @@ All possible KernelSignatures returned by {{api["name"] | to_pascal_case }}OpArg */ {% endmacro %} +{% macro register_base_kernel_name(api) %} +PD_REGISTER_BASE_KERNEL_NAME({{api["op_name"]}}, {{api["name"]}}); +{%- endmacro %} {% macro register_name_map(api) %} -PD_REGISTER_ARG_MAPPING_FN({{api["name"]}}, phi::{{api["name"] | to_pascal_case}}OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN({{api["op_name"]}}, phi::{{api["name"] | to_pascal_case}}OpArgumentMapping); {%- endmacro %} {% macro get_input_list(inputs, kernel_args) %}{# inline #} @@ -196,7 +199,7 @@ framework::OpKernelType GetExpectedKernelType( {# --------------------------------------- operator ---------------------------------------------- #} {% macro operator(api) %} -class {{api["name"] | to_pascal_case}}Op : public framework::OperatorWithKernel { +class {{api["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; {# ----------- get expected kernel type function -------------------------- #} @@ -209,7 +212,7 @@ class {{api["name"] | to_pascal_case}}Op : public framework::OperatorWithKernel {% endif %} }; -DECLARE_INFER_SHAPE_FUNCTOR({{api["name"]}}, {{api["name"] | to_pascal_case}}InferShapeFunctor, +DECLARE_INFER_SHAPE_FUNCTOR({{api["op_name"]}}, {{api["op_name"] | to_pascal_case}}InferShapeFunctor, PD_INFER_META(phi::{{api["infer_meta"]["func"]}})); {# inplace inferer #} {% if api["inplace"] is not none %} @@ -218,19 +221,19 @@ DECLARE_INFER_SHAPE_FUNCTOR({{api["name"]}}, {{api["name"] | to_pascal_case}}Inf {{"{"}}{{source | to_opmaker_name}}, {{target | to_opmaker_name}}{{"}"}}{{", " if not loop.last}} {%- endfor %} {%- endset %} -DECLARE_INPLACE_OP_INFERER({{api["name"] | to_pascal_case}}InplaceInferer, +DECLARE_INPLACE_OP_INFERER({{api["op_name"] | to_pascal_case}}InplaceInferer, {{inplace_map}}); {% endif %} {# no_need_buffer inferer #} {% if api["no_need_buffer"] is not none %} -DECLARE_NO_NEED_BUFFER_VARS_INFERER({{api["name"] | to_pascal_case}}NoNeedBufferVarInferer, +DECLARE_NO_NEED_BUFFER_VARS_INFERER({{api["op_name"] | to_pascal_case}}NoNeedBufferVarInferer, {{api["no_need_buffer"] | map("to_opmaker_name") | join(", ")}}); {% endif %} {% endmacro%} {% macro register_op_with_components(api) %} -{% set name = api["name"] %} +{% set name = api["op_name"] %} 
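The template changes here switch from api["name"] to api["op_name"] so that an api which keeps a legacy operator name (declared via op_name/grad_op_name in api_compat.yaml, e.g. diag -> diag_v2) is registered under the legacy name while the kernel keeps the new name. A minimal sketch, assuming the small data literals below, of the renaming step done in replace_compat_name and of the extra registration the ks.c.j2 template then emits:

# Minimal sketch, not the real generator: apply an api_compat.yaml op_name
# override and emit the base-kernel-name mapping only when the names differ.
forward_api_dict = {"diag": {"name": "diag", "op_name": "diag", "backward": "diag_grad"}}
compat_entry = {"api": "diag", "op_name": "diag_v2", "grad_op_name": "diag_v2_grad"}

item = forward_api_dict[compat_entry["api"]]
if "op_name" in compat_entry:
    item["op_name"] = compat_entry["op_name"]  # operator keeps the legacy name

if item["name"] != item["op_name"]:
    # corresponds to {{register_base_kernel_name(api)}} in ks.c.j2
    print(f'PD_REGISTER_BASE_KERNEL_NAME({item["op_name"]}, {item["name"]});')
# -> PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag);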
REGISTER_OPERATOR({{name}}, ops::{{name | to_pascal_case}}Op, {% if not "forward" in api %}{# it is a forward api #} ops::{{name | to_pascal_case}}OpMaker, @@ -254,7 +257,7 @@ REGISTER_OPERATOR({{name}}, ops::{{name | to_pascal_case}}Op, {% macro register_op_version(api) %} {% if "version" in api %} -{% set name = api["name"] %} +{% set name = api["op_name"] %} REGISTER_OP_VERSION({{name}}) {% for checkpoint in api["version"]%} .AddCheckpoint( @@ -296,7 +299,7 @@ REGISTER_OP_VERSION({{name}}) {# --------------------------------------- backward op maker ---------------------------------------------- #} {% macro backward_op_maker(api, forward_api) %} - {% set name = api["name"] %} + {% set name = api["op_name"] %} {% set forward_input_names = api["forward"]["inputs"] | map(attribute="name") | list %} {% set forward_output_names = api["forward"]["outputs"] | map(attribute="name") | list %} {% set forward_attr_names = api["forward"]["attrs"] | map(attribute="name") | list %} @@ -355,15 +358,15 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker input_orig_names, output_orig_names) %}{# inline #} {% if name in input_names %} {% set name_in_forward_orig = input_orig_names[input_names.index(name)]%} -Input("{{name_in_forward_orig | to_pascal_case}}") +Input("{{name_in_forward_orig}}") {%- elif name in output_names %} {% set name_in_forward_orig = output_orig_names[output_names.index(name)]%} -Output("{{name | to_pascal_case}}") +Output("{{name}}") {%- elif name.endswith("_grad") %}{# output grad#} {% set name_in_forward = name[:-5] %} {% if name_in_forward in output_names %} {% set name_in_forward_orig = output_orig_names[output_names.index(name_in_forward)] %} -OutputGrad("{{name_in_forward_orig | to_pascal_case}}") +OutputGrad("{{name_in_forward_orig}}") {%- endif %} {%- endif %} {%- endmacro %} @@ -373,11 +376,11 @@ OutputGrad("{{name_in_forward_orig | to_pascal_case}}") {% if name[:-5] in input_names %} {% set name_in_forward = name[:-5] %} {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} -InputGrad("{{name[:-5] | to_pascal_case}}") +InputGrad("{{name[:-5]}}") {%- elif (name | to_input_name) in input_names %} {% set name_in_forward = name | to_input_name %} {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} -InputGrad("{{name | to_input_name | to_pascal_case}}") +InputGrad("{{name | to_input_name}}") {%- endif %} {%- endmacro %} diff --git a/python/paddle/utils/code_gen/tests.py b/paddle/phi/api/yaml/generator/tests.py similarity index 100% rename from python/paddle/utils/code_gen/tests.py rename to paddle/phi/api/yaml/generator/tests.py diff --git a/python/paddle/utils/code_gen/type_mapping.py b/paddle/phi/api/yaml/generator/type_mapping.py similarity index 100% rename from python/paddle/utils/code_gen/type_mapping.py rename to paddle/phi/api/yaml/generator/type_mapping.py diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py similarity index 98% rename from python/paddle/utils/code_gen/wrapped_infermeta_gen.py rename to paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py index 56a55cfe80629..99da6ce3d955f 100644 --- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py +++ b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py @@ -160,7 +160,7 @@ def main(): parser.add_argument('--api_yaml_path', help='path to api yaml file', nargs='+', - default='python/paddle/utils/code_gen/api.yaml') + 
default='paddle/phi/api/yaml/api.yaml') parser.add_argument( '--wrapped_infermeta_header_path', help='output of generated wrapped_infermeta header code file', diff --git a/python/paddle/utils/code_gen/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml similarity index 98% rename from python/paddle/utils/code_gen/legacy_api.yaml rename to paddle/phi/api/yaml/legacy_api.yaml index 8d20833c652cc..b9e7361abea7d 100644 --- a/python/paddle/utils/code_gen/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -167,6 +167,15 @@ func : argsort backward : argsort_grad +- api : as_real + args : (Tensor x) + output : Tensor + infer_meta : + func : AsRealInferMeta + kernel : + func : as_real +# backward : as_complex + # asin - api : asin args : (Tensor x) @@ -342,6 +351,15 @@ func : clip backward : clip_grad +- api : complex + args : (Tensor x, Tensor y) + output : Tensor + infer_meta : + func : ComplexInferMeta + kernel : + func : complex + backward : complex_grad + - api : concat args : (Tensor[] x, Scalar(int64_t) axis) output : Tensor @@ -489,32 +507,6 @@ func : determinant backward : det_grad -- api : diag - args : (Tensor x, int offset, float padding_value) - output : Tensor - infer_meta : - func : DiagInferMeta - kernel : - func : diag - -- api : digamma - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : digamma - backward : digamma_grad - -- api : dist - args : (Tensor x, Tensor y, float p) - output : Tensor - infer_meta : - func : DistInferMeta - kernel : - func : dist - backward : dist_grad - - api : divide args : (Tensor x, Tensor y) output : Tensor @@ -524,14 +516,6 @@ func : divide backward : divide_grad -- api : dot - args : (Tensor x, Tensor y) - output : Tensor - infer_meta : - func : DotInferMeta - kernel : - func : dot - - api : dropout args : (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) output : Tensor(out), Tensor(mask) @@ -553,6 +537,14 @@ func : eigh backward : eigh_grad +- api : eigvals + args : (Tensor x) + output : Tensor + infer_meta : + func : EigvalsInferMeta + kernel : + func : eigvals + - api : einsum args : (Tensor[] x, str equation) output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()} @@ -629,14 +621,14 @@ kernel : func : equal_all -# erfinv - api : erfinv args : (Tensor x) - output : Tensor + output : Tensor(out) infer_meta : func : UnchangedInferMeta kernel : func : erfinv + inplace : (x -> out) backward : erfinv_grad # exp @@ -2231,6 +2223,15 @@ func : unique data_type : x +- api : unique_consecutive + args : (Tensor x, bool return_inverse, bool return_counts, int[] axis, int dtype) + output : Tensor(out), Tensor(index), Tensor(counts) + infer_meta : + func : UniqueConsecutiveInferMeta + kernel : + func : unique_consecutive + data_type : x + - api : unsqueeze args : (Tensor x, IntArray axis) output : Tensor(out), Tensor(xshape) diff --git a/python/paddle/utils/code_gen/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml similarity index 98% rename from python/paddle/utils/code_gen/legacy_backward.yaml rename to paddle/phi/api/yaml/legacy_backward.yaml index 16d58fde77ffe..b4972c68a6477 100644 --- a/python/paddle/utils/code_gen/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -71,13 +71,6 @@ backward : add_double_grad inplace : (out_grad -> x_grad) -- backward_api : add_n_grad - forward : add_n (Tensor[] x) -> Tensor(out) - args : (Tensor[] x, Tensor out_grad) - output : Tensor[](x_grad){x.size()} - invoke : add_n_grad_impl(x, 
out_grad, x_grad) - no_need_buffer : x - - backward_api : add_triple_grad forward : add_double_grad (Tensor y, Tensor grad_out, Tensor grad_grad_x, Tensor grad_grad_y, int axis = -1) -> Tensor(grad_grad_out) args : (Tensor grad_grad_x, Tensor grad_grad_y, Tensor grad_grad_out_grad, int axis = -1) @@ -133,6 +126,17 @@ func : asinh_grad inplace : (out_grad -> x_grad) +- backward_api : assign_double_grad + forward : assign_grad (Tensor grad_out) -> Tensor(grad_x) + args : (Tensor grad_x_grad) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + kernel : + func : assign + backward: assign_triple_grad + inplace : (grad_x_grad -> grad_out_grad) + - backward_api : assign_grad forward : assign (Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -141,6 +145,7 @@ func : UnchangedInferMeta kernel : func : assign + backward: assign_double_grad inplace : (out_grad -> x_grad) - backward_api : assign_out__grad @@ -153,6 +158,16 @@ func : assign inplace : (out_grad -> x_grad) +- backward_api : assign_triple_grad + forward : assign_double_grad (Tensor grad_out) -> Tensor(grad_x) + args : (Tensor grad_x_grad) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + kernel : + func : assign + inplace : (grad_x_grad -> grad_out_grad) + - backward_api : atan_grad forward : atan (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -291,6 +306,16 @@ backward : clip_double_grad inplace : (out_grad -> x_grad) +- backward_api : complex_grad + forward : complex (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : ComplexGradInferMeta + kernel : + func : complex_grad + data_type : x + - backward_api : concat_double_grad forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) args : (Tensor[] grad_x_grad, Scalar axis = 0) @@ -498,26 +523,6 @@ kernel : func : determinant_grad -- backward_api : digamma_grad - forward : digamma (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : digamma_grad - -- backward_api : dist_grad - forward : dist (Tensor x, Tensor y, float p) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, float p) - output : Tensor(x_grad), Tensor(y_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, y] - kernel : - func : dist_grad - - backward_api : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) @@ -1843,6 +1848,16 @@ func : sinh_grad inplace : (out_grad -> x_grad) +- backward_api : slice_double_grad + forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) + args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_input_grad] + kernel : + func : slice + - backward_api : slice_grad forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) args : (Tensor input, Tensor out_grad, int64_t[] axes, IntArray starts, IntArray ends, 
int64_t[] infer_flags, int64_t[] decrease_axis) @@ -1852,6 +1867,7 @@ param : [input] kernel : func : slice_grad + backward : slice_double_grad no_need_buffer : input - backward_api : soft_shrink_grad diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml new file mode 100644 index 0000000000000..e816824b82f72 --- /dev/null +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -0,0 +1,331 @@ +- api : abs + args : (Tensor x) + output : Tensor(out) + kernel : + func : abs_coo{sparse_coo -> sparse_coo}, + abs_csr{sparse_csr -> sparse_csr} + layout : x + backward : abs_grad + +- api : acos + args : (Tensor x) + output : Tensor(out) + kernel : + func : acos_coo{sparse_coo -> sparse_coo}, + acos_csr{sparse_csr -> sparse_csr} + layout : x + backward : acos_grad + +- api : acosh + args : (Tensor x) + output : Tensor(out) + kernel : + func : acosh_coo{sparse_coo -> sparse_coo}, + acosh_csr{sparse_csr -> sparse_csr} + layout : x + backward : acosh_grad + +- api : add + args : (Tensor x, Tensor y) + output : Tensor(out) + kernel : + func : add_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, + add_csr_csr{sparse_csr, sparse_csr -> sparse_csr} + layout : x + backward : add_grad + +- api : asin + args : (Tensor x) + output : Tensor(out) + kernel : + func : asin_coo{sparse_coo -> sparse_coo}, + asin_csr{sparse_csr -> sparse_csr} + layout : x + backward : asin_grad + +- api : asinh + args : (Tensor x) + output : Tensor(out) + kernel : + func : asinh_coo{sparse_coo -> sparse_coo}, + asinh_csr{sparse_csr -> sparse_csr} + layout : x + backward : asinh_grad + +- api : atan + args : (Tensor x) + output : Tensor(out) + kernel : + func : atan_coo{sparse_coo -> sparse_coo}, + atan_csr{sparse_csr -> sparse_csr} + layout : x + backward : atan_grad + +- api : atanh + args : (Tensor x) + output : Tensor(out) + kernel : + func : atanh_coo{sparse_coo -> sparse_coo}, + atanh_csr{sparse_csr -> sparse_csr} + layout : x + backward : atanh_grad + +- api : cast + args : (Tensor x, DataType index_dtype=DataType::UNDEFINED, DataType value_dtype=DataType::UNDEFINED) + output : Tensor(out) + kernel : + func : cast_coo{sparse_coo -> sparse_coo}, + cast_csr{sparse_csr -> sparse_csr} + layout : x + data_type : x + backward : cast_grad + +- api : conv3d + args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) + output : Tensor(out), Tensor(rulebook) + kernel : + func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense} + layout : x + intermediate : rulebook + backward : conv3d_grad + +- api : coo_to_dense + args : (Tensor x) + output : Tensor(out) + invoke : to_dense_impl(x) + backward : coo_to_dense_grad + +- api : create_sparse_coo_tensor + args : (Tensor values, Tensor indices, IntArray dense_shape) + output : Tensor(out) + kernel : + func : sparse_coo_tensor{dense, dense -> sparse_coo} + layout : values + data_type : values + backward : create_sparse_coo_tensor_grad + +- api : dense_to_coo + args : (Tensor x, int64_t sparse_dim) + output : Tensor(out) + invoke : to_sparse_coo_impl(x, sparse_dim) + backward : dense_to_coo_grad + +- api : divide + args : (Tensor x, Tensor y) + output : Tensor(out) + kernel : + func : divide_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, + divide_csr_csr{sparse_csr, sparse_csr -> sparse_csr} + layout : x + backward : divide_grad + +- api : divide_scalar + args : (Tensor x, float scalar) + output : Tensor(out) + kernel : + func : divide_coo_scalar{sparse_coo -> sparse_coo}, + divide_csr_scalar{sparse_csr -> sparse_csr} + backward 
: divide_scalar_grad + +- api : log1p + args : (Tensor x) + output : Tensor(out) + kernel : + func : log1p_coo{sparse_coo -> sparse_coo}, + log1p_csr{sparse_csr -> sparse_csr} + layout : x + backward : log1p_grad + +- api : multiply + args : (Tensor x, Tensor y) + output : Tensor(out) + kernel : + func : multiply_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, + multiply_csr_csr{sparse_csr, sparse_csr -> sparse_csr} + layout : x + backward : multiply_grad + +- api : pow + args : (Tensor x, float factor) + output : Tensor(out) + kernel : + func : pow_coo{sparse_coo -> sparse_coo}, + pow_csr{sparse_csr -> sparse_csr} + layout : x + backward : pow_grad + +- api : relu + args : (Tensor x) + output : Tensor(out) + kernel : + func : relu_coo{sparse_coo -> sparse_coo}, + relu_csr{sparse_csr -> sparse_csr} + layout : x + backward : relu_grad + +- api : scale + args : (Tensor x, float scale, float bias, bool bias_after_scale) + output : Tensor(out) + kernel : + func : scale_coo{sparse_coo -> sparse_coo}, + scale_csr{sparse_csr -> sparse_csr} + backward : scale_grad + +- api : sin + args : (Tensor x) + output : Tensor(out) + kernel : + func : sin_coo{sparse_coo -> sparse_coo}, + sin_csr{sparse_csr -> sparse_csr} + layout : x + backward : sin_grad + +- api : sinh + args : (Tensor x) + output : Tensor(out) + kernel : + func : sinh_coo{sparse_coo -> sparse_coo}, + sinh_csr{sparse_csr -> sparse_csr} + layout : x + backward : sinh_grad + +- api : softmax + args : (Tensor x, int axis=-1) + output : Tensor(out) + kernel : + func : softmax_csr{sparse_csr -> sparse_csr} + layout : x + backward : softmax_grad + +- api : sqrt + args : (Tensor x) + output : Tensor(out) + kernel : + func : sqrt_coo{sparse_coo -> sparse_coo}, + sqrt_csr{sparse_csr -> sparse_csr} + layout : x + backward : sqrt_grad + +- api : square + args : (Tensor x) + output : Tensor(out) + kernel : + func : square_coo{sparse_coo -> sparse_coo}, + square_csr{sparse_csr -> sparse_csr} + layout : x + backward : square_grad + +- api : subtract + args : (Tensor x, Tensor y) + output : Tensor(out) + kernel : + func : subtract_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, + subtract_csr_csr{sparse_csr, sparse_csr -> sparse_csr} + layout : x + backward : subtract_grad + +- api : tan + args : (Tensor x) + output : Tensor(out) + kernel : + func : tan_coo{sparse_coo -> sparse_coo}, + tan_csr{sparse_csr -> sparse_csr} + layout : x + backward : tan_grad + +- api : tanh + args : (Tensor x) + output : Tensor(out) + kernel : + func : tanh_coo{sparse_coo -> sparse_coo}, + tanh_csr{sparse_csr -> sparse_csr} + layout : x + backward : tanh_grad + +- api : to_dense + args : (Tensor x) + output : Tensor(out) + invoke : to_dense_impl(x) + +- api : to_sparse_coo + args : (Tensor x, int64_t sparse_dim) + output : Tensor(out) + invoke : to_sparse_coo_impl(x, sparse_dim) + +- api : to_sparse_csr + args : (Tensor x) + output : Tensor(out) + invoke : to_sparse_csr_impl(x) + +- api : values + args : (Tensor x) + output : Tensor(out) + kernel : + func : coo_values{sparse_coo -> dense}, + csr_values{sparse_csr -> dense} + layout : x + backward : values_grad + +- api: coalesce + args : (Tensor x) + output : Tensor(out) + kernel : + func: coalesce{sparse_coo -> sparse_coo} + layout : x + +- api: full_like + args : (Tensor x, Scalar value, DataType dtype=DataType::UNDEFINED) + output : Tensor(out) + kernel : + func : coo_full_like{sparse_coo -> sparse_coo}, + csr_full_like{sparse_csr -> sparse_csr} + layout : x + data_type : dtype + +- api: fused_attention + args : (Tensor 
query, Tensor key, Tensor value, Tensor sparse_mask, Tensor key_padding_mask, Tensor attn_mask) + output : Tensor(out), Tensor(softmax) + kernel : + func : fused_attention_csr{dense, dense, dense, sparse_csr, dense, dense -> dense, sparse_csr} + layout : sparse_mask + data_type: query + optional : key_padding_mask, attn_mask + intermediate : softmax + backward: fused_attention_grad + +- api: masked_matmul + args : (Tensor x, Tensor y, Tensor mask) + output : Tensor(out) + kernel : + func : masked_matmul_csr{dense, dense, sparse_csr -> sparse_csr} + layout : x + backward: masked_matmul_grad + +- api: matmul + args : (Tensor x, Tensor y) + output : Tensor(out) + kernel : + func : matmul_csr_dense {sparse_csr, dense -> dense}, + matmul_csr_csr {sparse_csr, sparse_csr -> sparse_csr}, + matmul_coo_dense {sparse_coo, dense -> dense}, + matmul_coo_coo {sparse_coo, sparse_coo -> sparse_coo} + layout : x + backward: matmul_grad + +- api: maxpool + args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) + output : Tensor(out), Tensor(rulebook) + kernel : + func : sparse_maxpool{sparse_coo -> sparse_coo, dense} + layout : x + intermediate : rulebook + backward : sparse_maxpool_grad + +- api: mv + args : (Tensor x, Tensor vec) + output : Tensor(out) + kernel : + func : mv_coo{sparse_coo, dense -> dense}, + mv_csr{sparse_csr, dense -> dense} + layout : x + backward: mv_grad diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml new file mode 100644 index 0000000000000..68e6020ac3626 --- /dev/null +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -0,0 +1,262 @@ +- backward_api : abs_grad + forward : abs(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : abs_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + abs_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : acos_grad + forward : acos(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : acos_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + acos_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : acosh_grad + forward : acosh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : acosh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + acosh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : add_grad + forward : add(Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + kernel : + func : add_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, + add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} + +- backward_api : asin_grad + forward : asin(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : asin_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + asin_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : asinh_grad + forward : asinh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : asinh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + asinh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : atan_grad + forward : atan(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : atan_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + atan_csr_grad {sparse_csr,
sparse_csr -> sparse_csr} + +- backward_api : atanh_grad + forward : atanh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : atanh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + atanh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : cast_grad + forward : cast(Tensor x, DataType index_dtype, DataType value_dtype) -> Tensor(out) + args : (Tensor x, Tensor out_grad, DataType value_dtype) + output : Tensor(x_grad) + kernel : + func : cast_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + cast_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + data_type : out_grad + +- backward_api : conv3d_grad + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) + output : Tensor(x_grad), Tensor(kernel_grad) + kernel : + func : conv3d_coo_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} + +- backward_api : coo_to_dense_grad + forward : coo_to_dense(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : sparse_coo_to_dense_grad{sparse_coo, dense-> sparse_coo} + +- backward_api : create_sparse_coo_tensor_grad + forward : create_sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out) + args : (Tensor indices, Tensor out_grad) + output : Tensor(values_grad) + kernel : + func : sparse_coo_tensor_grad{dense, sparse_coo -> dense} + +- backward_api : dense_to_coo_grad + forward : dense_to_coo(Tensor x, int64_t sparse_dim) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + invoke : to_dense_impl(out_grad) + +- backward_api : divide_grad + forward : divide(Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + kernel : + func : divide_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, + divide_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} + +- backward_api : divide_scalar_grad + forward : divide_scalar (Tensor x, float scalar) -> Tensor(out) + args : (Tensor out_grad, float scalar) + output : Tensor(x_grad) + invoke : divide_scalar(out_grad, scalar) + +- backward_api : log1p_grad + forward : log1p(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : log1p_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + log1p_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : masked_matmul_grad + forward : masked_matmul(Tensor x, Tensor y, Tensor mask) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + kernel : + func : masked_matmul_csr_grad{dense, dense, sparse_csr -> dense, dense} + +- backward_api : matmul_grad + forward : matmul(Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + kernel : + func : matmul_csr_dense_grad {sparse_csr, dense, dense -> sparse_csr, dense}, + matmul_csr_csr_grad {sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr}, + matmul_coo_dense_grad {sparse_coo, dense, dense -> sparse_coo, dense}, + matmul_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo} + +- 
backward_api : multiply_grad + forward : multiply(Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + kernel : + func : multiply_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, + multiply_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} + +- backward_api : mv_grad + forward : mv(Tensor x, Tensor vec) -> Tensor(out) + args : (Tensor x, Tensor vec, Tensor out_grad) + output : Tensor(x_grad), Tensor(vec_grad) + kernel : + func : mv_coo_grad{sparse_coo, dense, dense -> sparse_coo, dense}, + mv_csr_grad{sparse_csr, dense, dense -> sparse_csr, dense} + +- backward_api : pow_grad + forward : pow(Tensor x, float factor) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float factor) + output : Tensor(x_grad) + kernel : + func : pow_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + pow_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : relu_grad + forward : relu(Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : relu_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + relu_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : scale_grad + forward : scale(Tensor x, float scale, float bias, bool bias_after_scale) -> Tensor(out) + args : (Tensor out_grad, float scale) + output : Tensor(x_grad) + invoke : scale(out_grad, scale, 0.0, true) + +- backward_api : sin_grad + forward : sin(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : sin_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + sin_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : sinh_grad + forward : sinh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : sinh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + sinh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : softmax_grad + forward : softmax(Tensor x, int axis=-1) -> Tensor(out) + args : (Tensor out, Tensor out_grad, int axis) + output : Tensor(x_grad) + kernel : + func : softmax_csr_grad{sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : sparse_maxpool_grad + forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook) + args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) + output : Tensor(x_grad) + kernel : + func : sparse_maxpool_grad {sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo} + +- backward_api : sqrt_grad + forward : sqrt(Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : sqrt_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + sqrt_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : square_grad + forward : square(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : square_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + square_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : subtract_grad + forward : subtract(Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + kernel : + func : subtract_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, + subtract_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} + +- backward_api : 
tan_grad + forward : tan(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : tan_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + tan_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : tanh_grad + forward : tanh(Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : tanh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + tanh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : values_grad + forward : coo_values(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : coo_values_grad{sparse_coo, dense-> sparse_coo} + +- backward_api: fused_attention_grad + forward : fused_attention_csr(Tensor query, Tensor key, Tensor value, Tensor sparse_mask, Tensor key_padding_mask, Tensor attn_mask) -> Tensor(out), Tensor(softmax) + args: (Tensor query, Tensor key, Tensor value, Tensor softmax, Tensor out_grad) + output : Tensor(query_grad), Tensor(key_grad), Tensor(value_grad) + kernel : + func : fused_attention_csr_grad{dense, dense, dense, sparse_csr, dense -> dense, dense, dense} + layout : softmax + data_type: query diff --git a/python/paddle/utils/code_gen/strings_api.yaml b/paddle/phi/api/yaml/strings_api.yaml similarity index 100% rename from python/paddle/utils/code_gen/strings_api.yaml rename to paddle/phi/api/yaml/strings_api.yaml diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index c981b625192da..50367a32b02b8 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -12,6 +12,10 @@ if(WITH_XPU) add_subdirectory(xpu) endif() +if(WITH_MKLDNN) + add_subdirectory(onednn) +endif() + cc_library( phi_context SRCS all_context.cc diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index 57e6f084fd4c9..392df09fcffd8 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -23,7 +23,9 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/xpu/xpu_context.h" +#endif #ifndef PADDLE_WITH_CUSTOM_KERNEL // TODO(wilber): DeviceContextPool nees include fluid file. diff --git a/paddle/phi/backends/c_comm_lib.h b/paddle/phi/backends/c_comm_lib.h new file mode 100644 index 0000000000000..f2987996cbe58 --- /dev/null +++ b/paddle/phi/backends/c_comm_lib.h @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include <vector> + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/macros.h" + +namespace phi { +namespace ccl { +using CCLComm = void*; +using CCLRootId = std::vector<uint8_t>; + +enum CCLReduceOp { SUM = 0, AVG, MAX, MIN, PRODUCT }; +enum CCLDataType { + CCL_DATA_TYPE_FP64 = 0, + CCL_DATA_TYPE_FP32, + CCL_DATA_TYPE_FP16, + CCL_DATA_TYPE_INT64, + CCL_DATA_TYPE_INT32, + CCL_DATA_TYPE_INT16, + CCL_DATA_TYPE_INT8 +}; + +inline CCLDataType ToCCLDataType(paddle::experimental::DataType type) { + if (type == paddle::experimental::DataType::FLOAT64) { + return CCL_DATA_TYPE_FP64; + } else if (type == paddle::experimental::DataType::FLOAT32) { + return CCL_DATA_TYPE_FP32; + } else if (type == paddle::experimental::DataType::FLOAT16) { + return CCL_DATA_TYPE_FP16; + } else if (type == paddle::experimental::DataType::INT64) { + return CCL_DATA_TYPE_INT64; + } else if (type == paddle::experimental::DataType::INT32) { + return CCL_DATA_TYPE_INT32; + } else if (type == paddle::experimental::DataType::INT8) { + return CCL_DATA_TYPE_INT8; + } else { + PADDLE_THROW( + phi::errors::Unimplemented("This datatype in CCL is not supported.")); + } +} + +} // namespace ccl +} // namespace phi diff --git a/paddle/phi/backends/callback_manager.cc b/paddle/phi/backends/callback_manager.cc index 295f70fc65cd7..7ce59880383c7 100644 --- a/paddle/phi/backends/callback_manager.cc +++ b/paddle/phi/backends/callback_manager.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/device_guard.h" namespace phi { @@ -33,12 +34,13 @@ void CallbackManager::AddCallback(std::function<void()> callback) const { (*callback_func)(); }); }); - + phi::DeviceGuard guard(stream_->GetPlace()); phi::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) ->AddCallback(stream_, func); } void CallbackManager::Wait() const { + phi::DeviceGuard guard(stream_->GetPlace()); phi::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) ->SynchronizeStream(stream_); diff --git a/paddle/phi/backends/cpu/cpu_context.cc b/paddle/phi/backends/cpu/cpu_context.cc index 42e19944b210b..d42189e00eeb8 100644 --- a/paddle/phi/backends/cpu/cpu_context.cc +++ b/paddle/phi/backends/cpu/cpu_context.cc @@ -14,8 +14,8 @@ #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen.
@@ -41,7 +41,10 @@ struct CPUContext::Impl { } Eigen::DefaultDevice* GetEigenDevice() const { - PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr."); + PADDLE_ENFORCE_NE( + eigen_device_, + nullptr, + phi::errors::Unavailable("the cpu eigen_device is nullptr.")); return eigen_device_; } @@ -51,10 +54,14 @@ struct CPUContext::Impl { }; CPUContext::CPUContext() - : DeviceContext(), impl_(std::make_unique()) {} + : DeviceContext(), impl_(std::make_unique()) { + impl_->Init(); +} CPUContext::CPUContext(const Place& place) - : DeviceContext(), impl_(std::make_unique(place)) {} + : DeviceContext(), impl_(std::make_unique(place)) { + impl_->Init(); +} CPUContext::~CPUContext() = default; @@ -62,8 +69,6 @@ CPUContext::CPUContext(CPUContext&&) = default; CPUContext& CPUContext::operator=(CPUContext&&) = default; -void CPUContext::Init() { impl_->Init(); } - Eigen::DefaultDevice* CPUContext::eigen_device() const { return impl_->GetEigenDevice(); } diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h index e482fdc9e042f..58548b2e04e02 100644 --- a/paddle/phi/backends/cpu/cpu_context.h +++ b/paddle/phi/backends/cpu/cpu_context.h @@ -34,12 +34,6 @@ class PADDLE_API CPUContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; const Place& GetPlace() const override; - public: - // NOTE: DeviceContext hold resources. Used in training scenarios. - // The interface used by the training scene, DeviceContext will initialize - // all resources and delete them when destructing. - void Init(); - protected: // NOTE: External users manage resources. Used in inference scenarios. // The Set interface is for inference only, DeviceContext will mark the diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt index d8ed6706eba22..ceff429f8e596 100644 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ b/paddle/phi/backends/custom/CMakeLists.txt @@ -11,4 +11,8 @@ if(WITH_CUSTOM_DEVICE) custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context) + cc_test( + capi_test + SRCS capi_test.cc + DEPS phi_capi) endif() diff --git a/paddle/phi/backends/custom/capi_test.cc b/paddle/phi/backends/custom/capi_test.cc new file mode 100644 index 0000000000000..90b01d0e36021 --- /dev/null +++ b/paddle/phi/backends/custom/capi_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include "paddle/phi/capi/all.h" + +#ifndef UNUSED +#define UNUSED __attribute__((unused)) +#endif + +#include "paddle/phi/capi/capi.h" + +TEST(CustomKernel, CAPI) { + std::string str = "capi"; + EXPECT_EQ(str.data(), PD_StringAttr(&str)); + + std::vector int32_vec({1, 2, 3}); + auto int32_list = PD_ListInt32Attr(&int32_vec); + EXPECT_EQ(int32_list.data, int32_vec.data()); + EXPECT_EQ(int32_list.size, int32_vec.size()); + + std::vector int64_vec({1, 2, 3}); + auto int64_list = PD_ListInt64Attr(&int64_vec); + EXPECT_EQ(int64_list.data, int64_vec.data()); + EXPECT_EQ(int64_list.size, int64_vec.size()); + + std::vector float_vec({1, 2, 3}); + auto float_list = PD_ListFloatAttr(&float_vec); + EXPECT_EQ(float_list.data, float_vec.data()); + EXPECT_EQ(float_list.size, float_vec.size()); + + std::vector double_vec({1, 2, 3}); + auto double_list = PD_ListDoubleAttr(&double_vec); + EXPECT_EQ(double_list.data, double_vec.data()); + EXPECT_EQ(double_list.size, double_vec.size()); + + std::vector string_vec{"capi", "api"}; + auto string_list = PD_ListStringAttr(&string_vec); + auto string_data = reinterpret_cast(string_list.data); + for (size_t i = 0; i < string_vec.size(); ++i) { + EXPECT_EQ(string_data[i], string_vec[i].data()); + } + + std::vector bool_vec{true, false, true}; + auto bool_list = PD_ListBoolAttr(&bool_vec); + auto bool_data = reinterpret_cast(bool_list.data); + for (size_t i = 0; i < bool_vec.size(); ++i) { + EXPECT_EQ(bool_data[i], static_cast(bool_vec[i])); + } + + std::vector ptr_vec; + for (size_t i = 0; i < float_vec.size(); ++i) { + ptr_vec.push_back(&float_vec[i]); + } + auto ptr_list = PD_TensorVectorToList(reinterpret_cast(&ptr_vec)); + EXPECT_EQ(ptr_list.data, ptr_vec.data()); + EXPECT_EQ(ptr_list.size, ptr_vec.size()); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 541acd9ecafd0..1a92868dd07db 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -27,6 +27,14 @@ static bool operator==(const C_Device_st& d1, const C_Device_st& d2) { namespace phi { +#define INTERFACE_UNIMPLEMENT \ + PADDLE_THROW(phi::errors::Unimplemented( \ + "%s is not implemented on %s device.", __func__, Type())); +#define CHECK_PTR(x) \ + if (x == nullptr) { \ + INTERFACE_UNIMPLEMENT; \ + } + class CustomDevice : public DeviceInterface { public: CustomDevice(const std::string& type, @@ -561,6 +569,208 @@ class CustomDevice : public DeviceInterface { return version; } + C_DataType ToXCCLDataType(ccl::CCLDataType data_type) { +#define return_result(in, ret) \ + case ccl::CCLDataType::in: \ + return C_DataType::ret + switch (data_type) { + return_result(CCL_DATA_TYPE_FP64, FLOAT64); + return_result(CCL_DATA_TYPE_FP32, FLOAT32); + return_result(CCL_DATA_TYPE_FP16, FLOAT16); + return_result(CCL_DATA_TYPE_INT64, INT64); + return_result(CCL_DATA_TYPE_INT32, INT32); + return_result(CCL_DATA_TYPE_INT16, INT16); + return_result(CCL_DATA_TYPE_INT8, INT8); + default: { + PADDLE_THROW(phi::errors::Unavailable( + "DataType is not supported on %s.", Type())); + return C_DataType::UNDEFINED; + } + } +#undef return_result + } + + C_CCLReduceOp ToXCCLReduceOp(ccl::CCLReduceOp reduce_op) { +#define return_result(in, ret) \ + case ccl::CCLReduceOp::in: \ + return C_CCLReduceOp::ret + switch (reduce_op) { + return_result(SUM, SUM); + return_result(AVG, AVG); + 
return_result(MAX, MAX); + return_result(MIN, MIN); + return_result(PRODUCT, PRODUCT); + default: { + PADDLE_THROW(phi::errors::Unavailable( + "ReduceOp is not supported on %s.", Type())); + } + } +#undef return_result + } + + void CCLGetUniqueId(ccl::CCLRootId* unique_id) override { + CHECK_PTR(pimpl_->xccl_get_unique_id_size); + CHECK_PTR(pimpl_->xccl_get_unique_id); + + C_CCLRootId root_id; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_get_unique_id_size(&(root_id.sz))); + root_id.data = new uint8_t[root_id.sz]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_get_unique_id(&root_id)); + + uint8_t* ptr = reinterpret_cast(root_id.data); + *unique_id = std::vector(ptr, ptr + root_id.sz); + delete[] ptr; + } + + void CCLCommInitRank(size_t nranks, + ccl::CCLRootId* unique_id, + size_t rank, + ccl::CCLComm* comm) override { + CHECK_PTR(pimpl_->xccl_comm_init_rank); + + C_CCLRootId root_id; + root_id.sz = unique_id->size(); + root_id.data = unique_id->data(); + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_comm_init_rank( + nranks, &root_id, rank, reinterpret_cast(comm))); + } + + void CCLDestroyComm(ccl::CCLComm comm) override { + CHECK_PTR(pimpl_->xccl_destroy_comm); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_destroy_comm(reinterpret_cast(comm))); + } + + void CCLAllReduce(void* send_buf, + void* recv_buf, + size_t count, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_all_reduce); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_all_reduce( + send_buf, + recv_buf, + count, + ToXCCLDataType(data_type), + ToXCCLReduceOp(op), + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLBroadcast(void* buf, + size_t count, + ccl::CCLDataType data_type, + size_t root, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_broadcast); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_broadcast( + buf, + count, + ToXCCLDataType(data_type), + root, + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_reduce); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_reduce(in_data, + out_data, + num, + ToXCCLDataType(data_type), + ToXCCLReduceOp(reduce_op), + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLAllGather(void* send_buf, + void* recv_buf, + size_t count, + ccl::CCLDataType data_type, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_all_gather); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_all_gather( + send_buf, + recv_buf, + count, + ToXCCLDataType(data_type), + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLReduceScatter(void* send_buf, + void* recv_buf, + size_t count, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_reduce_scatter); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_reduce_scatter( + send_buf, + recv_buf, + count, + ToXCCLDataType(data_type), + ToXCCLReduceOp(reduce_op), + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLGroupStart() override { + 
CHECK_PTR(pimpl_->xccl_group_start); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_group_start()); + } + + void CCLGroupEnd() override { + CHECK_PTR(pimpl_->xccl_group_end); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_group_end()); + } + + void CCLSend(void* send_buf, + size_t count, + ccl::CCLDataType data_type, + size_t dest_rank, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_send); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_send(send_buf, + count, + ToXCCLDataType(data_type), + dest_rank, + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLRecv(void* recv_buf, + size_t count, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_recv); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_recv(recv_buf, + count, + ToXCCLDataType(data_type), + src_rank, + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + private: inline int PlaceToIdNoCheck(const Place& place) { int dev_id = place.GetDeviceId(); @@ -584,7 +794,7 @@ class CustomDevice : public DeviceInterface { }; bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { -#define CHECK_PTR(ptr, required) \ +#define CHECK_INTERFACE(ptr, required) \ if (params->interface->ptr == nullptr && required) { \ LOG(WARNING) << "CustomRuntime [type: " << params->device_type \ << "] pointer: " << #ptr << " is not set."; \ @@ -604,58 +814,71 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { return false; } - CHECK_PTR(initialize, false); - CHECK_PTR(finalize, false) - - CHECK_PTR(init_device, false); - CHECK_PTR(set_device, true); - CHECK_PTR(get_device, true); - CHECK_PTR(deinit_device, false); - - CHECK_PTR(create_stream, true); - CHECK_PTR(destroy_stream, true); - CHECK_PTR(query_stream, false); - CHECK_PTR(stream_add_callback, false); - - CHECK_PTR(create_event, true); - CHECK_PTR(record_event, true); - CHECK_PTR(destroy_event, true); - CHECK_PTR(query_event, false); - - CHECK_PTR(synchronize_device, false); - CHECK_PTR(synchronize_stream, true); - CHECK_PTR(synchronize_event, true); - CHECK_PTR(stream_wait_event, true); - - CHECK_PTR(device_memory_allocate, true); - CHECK_PTR(device_memory_deallocate, true); - CHECK_PTR(host_memory_allocate, false); - CHECK_PTR(host_memory_deallocate, false); - CHECK_PTR(unified_memory_allocate, false); - CHECK_PTR(unified_memory_deallocate, false); - CHECK_PTR(memory_copy_h2d, true); - CHECK_PTR(memory_copy_d2h, true); - CHECK_PTR(memory_copy_d2d, true); - CHECK_PTR(memory_copy_p2p, false); - CHECK_PTR(async_memory_copy_h2d, false); - CHECK_PTR(async_memory_copy_d2h, false); - CHECK_PTR(async_memory_copy_d2d, false); - CHECK_PTR(async_memory_copy_p2p, false); - - CHECK_PTR(get_device_count, true); - CHECK_PTR(get_device_list, true); - CHECK_PTR(device_memory_stats, true); - - CHECK_PTR(device_min_chunk_size, true); - CHECK_PTR(device_max_chunk_size, false); - CHECK_PTR(device_max_alloc_size, false); - CHECK_PTR(device_extra_padding_size, false); - CHECK_PTR(get_compute_capability, false); - CHECK_PTR(get_runtime_version, false); - CHECK_PTR(get_driver_version, false); - + CHECK_INTERFACE(initialize, false); + CHECK_INTERFACE(finalize, false) + + CHECK_INTERFACE(init_device, false); + CHECK_INTERFACE(set_device, true); + CHECK_INTERFACE(get_device, true); + CHECK_INTERFACE(deinit_device, false); + + CHECK_INTERFACE(create_stream, true); + 
CHECK_INTERFACE(destroy_stream, true); + CHECK_INTERFACE(query_stream, false); + CHECK_INTERFACE(stream_add_callback, false); + + CHECK_INTERFACE(create_event, true); + CHECK_INTERFACE(record_event, true); + CHECK_INTERFACE(destroy_event, true); + CHECK_INTERFACE(query_event, false); + + CHECK_INTERFACE(synchronize_device, false); + CHECK_INTERFACE(synchronize_stream, true); + CHECK_INTERFACE(synchronize_event, true); + CHECK_INTERFACE(stream_wait_event, true); + + CHECK_INTERFACE(device_memory_allocate, true); + CHECK_INTERFACE(device_memory_deallocate, true); + CHECK_INTERFACE(host_memory_allocate, false); + CHECK_INTERFACE(host_memory_deallocate, false); + CHECK_INTERFACE(unified_memory_allocate, false); + CHECK_INTERFACE(unified_memory_deallocate, false); + CHECK_INTERFACE(memory_copy_h2d, true); + CHECK_INTERFACE(memory_copy_d2h, true); + CHECK_INTERFACE(memory_copy_d2d, true); + CHECK_INTERFACE(memory_copy_p2p, false); + CHECK_INTERFACE(async_memory_copy_h2d, false); + CHECK_INTERFACE(async_memory_copy_d2h, false); + CHECK_INTERFACE(async_memory_copy_d2d, false); + CHECK_INTERFACE(async_memory_copy_p2p, false); + + CHECK_INTERFACE(get_device_count, true); + CHECK_INTERFACE(get_device_list, true); + CHECK_INTERFACE(device_memory_stats, true); + + CHECK_INTERFACE(device_min_chunk_size, true); + CHECK_INTERFACE(device_max_chunk_size, false); + CHECK_INTERFACE(device_max_alloc_size, false); + CHECK_INTERFACE(device_extra_padding_size, false); + CHECK_INTERFACE(get_compute_capability, false); + CHECK_INTERFACE(get_runtime_version, false); + CHECK_INTERFACE(get_driver_version, false); + + CHECK_INTERFACE(xccl_get_unique_id, false); + CHECK_INTERFACE(xccl_get_unique_id_size, false); + CHECK_INTERFACE(xccl_comm_init_rank, false); + CHECK_INTERFACE(xccl_destroy_comm, false); + CHECK_INTERFACE(xccl_all_reduce, false); + CHECK_INTERFACE(xccl_broadcast, false); + CHECK_INTERFACE(xccl_reduce, false); + CHECK_INTERFACE(xccl_all_gather, false); + CHECK_INTERFACE(xccl_reduce_scatter, false); + CHECK_INTERFACE(xccl_group_start, false); + CHECK_INTERFACE(xccl_group_end, false); + CHECK_INTERFACE(xccl_send, false); + CHECK_INTERFACE(xccl_recv, false); return true; -#undef CHECK_PTR +#undef CHECK_INTERFACE } typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params); @@ -712,4 +935,6 @@ void LoadCustomRuntimeLib(const std::string& dso_lib_path, void* dso_handle) { LOG(INFO) << "Successed in loading custom runtime in lib: " << dso_lib_path; } +#undef INTERFACE_UNIMPLEMENT + } // namespace phi diff --git a/paddle/phi/backends/custom/custom_device_test.cc b/paddle/phi/backends/custom/custom_device_test.cc index 51fa74b4dc5f3..930750e864883 100644 --- a/paddle/phi/backends/custom/custom_device_test.cc +++ b/paddle/phi/backends/custom/custom_device_test.cc @@ -107,6 +107,7 @@ void TestTensorShareDataWith(const paddle::platform::Place& place) { } void TestTensorUtils(const paddle::platform::Place& place) { + std::cout << "TestTensorUtils on " << place << std::endl; if (paddle::platform::is_custom_place(place) == false) { return; } @@ -166,6 +167,76 @@ void TestTensorUtils(const paddle::platform::Place& place) { #endif } +void TestCustomCCL(const paddle::platform::Place& place) { + std::cout << "TestCustomCCL on " << place << std::endl; + if (paddle::platform::is_custom_place(place) == false) { + return; + } + std::string dev_type = place.GetDeviceType(); + phi::ccl::CCLComm comm; + phi::stream::Stream stream(place, nullptr); + phi::ccl::CCLRootId root_id; + + 
phi::DeviceManager::CCLDestroyComm(dev_type, nullptr); + phi::DeviceManager::CCLGetUniqueId(dev_type, &root_id); + phi::DeviceManager::CCLCommInitRank(dev_type, 0, &root_id, 0, nullptr); + phi::DeviceManager::CCLBroadcast(dev_type, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + 0, + comm, + stream); + phi::DeviceManager::CCLAllReduce(dev_type, + nullptr, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + phi::ccl::CCLReduceOp::SUM, + comm, + stream); + phi::DeviceManager::CCLReduce(dev_type, + nullptr, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + phi::ccl::CCLReduceOp::SUM, + comm, + stream); + phi::DeviceManager::CCLAllGather(dev_type, + nullptr, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + comm, + stream); + phi::DeviceManager::CCLReduceScatter( + dev_type, + nullptr, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + phi::ccl::CCLReduceOp::SUM, + comm, + stream); + phi::DeviceManager::CCLGroupStart(dev_type); + phi::DeviceManager::CCLGroupEnd(dev_type); + phi::DeviceManager::CCLSend(dev_type, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + 0, + comm, + stream); + phi::DeviceManager::CCLRecv(dev_type, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + 0, + comm, + stream); +} + TEST(CustomDevice, Tensor) { InitDevice(); auto dev_types = phi::DeviceManager::GetAllDeviceTypes(); @@ -179,6 +250,7 @@ TEST(CustomDevice, Tensor) { TestTensorMutableData(place); TestTensorShareDataWith(place); TestTensorUtils(place); + TestCustomCCL(place); } } diff --git a/paddle/phi/backends/custom/fake_cpu_device.h b/paddle/phi/backends/custom/fake_cpu_device.h index 22c344a0e0488..41c7acc4469cd 100644 --- a/paddle/phi/backends/custom/fake_cpu_device.h +++ b/paddle/phi/backends/custom/fake_cpu_device.h @@ -136,6 +136,80 @@ C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) { return C_SUCCESS; } +C_Status XcclGetUniqueIdSize(size_t *size) { + *size = sizeof(size_t); + return C_SUCCESS; +} +C_Status XcclGetUniqueId(C_CCLRootId *unique_id) { return C_SUCCESS; } +C_Status XcclCommInitRank(size_t ranks, + C_CCLRootId *unique_id, + size_t rank, + C_CCLComm *comm) { + return C_SUCCESS; +} +C_Status XcclDestroyComm(C_CCLComm comm) { return C_SUCCESS; } +C_Status XcclAllReduce(void *send_buf, + void *recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclBroadcast(void *buf, + size_t count, + C_DataType data_type, + size_t root, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclReduce(void *send_buf, + void *recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclAllGather(void *send_buf, + void *recv_buf, + size_t count, + C_DataType data_type, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclReduceScatter(void *send_buf, + void *recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclGroupStart() { return C_SUCCESS; } +C_Status XcclGroupEnd() { return C_SUCCESS; } +C_Status XcclSend(void *send_buf, + size_t count, + C_DataType data_type, + size_t dest_rank, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclRecv(void *recv_buf, + size_t count, + C_DataType data_type, + size_t src_rank, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} + #define 
DEVICE_TYPE "FakeCPU" #define SUB_DEVICE_TYPE "V100" @@ -190,4 +264,18 @@ void InitFakeCPUDevice(CustomRuntimeParams *params) { params->interface->device_max_chunk_size = DeviceMaxChunkSize; params->interface->device_min_chunk_size = DeviceMinChunkSize; params->interface->device_max_alloc_size = DeviceMaxAllocSize; + + params->interface->xccl_get_unique_id_size = XcclGetUniqueIdSize; + params->interface->xccl_get_unique_id = XcclGetUniqueId; + params->interface->xccl_all_reduce = XcclAllReduce; + params->interface->xccl_all_gather = XcclAllGather; + params->interface->xccl_broadcast = XcclBroadcast; + params->interface->xccl_comm_init_rank = XcclCommInitRank; + params->interface->xccl_destroy_comm = XcclDestroyComm; + params->interface->xccl_group_end = XcclGroupEnd; + params->interface->xccl_group_start = XcclGroupStart; + params->interface->xccl_reduce = XcclReduce; + params->interface->xccl_reduce_scatter = XcclReduceScatter; + params->interface->xccl_send = XcclSend; + params->interface->xccl_recv = XcclRecv; } diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index e57653702c538..4b82f4a340ebb 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -270,4 +270,91 @@ size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) { return 0; } +void DeviceInterface::CCLDestroyComm(ccl::CCLComm ccl_comm) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLCommInitRank(size_t num_ranks, + ccl::CCLRootId* root_id, + size_t rank_id, + ccl::CCLComm* ccl_comm) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLGetUniqueId(ccl::CCLRootId* root_id) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLBroadcast(void* data, + size_t num, + ccl::CCLDataType data_type, + size_t root, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLAllReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLAllGather(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLReduceScatter(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLGroupStart() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::CCLGroupEnd() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::CCLSend(void* sendbuf, + size_t num, + ccl::CCLDataType data_type, + size_t dst_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLRecv(void* recvbuf, + size_t num, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +#undef INTERFACE_UNIMPLEMENT + } // namespace phi diff --git a/paddle/phi/backends/device_base.h b/paddle/phi/backends/device_base.h index 8cc6e498068fa..84249261d1962 100644 --- 
a/paddle/phi/backends/device_base.h +++ b/paddle/phi/backends/device_base.h @@ -16,6 +16,7 @@ #ifdef PADDLE_WITH_CUSTOM_DEVICE #include +#include "paddle/phi/backends/c_comm_lib.h" #include "paddle/phi/backends/event.h" #include "paddle/phi/backends/stream.h" @@ -165,6 +166,65 @@ class DeviceInterface { // Driver / Runtime virtual size_t GetExtraPaddingSize(size_t dev_id); + // CCL + virtual void CCLDestroyComm(ccl::CCLComm ccl_comm); + + virtual void CCLCommInitRank(size_t num_ranks, + ccl::CCLRootId* root_id, + size_t rank_id, + ccl::CCLComm* ccl_comm); + + virtual void CCLGetUniqueId(ccl::CCLRootId* root_id); + + virtual void CCLBroadcast(void* data, + size_t num, + ccl::CCLDataType data_type, + size_t root, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + + virtual void CCLAllReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLAllGather(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLReduceScatter(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLGroupStart(); + virtual void CCLGroupEnd(); + virtual void CCLSend(void* sendbuf, + size_t num, + ccl::CCLDataType data_type, + size_t dst_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLRecv(void* recvbuf, + size_t num, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + private: const std::string type_; const uint8_t priority_; diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index 77c9ee61858c1..a4dc9176e1b1e 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -74,6 +74,15 @@ typedef void (*C_Callback)(C_Device device, void* user_data, C_Status* status); +typedef struct { + size_t sz; + void* data; +} C_CCLRootId; + +typedef struct C_CCLComm_st* C_CCLComm; + +typedef enum { SUM = 0, AVG, MAX, MIN, PRODUCT } C_CCLReduceOp; + struct C_DeviceInterface { // Core fill it and plugin must to check it size_t size; @@ -526,6 +535,102 @@ struct C_DeviceInterface { void* reserved_info_api[8]; + ////////////// + // ccl api // + ////////////// + + /** + * @brief Get size of unique id + * + * @param[size_t*] size + */ + C_Status (*xccl_get_unique_id_size)(size_t* size); + + /** + * @brief Get unique id + * + * @param[C_CCLRootId*] unique_id + */ + C_Status (*xccl_get_unique_id)(C_CCLRootId* unique_id); + + /** + * @brief Initialize communicator + * + * @param[size_t] ranks + * @param[C_CCLRootId*] unique_id + * @param[size_t] rank + * @param[C_CCLComm*] comm + */ + C_Status (*xccl_comm_init_rank)(size_t ranks, + C_CCLRootId* unique_id, + size_t rank, + C_CCLComm* comm); + + /** + * @brief Destroy communicator + * + * @param[C_CCLComm] comm + */ + C_Status (*xccl_destroy_comm)(C_CCLComm comm); + + C_Status (*xccl_all_reduce)(void* send_buf, + void* recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_broadcast)(void* 
buf, + size_t count, + C_DataType data_type, + size_t root, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_reduce)(void* send_buf, + void* recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_all_gather)(void* send_buf, + void* recv_buf, + size_t count, + C_DataType data_type, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_reduce_scatter)(void* send_buf, + void* recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_group_start)(); + + C_Status (*xccl_group_end)(); + + C_Status (*xccl_send)(void* send_buf, + size_t count, + C_DataType data_type, + size_t dest_rank, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_recv)(void* recv_buf, + size_t count, + C_DataType data_type, + size_t src_rank, + C_CCLComm comm, + C_Stream stream); + + void* reserved_ccl_api[8]; + /////////////// // other api // /////////////// diff --git a/paddle/phi/backends/device_guard.h b/paddle/phi/backends/device_guard.h index eb14236d251b3..668951f8a1c98 100644 --- a/paddle/phi/backends/device_guard.h +++ b/paddle/phi/backends/device_guard.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE + #include "paddle/phi/backends/device_manager.h" namespace phi { @@ -44,3 +46,5 @@ class DeviceGuard { }; } // namespace phi + +#endif diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 35339aed0f3e1..405a87f7496a8 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -24,6 +24,9 @@ #include #include +#include "glog/logging.h" +#include "paddle/utils/string/split.h" + namespace phi { void Device::CreateStream(stream::Stream* stream, @@ -388,14 +391,149 @@ std::vector DeviceManager::GetDeviceList( return dev_impl->GetDeviceList(); } +std::vector DeviceManager::GetSelectedDeviceList( + const std::string& device_type) { + std::vector devices; + std::string FLAGS = "FLAGS_selected_" + device_type + "s"; + auto FLAGS_selected_devices = getenv(FLAGS.c_str()); + if (FLAGS_selected_devices) { + auto devices_str = paddle::string::Split(FLAGS_selected_devices, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = DeviceManager::GetDeviceCount(device_type); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void DeviceManager::CCLDestroyComm(const std::string& device_type, + ccl::CCLComm ccl_comm) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLDestroyComm(ccl_comm); +} + +void DeviceManager::CCLCommInitRank(const std::string& device_type, + size_t num_ranks, + ccl::CCLRootId* root_id, + size_t rank_id, + ccl::CCLComm* ccl_comm) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLCommInitRank(num_ranks, root_id, rank_id, ccl_comm); +} + +void DeviceManager::CCLGetUniqueId(const std::string& device_type, + ccl::CCLRootId* root_id) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLGetUniqueId(root_id); +} + +void DeviceManager::CCLBroadcast(const std::string& device_type, + void* data, + size_t num, + ccl::CCLDataType data_type, + size_t root_id, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLBroadcast(data, num, data_type, root_id, ccl_comm, stream); +} + +void 
DeviceManager::CCLAllReduce(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLAllReduce( + in_data, out_data, num, data_type, reduce_op, ccl_comm, stream); +} + +void DeviceManager::CCLReduce(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLReduce( + in_data, out_data, num, data_type, reduce_op, ccl_comm, stream); +} + +void DeviceManager::CCLAllGather(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLAllGather(in_data, out_data, num, data_type, ccl_comm, stream); +} + +void DeviceManager::CCLReduceScatter(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLReduceScatter( + in_data, out_data, num, data_type, op, ccl_comm, stream); +} + +void DeviceManager::CCLGroupStart(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLGroupStart(); +} + +void DeviceManager::CCLGroupEnd(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLGroupEnd(); +} + +void DeviceManager::CCLSend(const std::string& device_type, + void* sendbuf, + size_t num, + ccl::CCLDataType data_type, + size_t dst_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLSend(sendbuf, num, data_type, dst_rank, ccl_comm, stream); +} + +void DeviceManager::CCLRecv(const std::string& device_type, + void* recvbuf, + size_t num, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLRecv(recvbuf, num, data_type, src_rank, ccl_comm, stream); +} + DeviceManager& DeviceManager::Instance() { static DeviceManager platform_manager; return platform_manager; } void DeviceManager::Clear() { - Instance().device_map_.clear(); - Instance().device_impl_map_.clear(); + // TODO(wangran16): fix coredump when using npu plugin + + // Instance().device_map_.clear(); + // Instance().device_impl_map_.clear(); } std::vector ListAllLibraries(const std::string& library_dir) { diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 56d99ba43bdd1..4ad7643c33d3c 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -17,6 +17,7 @@ #include +#include "paddle/phi/backends/c_comm_lib.h" #include "paddle/phi/backends/device_base.h" #include "paddle/phi/backends/device_ext.h" #include "paddle/phi/backends/dynload/port.h" @@ -159,6 +160,74 @@ class DeviceManager { static std::vector GetDeviceList(const std::string& device_type); + static std::vector GetSelectedDeviceList( + const std::string& 
device_type); + + // CCL + static void CCLDestroyComm(const std::string& device_type, + ccl::CCLComm ccl_comm); + static void CCLCommInitRank(const std::string& device_type, + size_t num_ranks, + ccl::CCLRootId* root_id, + size_t rank_id, + ccl::CCLComm* ccl_comm); + static void CCLGetUniqueId(const std::string& device_type, + ccl::CCLRootId* root_id); + static void CCLBroadcast(const std::string& device_type, + void* data, + size_t num, + ccl::CCLDataType data_type, + size_t root, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLAllReduce(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLReduce(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLAllGather(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLReduceScatter(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLGroupStart(const std::string& device_type); + static void CCLGroupEnd(const std::string& device_type); + static void CCLSend(const std::string& device_type, + void* sendbuf, + size_t num, + ccl::CCLDataType data_type, + size_t dst_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLRecv(const std::string& device_type, + void* recvbuf, + size_t num, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void Clear(); private: diff --git a/paddle/phi/backends/dynload/cusparse.cc b/paddle/phi/backends/dynload/cusparse.cc index 013211064b8e4..ce8f87dc3cdfa 100644 --- a/paddle/phi/backends/dynload/cusparse.cc +++ b/paddle/phi/backends/dynload/cusparse.cc @@ -30,5 +30,9 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); #endif +#ifdef CUSPARSE_ROUTINE_EACH_R3 +CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/dynload/port.cc b/paddle/phi/backends/dynload/port.cc index 5988417654890..d1b3da64c8570 100644 --- a/paddle/phi/backends/dynload/port.cc +++ b/paddle/phi/backends/dynload/port.cc @@ -18,6 +18,8 @@ #include #include +#include "glog/logging.h" + #if !defined(_WIN32) #include // dladdr #include diff --git a/paddle/phi/backends/dynload/port.h b/paddle/phi/backends/dynload/port.h index ed48553accb74..03a2863e4dc4e 100644 --- a/paddle/phi/backends/dynload/port.h +++ b/paddle/phi/backends/dynload/port.h @@ -17,7 +17,6 @@ #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" #if !defined(_WIN32) #include // dladdr diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc index 43077d280f360..b594d919abc18 100644 --- a/paddle/phi/backends/event.cc +++ b/paddle/phi/backends/event.cc @@ -35,7 +35,11 @@ Event::~Event() { Destroy(); } bool Event::Init(const Place& place, Flag flags) { place_ = place; - DeviceGuard guard(place_); + device_ = phi::DeviceManager::GetDeviceWithPlace(place); + + 
// note(wangran16): bind device to the current thread. fix npu plugin null + // context bug. + phi::DeviceManager::SetDevice(place_); device_->CreateEvent(this, flags); VLOG(3) << "Init Event: " << event_ << ", place: " << place_ << ", flag:" << static_cast(flags); @@ -45,7 +49,7 @@ bool Event::Init(const Place& place, Flag flags) { void Event::Destroy() { if (own_data_) { - DeviceGuard guard(place_); + phi::DeviceManager::SetDevice(place_); device_->DestroyEvent(this); own_data_ = false; } diff --git a/paddle/phi/backends/event.h b/paddle/phi/backends/event.h index 0866adcf39afa..8de223528f8fd 100644 --- a/paddle/phi/backends/event.h +++ b/paddle/phi/backends/event.h @@ -36,6 +36,7 @@ class Event { Interprocess = 0x4, }; + Event() = default; // For compatible Event(const Place& place, event_t event); ~Event(); diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc index cc655f1822998..a2399554ba853 100644 --- a/paddle/phi/backends/gpu/gpu_info.cc +++ b/paddle/phi/backends/gpu/gpu_info.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_info.h" +#include #include #include "gflags/gflags.h" diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 04b2786c4d0fb..552f60783c8b2 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -30,6 +30,7 @@ #include #include +#include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/backends/onednn/CMakeLists.txt b/paddle/phi/backends/onednn/CMakeLists.txt new file mode 100644 index 0000000000000..a65d6b002f5f7 --- /dev/null +++ b/paddle/phi/backends/onednn/CMakeLists.txt @@ -0,0 +1,6 @@ +if(WITH_MKLDNN) + cc_library( + onednn_context + SRCS onednn_context.cc + DEPS cpu_context mkldnn) +endif() diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc new file mode 100644 index 0000000000000..950483a469ed8 --- /dev/null +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -0,0 +1,326 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/phi/backends/onednn/onednn_context.h" + +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" + +#include "paddle/fluid/framework/expect.h" +#include "paddle/fluid/platform/device_context.h" + +namespace phi { + +OneDNNContextThreadLocals::Body::Body() + : cur_engine(dnnl::engine::kind::cpu, 0), cur_stream(cur_engine) { + cur_mkldnn_session_id = kMKLDNNSessionID_Default; + cur_input_shape_str = ""; + cur_input_shape_cache_capacity = 1; + cur_paddle_data_layout = DataLayout::kNCHW; +} + +// When Thread finish we clear oneDNN cache +// This is needed when we have one executor used by many threads +// e.g. test_analyzer_detect. 
Thread ID is not part of caching key +// (for naive executor) so we need to clear cache when one thread finish +// and other is to start inference +// TODO(jczaja): Ideally it would be good to clear only part of cache +// related to thread that is to be terminated +OneDNNContextThreadLocals::Body::~Body() { + auto cpu_place = phi::CPUPlace(); + // TODO(YuanRisheng): we need remove the dependency on fluid device context + // here + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + OneDNNContext* dev_ctx = static_cast(pool.Get(cpu_place)); + dev_ctx->ResetBlobMap(exec_ptr_); +} + +void OneDNNContextThreadLocals::Body::set_cur_mkldnn_session_id(size_t sid) { + cur_mkldnn_session_id = sid; +} +size_t OneDNNContextThreadLocals::Body::get_cur_mkldnn_session_id(void) { + return cur_mkldnn_session_id; +} + +void OneDNNContextThreadLocals::Body::set_cur_input_shape_str( + std::string input_shape_str) { + cur_input_shape_str = input_shape_str; +} +void OneDNNContextThreadLocals::Body::set_cur_input_shape_cache_capacity( + int input_shape_cache_capacity) { + cur_input_shape_cache_capacity = input_shape_cache_capacity; +} + +void OneDNNContextThreadLocals::Body::set_cur_paddle_data_layout( + DataLayout dl) { + cur_paddle_data_layout = dl; +} + +DataLayout OneDNNContextThreadLocals::Body::get_cur_paddle_data_layout(void) { + return cur_paddle_data_layout; +} + +void OneDNNContextThreadLocals::Body::log_lib_version(void) { + if (!said_once) { + said_once = true; + auto dv = dnnl::version(); + LOG(INFO) << "oneDNN v" << dv->major << "." << dv->minor << "." + << dv->patch; + } +} + +struct OneDNNContext::Impl { + Impl() : p_blobmap_() { + p_blobmap_.reset(new BlobMap()); + p_exec_items_.reset(new ExecShape()); + p_mutex_.reset(new std::mutex()); + } + + ~Impl() {} + + void ResetBlobMap(void* ptr) { + VLOG(4) << OneDNNContext::tls().get_curr_exec() << " " << ptr; + std::lock_guard lock(*p_mutex_); + if (block_next_cache_clearing_ == 0) { + VLOG(3) << "Clearing DNNL cache."; + // If no specific executor pointer then clear + // everything. For executor pointer then clear only + // objects allocated when using given executor + if (ptr == nullptr) { + p_blobmap_->clear(); + } else { + // Iterate through all shapes and release + // for each shape and active executor all entries + // of this executor + for (auto& s : *p_exec_items_) { + for (auto& v : (*s.second)[ptr]) { + (v.first)->erase(v.second); + } + s.second->erase(ptr); + } + } + // Reset paddle layout to NCHW + VLOG(3) << "Resetting Paddle data layout to NCHW."; + OneDNNContext::tls().set_cur_paddle_data_layout(DataLayout::kNCHW); + } else { + --block_next_cache_clearing_; + VLOG(3) << "Prevented Clearing DNNL cache. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; + PADDLE_ENFORCE_GE(block_next_cache_clearing_, + 0, + phi::errors::InvalidArgument( + "Cache clearing mark should be non-negative " + ". 
But received %d.", + block_next_cache_clearing_)); + } + } + + // Register object to currently used executor's map + void LinkEntryWithExecutor(BlobPtr_t pblob, + KeyBlob::iterator it) const { + // Take current input shape from TLS + // Take current executor addess from TLS + // and for this executor's items add the one defined with arguments + auto key_it = + p_exec_items_ + ->insert(std::make_pair(OneDNNContext::tls().cur_input_shape_str, + std::make_shared())) + .first; + (*key_it->second)[OneDNNContext::tls().get_curr_exec()].push_back( + std::make_pair(pblob, it)); + + VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size() + << " curr exec size: " + << (*key_it->second)[OneDNNContext::tls().get_curr_exec()].size() + << "\n"; + } + + void RemoveShapeEntriesWithExecutor() const { + p_exec_items_->erase(p_exec_items_->begin()); + } + + void BlockNextCacheClearing() { + std::lock_guard lock(*p_mutex_); + ++block_next_cache_clearing_; + VLOG(3) << "Next DNNL cache clearing has been blocked. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; + } + + size_t GetShapeBlobSize() const { + std::lock_guard lock(*p_mutex_); + BlobMap* pMap = p_blobmap_.get(); + auto map_it = pMap->find(OneDNNContext::tls().cur_mkldnn_session_id); + if (map_it == pMap->end()) { + PADDLE_THROW(phi::errors::NotFound( + "OneDNNContext don't find cur_mkldnn_session_id: %d.", + OneDNNContext::tls().cur_mkldnn_session_id)); + } + return map_it->second->size(); + } + + void SetBlob(const std::string& name, BlobPtr_t data) const { + BlobMap* pMap = p_blobmap_.get(); + BlobPtr_t sBlob = nullptr; + BlobPtr_t pBlob = nullptr; + + int sid = OneDNNContext::tls().get_cur_mkldnn_session_id(); + + std::lock_guard lock(*p_mutex_); + + // Find ShapeBlob for current mkldnn session id. 
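Before the shape and key lookups below, a condensed standalone model of the three-level cache that SetBlob and GetBlob traverse (a sketch, not part of the patch): session id first, then the input-shape string, then the blob name. The real code uses unordered maps held in p_blobmap_ and takes p_mutex_ while touching them.

#include <map>
#include <memory>
#include <string>

// Simplified mirror of the BlobMap -> ShapeBlob -> KeyBlob nesting.
using Blob = std::shared_ptr<void>;
using KeyBlob = std::map<std::string, Blob>;                         // blob name -> blob
using ShapeBlob = std::map<std::string, std::shared_ptr<KeyBlob>>;   // shape string -> KeyBlob
using BlobMap = std::map<int, std::shared_ptr<ShapeBlob>>;           // session id -> ShapeBlob

// Returns nullptr on a miss at any level, like GetBlob further down.
Blob Lookup(const BlobMap& cache,
            int sid,
            const std::string& shape_str,
            const std::string& name) {
  auto sit = cache.find(sid);
  if (sit == cache.end()) return nullptr;
  auto kit = sit->second->find(shape_str);
  if (kit == sit->second->end()) return nullptr;
  auto bit = kit->second->find(name);
  return bit == kit->second->end() ? nullptr : bit->second;
}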
+ auto map_it = pMap->find(sid); + + if (map_it == pMap->end()) { + // 1st time to set blob in current thread + sBlob = std::make_shared(); + (*pMap)[sid] = sBlob; + VLOG(2) << "SetBlob: sid=" << sid << ", add new sid\n"; + } else { + sBlob = map_it->second; + } + + // Find KeyBlob for current input shape + auto key_it = sBlob->find(OneDNNContext::tls().cur_input_shape_str); + + if (key_it == sBlob->end()) { + // In cache clearing mode, cur_input_shape_cache_capacity defines + // max pblob capacity + if ((static_cast(sid) == + OneDNNContextThreadLocals::kMKLDNNSessionID_CacheClearing) && + sBlob->size() && + (sBlob->size() >= + static_cast( + OneDNNContext::tls().cur_input_shape_cache_capacity))) { + VLOG(2) << "sid=" << sid + << ", remove all blobs of shape: " << sBlob->begin()->first; + sBlob->erase(sBlob->begin()->first); + RemoveShapeEntriesWithExecutor(); + } + pBlob = std::make_shared(); + (*sBlob)[OneDNNContext::tls().cur_input_shape_str] = pBlob; + } else { + pBlob = key_it->second; + } + + // Find Blob via name + auto blob_it = pBlob->find(name); + if (blob_it == pBlob->end()) { + auto el = + pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; + // Register new element in per executor map + // to have easily erased when executor terminated + LinkEntryWithExecutor(pBlob, el.first); + } else { + blob_it->second = data; // set data to existing blob + } + VLOG(2) << "SetBlob: sid=" << sid << ", add blob=" << name << "\n"; + // lock will be automatically released when out of scope + return; + } + + unsigned int GetCachedObjectsNumber(void) const { + unsigned int num_entries = 0; + for (auto const& l3 : *p_blobmap_) { + for (auto const& l2 : *(l3.second)) { + num_entries += (l2.second)->size(); + } + } + return num_entries; + } + + OneDNNContext::BlobPtr_t GetBlob(const std::string& name) const { + BlobMap* pMap = p_blobmap_.get(); + BlobPtr_t sBlob = nullptr; + BlobPtr_t pBlob = nullptr; + + int sid = OneDNNContext::tls().get_cur_mkldnn_session_id(); + + std::lock_guard lock(*p_mutex_); + + // Find ShapeBlob for current mkldnn session id firstly + auto map_it = pMap->find(sid); + // (jczaja): After first iteration of model's execution we + // should have all elements cached (mostly) so failures are unlikely (less + // likely for dynamic shapes) + if (unlikely(map_it == pMap->end())) { + VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n"; + return nullptr; + } + sBlob = map_it->second; + + // Find KeyBlob for current input shape secondly + auto sBlob_it = sBlob->find(OneDNNContext::tls().cur_input_shape_str); + if (unlikely(sBlob_it == sBlob->end())) { + VLOG(2) << "GetBlob: sid=" << OneDNNContext::tls().cur_input_shape_str + << ", miss input_shape_str\n"; + return nullptr; + } + pBlob = sBlob_it->second; + + // Find Blob via name + auto key_it = pBlob->find(name); + + if (unlikely(key_it == pBlob->end())) { + VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n"; + return nullptr; + } + + VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n"; + // lock will be automatically released when out of scope + return key_it->second; + } + + std::shared_ptr p_blobmap_; + // Map key is pointer of executor and value is a data(iterator in map) needed + // to erase + std::shared_ptr p_exec_items_; + std::shared_ptr p_mutex_; + // 0 - clearing is allowed. x > 0 do not clear. 
+ unsigned int block_next_cache_clearing_ = 0; +}; + +OneDNNContext::OneDNNContext(const Place& place) + : CPUContext(place), impl_(std::make_unique()) {} + +OneDNNContext::~OneDNNContext() = default; + +void OneDNNContext::ResetBlobMap(void* ptr) { impl_->ResetBlobMap(ptr); } + +void OneDNNContext::BlockNextCacheClearing() { + impl_->BlockNextCacheClearing(); +} + +size_t OneDNNContext::GetShapeBlobSize() const { + return impl_->GetShapeBlobSize(); +} + +void OneDNNContext::SetBlob(const std::string& name, + BlobPtr_t data) const { + impl_->SetBlob(name, data); +} + +unsigned int OneDNNContext::GetCachedObjectsNumber(void) const { + return impl_->GetCachedObjectsNumber(); +} + +OneDNNContext::BlobPtr_t OneDNNContext::GetBlob( + const std::string& name) const { + return impl_->GetBlob(name); +} + +} // namespace phi +#endif diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h new file mode 100644 index 0000000000000..d7cf8a0ff4902 --- /dev/null +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MKLDNN +#include +#include // NOLINT +#include "dnnl.hpp" // NOLINT +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/common/place.h" + +namespace phi { + +class OneDNNContextThreadLocals { + // default mkldnn session id + + typedef OneDNNContextThreadLocals self; + struct Body { + bool said_once = false; + size_t cur_mkldnn_session_id; + // Current data input shape string. + // - For fixed-shape, it's a null string in default. + // - For dynamic-shape, it's user specific. + std::string cur_input_shape_str; + // the cache capacity of different input shapes for MKLDNN. + // Default 1 means fixed input shape, not dynamic shape. + int cur_input_shape_cache_capacity; + // Recently registered data_format. 
This is needed to + // know for converting MKL-DNN Tensor to non MKL-DNN + DataLayout cur_paddle_data_layout; + // MKL-DNN stream used for execution of primitives (per-thread) + dnnl::engine cur_engine; + dnnl::stream cur_stream; + std::string key_suffix; // Key identifying current Executor + bool key_attach_thread_id = true; + void* exec_ptr_ = nullptr; + + Body(); + ~Body(); + void set_cur_mkldnn_session_id(size_t sid); + size_t get_cur_mkldnn_session_id(void); + void set_cur_input_shape_str(std::string input_shape_str); + void set_cur_input_shape_cache_capacity(int input_shape_cache_capacity); + void set_cur_paddle_data_layout(DataLayout dl); + DataLayout get_cur_paddle_data_layout(void); + void log_lib_version(void); + const dnnl::engine& get_engine(void) { return cur_engine; } + dnnl::stream& get_stream(void) { return cur_stream; } + void set_key_suffix(const std::string& suffix) { key_suffix = suffix; } + const std::string& get_key_suffix(void) const { return key_suffix; } + void disable_tid_in_key(void) { key_attach_thread_id = false; } + bool is_tid_used_in_key(void) const { return key_attach_thread_id; } + void set_curr_exec(void* exec_ptr) { exec_ptr_ = exec_ptr; } + void* get_curr_exec(void) const { return exec_ptr_; } + }; + OneDNNContextThreadLocals() = default; + OneDNNContextThreadLocals(const OneDNNContextThreadLocals& c) = delete; + + public: + // default mkldnn session id + static constexpr size_t kMKLDNNSessionID_Default = 0; + // mkldnn session id for cache clearing mode + static constexpr size_t kMKLDNNSessionID_CacheClearing = -1; + static Body& fetch() { + thread_local Body b; + return b; + } +}; + +class OneDNNContext : public CPUContext { + public: + template + using BlobPtr_t = std::shared_ptr; + template + using umap_value_smart_t = std::unordered_map>; + template + using umap_key_string_t = umap_value_smart_t; + + // Following three maps are used to cache MKLDNN primitives. + // There relations are: + // - BlobMap = Map + // - ShapeBlob = Map + // - KeyBlob = Map + + using KeyBlob = umap_key_string_t; + using ShapeBlob = umap_key_string_t; + using BlobMap = umap_value_smart_t; + + // Auxillary two-level structure (shape, executor) to easier control + // clearing cache objects related to specific executor + + using ExecKey = void*; + using ExecMapCacheIterPair = std::pair, KeyBlob::iterator>; + using ExecMap = + std::unordered_map>; + using ExecShape = std::unordered_map>; + + explicit OneDNNContext(const Place& place); + ~OneDNNContext(); + /* \brief Get the active engine */ + const dnnl::engine& GetEngine() const { return tls().get_engine(); } + + // Remove all entries from the blob map + void ResetBlobMap(void* ptr); + + // Prevent next ResetBlobMap() + void BlockNextCacheClearing(); + + // Get the ShapeBlob size in cur_mkldnn_session_id. + size_t GetShapeBlobSize() const; + + // Set data to blob (i.e. name/data pair). Create blob if not existing + void SetBlob(const std::string& name, std::shared_ptr data) const; + + // Calculate number of oneDNN objects cached + unsigned int GetCachedObjectsNumber(void) const; + + // Find a saved blob. 
Return nullptr if not found + std::shared_ptr GetBlob(const std::string& name) const; + + static auto tls() -> decltype(OneDNNContextThreadLocals::fetch()) { + return OneDNNContextThreadLocals::fetch(); + } + + private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace phi +#endif diff --git a/paddle/phi/backends/stream.cc b/paddle/phi/backends/stream.cc index f8b15bdbd9e63..bad57c5238ec8 100644 --- a/paddle/phi/backends/stream.cc +++ b/paddle/phi/backends/stream.cc @@ -40,7 +40,10 @@ bool Stream::Init(const Place& place, const Flag& flag) { place_ = place; device_ = phi::DeviceManager::GetDeviceWithPlace(place); - DeviceGuard guard(place_); + + // note(wangran16): bind device to the current thread. fix npu plugin null + // context bug. + phi::DeviceManager::SetDevice(place_); device_->CreateStream(this, priority, flag); callback_manager_.reset(new CallbackManager(this)); @@ -80,7 +83,7 @@ void Stream::WaitCallback() const { callback_manager_->Wait(); } void Stream::Destroy() { if (own_data_) { - DeviceGuard guard(place_); + phi::DeviceManager::SetDevice(place_); device_->DestroyStream(this); own_data_ = false; } diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index dbff88c0a2709..fe0dda2d3dbeb 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -45,8 +45,8 @@ struct XPUContext::Impl { } if (l3ptrs[place_.GetDeviceId()] != nullptr) { context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size); - VLOG(3) << "xpu place " << place_.GetDeviceId() << " set l3 size " - << l3_size; + VLOG(3) << "xpu place " << static_cast(place_.GetDeviceId()) + << " set l3 size " << l3_size; } break; } @@ -66,6 +66,8 @@ struct XPUContext::Impl { const Place& GetPlace() const { return place_; } + void SetStream(XPUStream stream) { context_->xpu_stream = stream; } + xpu::Context* GetXContext() const { PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); return context_; @@ -115,6 +117,8 @@ XPUContext::~XPUContext() = default; const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); } +void XPUContext::SetXPUStream(XPUStream stream) { impl_->SetStream(stream); } + backends::xpu::XPUVersion XPUContext::xpu_version() const { return impl_->xpu_version_; } diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index d39b3c9cc1ff7..d20a1ad4e1e48 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -61,6 +61,8 @@ class XPUContext : public DeviceContext { void SetL3Cache(int l3_size = 14155776); + void SetXPUStream(XPUStream stream); + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/capi/include/c_kernel_context.h b/paddle/phi/capi/include/c_kernel_context.h index c06cb3cd30086..a5524e3aee278 100644 --- a/paddle/phi/capi/include/c_kernel_context.h +++ b/paddle/phi/capi/include/c_kernel_context.h @@ -87,6 +87,26 @@ PD_List PD_KernelContextListScalarAttrAt(PD_KernelContext *ctx, size_t index); PD_Place *PD_KernelContextPlaceAttrAt(PD_KernelContext *ctx, size_t index); +const char *PD_StringAttr(void *attr); + +PD_DataType PD_DatatTypeAttr(void *attr); + +PD_DataLayout PD_DatatLayoutAttr(void *attr); + +PD_List PD_ListInt32Attr(void *attr); + +PD_List PD_ListInt64Attr(void *attr); + +PD_List PD_ListFloatAttr(void *attr); + +PD_List PD_ListDoubleAttr(void *attr); + +PD_List PD_ListScalarAttr(void *attr); + +PD_List PD_ListStringAttr(void *attr); + +PD_List PD_ListBoolAttr(void *attr); + #ifdef 
__cplusplus } // extern "C" #endif diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h index 494346713cf53..2bebee977740b 100644 --- a/paddle/phi/capi/include/c_tensor.h +++ b/paddle/phi/capi/include/c_tensor.h @@ -24,7 +24,7 @@ extern "C" { typedef struct PD_Tensor PD_Tensor; -PD_DataType PD_TensorGetDataType(const PD_Tensor *tensor, PD_Status *status); +PD_DataType PD_TensorGetPDDataType(const PD_Tensor *tensor, PD_Status *status); PD_DataLayout PD_TensorGetDataLayout(const PD_Tensor *tensor, PD_Status *status); @@ -82,6 +82,10 @@ void PD_TensorShareLoDWith(PD_Tensor *dst, const PD_Tensor *src, PD_Status *status); +PD_Tensor *PD_OptionalTensorGetPointer(PD_Tensor *tensor); + +PD_List PD_TensorVectorToList(PD_Tensor *tensor); + #ifdef __cplusplus } // extern "C" #endif diff --git a/paddle/phi/capi/include/kernel_registry.h b/paddle/phi/capi/include/kernel_registry.h index 37b045a60658b..47ddc0bf5be7e 100644 --- a/paddle/phi/capi/include/kernel_registry.h +++ b/paddle/phi/capi/include/kernel_registry.h @@ -19,7 +19,129 @@ namespace phi { namespace capi { +// eager mode +inline std::vector PD_TensorVector(PD_Tensor *tensor) { + std::vector ret; + auto list = PD_TensorVectorToList(tensor); + auto data = reinterpret_cast(list.data); + for (size_t i = 0; i < list.size; ++i) { + ret.emplace_back(data[i]); + } + return ret; +} + +inline paddle::optional PD_OptionalTensor( + PD_Tensor *tensor) { + auto ptr = PD_OptionalTensorGetPointer(tensor); + return ptr ? paddle::optional( + phi::capi::DenseTensor(ptr)) + : paddle::optional(paddle::none); +} + +template +inline T PD_Attr(void *attr) { + return *reinterpret_cast(attr); +} + +template <> +inline std::string PD_Attr(void *attr) { + return PD_StringAttr(attr); +} + +template <> +inline PD_DataType PD_Attr(void *attr) { + return PD_DatatTypeAttr(attr); +} + +template <> +inline PD_DataLayout PD_Attr(void *attr) { + return PD_DatatLayoutAttr(attr); +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto list = PD_ListInt32Attr(attr); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto list = PD_ListInt64Attr(attr); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto list = PD_ListFloatAttr(attr); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto list = PD_ListDoubleAttr(attr); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline phi::capi::Scalar PD_Attr(void *attr) { + return phi::capi::Scalar(reinterpret_cast(attr)); +} + +template <> +inline phi::capi::IntArray PD_Attr(void *attr) { + return phi::capi::IntArray(reinterpret_cast(attr)); +} + +template <> +inline phi::capi::Place PD_Attr(void *attr) { + return phi::capi::Place(reinterpret_cast(attr)); +} + +template <> +inline std::vector PD_Attr>( + void *attr) { + auto c_list = PD_ListScalarAttr(attr); + auto data = reinterpret_cast(c_list.data); + std::vector list; + for (size_t i = 0; i < c_list.size; ++i) { + list.emplace_back(data[i]); + } + PD_DeletePointerList(c_list); + return list; +} +template <> +inline std::vector PD_Attr>(void *attr) { + auto c_list = PD_ListStringAttr(attr); + auto data = 
reinterpret_cast(c_list.data); + std::vector list; + for (size_t i = 0; i < c_list.size; ++i) { + list.emplace_back(data[i]); + } + PD_DeletePointerList(c_list); + return list; +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto c_list = PD_ListBoolAttr(attr); + std::vector list; + auto data = reinterpret_cast(c_list.data); + for (size_t i = 0; i < c_list.size; ++i) { + list[i] = static_cast(data[i]); + } + PD_DeleteUInt8List(c_list); + return list; +} +// inline phi::capi::DeviceContext PD_GetDeviceContext(PD_KernelContext *ctx) { return phi::capi::DeviceContext(PD_KernelContextGetDeviceContext(ctx)); } @@ -189,7 +311,7 @@ inline std::vector PD_AttrAt>( template <> inline std::vector PD_AttrAt>( PD_KernelContext *ctx, size_t index) { - auto c_list = PD_KernelContextListScalarAttrAt(ctx, index); + auto c_list = PD_KernelContextListStringAttrAt(ctx, index); auto data = reinterpret_cast(c_list.data); std::vector list; for (size_t i = 0; i < c_list.size; ++i) { diff --git a/paddle/phi/capi/include/kernel_utils.h b/paddle/phi/capi/include/kernel_utils.h index 7302e6f4677b3..246bc9e3c5932 100644 --- a/paddle/phi/capi/include/kernel_utils.h +++ b/paddle/phi/capi/include/kernel_utils.h @@ -454,47 +454,67 @@ namespace capi { meta_kernel_fn, \ __VA_ARGS__)) -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(in_idx == 0, \ + "Kernel's DeviceContext should appear before Inputs."); \ + static_assert( \ + attr_idx == 0, \ + "Kernel's DeviceContext should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's DeviceContext should appear before Outputs."); \ + dev_ctx arg = PD_GetDeviceContext(ctx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + const dev_ctx &arg = std::get(ctx); \ + auto dev_ctx_wrapper = phi::capi::DeviceContext( \ + reinterpret_cast(const_cast(&arg))); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., dev_ctx_wrapper); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_INPUT(tensor_type) \ template \ - struct CustomKernelCallHelper { \ + struct CustomKernelCallHelper { \ template \ static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(in_idx == 0, \ - "Kernel's DeviceContext should appear before Inputs."); \ - static_assert( \ - attr_idx == 0, \ - "Kernel's DeviceContext should appear before Attributes."); \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ static_assert(out_idx == 0, \ - "Kernel's DeviceContext should appear before Outputs."); \ - dev_ctx arg = PD_GetDeviceContext(ctx); \ + "Kernel's Input should appear before Outputs."); \ + const tensor_type arg = PD_InputAt(ctx, in_idx); \ CustomKernelCallHelper:: \ - template Compute( \ + template Compute( \ ctx, pargs..., arg); \ } \ - } - -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_INPUT(tensor_type) \ - template \ - struct CustomKernelCallHelper { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before 
Outputs."); \ - const tensor_type arg = PD_InputAt(ctx, in_idx); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + const tensor_type &arg = std::get(ctx); \ + auto tensor = phi::capi::DenseTensor( \ + reinterpret_cast(const_cast(&arg))); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor); \ + } \ } #define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type) \ @@ -516,99 +536,168 @@ namespace capi { template Compute( \ ctx, pargs..., arg); \ } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + auto &arg = std::get(ctx); \ + paddle::optional tensor = \ + PD_OptionalTensor(reinterpret_cast( \ + const_cast *>(&arg))); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ - template \ - struct CustomKernelCallHelper &, \ - Tail...> { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - auto arg = PD_MultiInputAt(ctx, in_idx); \ - auto arg_wrapper = PD_GetPointerVector(&arg); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg_wrapper); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper &, \ + Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + auto arg = PD_MultiInputAt(ctx, in_idx); \ + auto arg_wrapper = PD_GetPointerVector(&arg); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg_wrapper); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + auto &arg = std::get(ctx); \ + auto tensor = PD_TensorVector(reinterpret_cast( \ + const_cast *>(&arg))); \ + auto tensor_ptr_vec = PD_GetPointerVector(&arg); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor_ptr_vec); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(attr_type) \ - template \ - struct CustomKernelCallHelper { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - attr_type arg = PD_AttrAt(ctx, attr_idx); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(attr_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + attr_type arg = PD_AttrAt(ctx, attr_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + auto &arg = std::get(ctx); \ + auto attr = PD_Attr(reinterpret_cast(&arg)); \ + return 
CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., attr); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( \ - attr_type) \ - template \ - struct CustomKernelCallHelper { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - attr_type arg = PD_AttrAt(ctx, attr_idx); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( \ + attr_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + attr_type arg = PD_AttrAt(ctx, attr_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + const attr_type &arg = std::get(ctx); \ + auto attr = PD_Attr( \ + reinterpret_cast(const_cast(&arg))); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., attr); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OUTPUT(tensor_type) \ - template \ - struct CustomKernelCallHelper { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - auto arg = PD_OutputAt(ctx, out_idx); \ - tensor_type *ptr = (arg.raw_data() ? &arg : nullptr); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., ptr); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OUTPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + auto arg = PD_OutputAt(ctx, out_idx); \ + tensor_type *ptr = (arg.raw_data() ? &arg : nullptr); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., ptr); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + tensor_type *arg = std::get(ctx); \ + auto tensor = \ + phi::capi::DenseTensor(reinterpret_cast(arg)); \ + auto tensor_ptr = tensor.raw_data() ? &tensor : nullptr; \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor_ptr); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ - template \ - struct CustomKernelCallHelper, Tail...> { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - auto arg = PD_MultiOutputAt(ctx, out_idx); \ - auto arg_wrapper = PD_GetPointerVector(&arg); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg_wrapper); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper, Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + auto arg = PD_MultiOutputAt(ctx, out_idx); \ + std::vector tensor_ptr_vec; \ + for (auto &tensor : arg) { \ + tensor_ptr_vec.push_back(tensor.raw_data() ? 
&tensor : nullptr); \ + } \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., tensor_ptr_vec); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + std::vector &arg = std::get(ctx); \ + auto tensor_vec = PD_TensorVector(reinterpret_cast( \ + const_cast *>(&arg))); \ + std::vector tensor_ptr_vec; \ + for (auto &tensor : tensor_vec) { \ + tensor_ptr_vec.push_back(tensor.raw_data() ? &tensor : nullptr); \ + } \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor_ptr_vec); \ + } \ } template @@ -627,9 +716,10 @@ struct CustomKernelImpl { template Compute<0, 0, 0, 0>(ctx); } - static void VariadicCompute(const phi::capi::DeviceContext &dev_ctx, - Args... args) { - return kernel_fn(static_cast(dev_ctx), std::forward(args)...); + static void VariadicCompute(DevCtx dev_ctx, Args... args) { + const std::tuple args_tuple(dev_ctx, args...); + return CustomKernelCallHelper>:: + template VariadicCompute<0>(args_tuple); } private: @@ -693,6 +783,13 @@ struct CustomKernelImpl { static_assert(out_idx > 0, "Kernel should have output argument."); return kernel_fn(dev_ctx, args...); } + + template + static void VariadicCompute(const std::tuple &ctx, + DevCtx dev_ctx, + Args... args) { + return kernel_fn(dev_ctx, args...); + } }; }; diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h index 2b5421bc266cf..adfb2b5a0e050 100644 --- a/paddle/phi/capi/include/wrapper_base.h +++ b/paddle/phi/capi/include/wrapper_base.h @@ -128,7 +128,7 @@ class DenseTensor : public WrapperBase { PD_DataType dtype() const { C_Status status; - auto data_type = PD_TensorGetDataType(raw_data(), &status); + auto data_type = PD_TensorGetPDDataType(raw_data(), &status); PD_CHECK_STATUS(status); return data_type; } diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index 2e14b019c19ff..d38a19038e314 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -220,4 +220,89 @@ PD_DataLayout PD_KernelContextDataLayoutAttrAt(PD_KernelContext* ctx, kernel_context->AttrAt(index)); } +// eager +const char* PD_StringAttr(void* attr) { + auto* str = reinterpret_cast(attr); + return str->c_str(); +} + +PD_DataType PD_DatatTypeAttr(void* attr) { + auto* dtype = reinterpret_cast(attr); + return phi::capi::ToPDDataType(*dtype); +} + +PD_DataLayout PD_DatatLayoutAttr(void* attr) { + auto* layout = reinterpret_cast(attr); + return phi::capi::ToPDDataLayout(*layout); +} + +PD_List PD_ListInt32Attr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_ListInt64Attr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_ListFloatAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_ListDoubleAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_ListScalarAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + auto data = new 
PD_Scalar*[list.size]; + for (size_t i = 0; i < list.size; ++i) { + data[i] = + const_cast(reinterpret_cast(&cc_list[i])); + } + list.data = data; + return list; +} + +PD_List PD_ListStringAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + auto data = new char*[list.size]; + for (size_t i = 0; i < list.size; ++i) { + data[i] = const_cast(cc_list[i].data()); + } + list.data = reinterpret_cast(data); + return list; +} + +PD_List PD_ListBoolAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + auto data = reinterpret_cast(new uint8_t[cc_list.size()]); + for (size_t i = 0; i < cc_list.size(); ++i) { + data[i] = static_cast(cc_list[i]); + } + list.data = data; + return list; +} + PD_REGISTER_CAPI(kernel_context); diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index cd0bbd62d88a0..b460d2e368607 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" -PD_DataType PD_TensorGetDataType(const PD_Tensor* tensor, PD_Status* status) { +PD_DataType PD_TensorGetPDDataType(const PD_Tensor* tensor, PD_Status* status) { if (status) { if (!tensor) { *status = C_FAILED; @@ -299,4 +299,19 @@ void PD_TensorShareLoDWith(PD_Tensor* dst, meta_dst.share_lod(meta_src); } +PD_Tensor* PD_OptionalTensorGetPointer(PD_Tensor* tensor) { + auto cc_tensor = + reinterpret_cast*>(tensor); + return reinterpret_cast(cc_tensor->get_ptr()); +} + +PD_List PD_TensorVectorToList(PD_Tensor* tensor) { + auto cc_tensor = + reinterpret_cast*>(tensor); + PD_List list; + list.size = cc_tensor->size(); + list.data = cc_tensor->data(); + return list; +} + PD_REGISTER_CAPI(tensor); diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 3e1787cb12cfa..c6d49bd5b978b 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -50,7 +50,7 @@ enum class Backend : uint8_t { MLU, // MLU currently does not exist at the same time as CUDA // the third library backend - MKLDNN, + ONEDNN, GPUDNN, // cuDNN and hipDNN // paddle kernel primitives backend @@ -118,8 +118,8 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::MLU: os << "MLU"; break; - case Backend::MKLDNN: - os << "MKLDNN"; + case Backend::ONEDNN: + os << "ONEDNN"; break; case Backend::GPUDNN: os << "GPUDNN"; @@ -160,8 +160,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::NPU; } else if (s == std::string("MLU")) { return Backend::MLU; - } else if (s == std::string("MKLDNN")) { - return Backend::MKLDNN; + } else if (s == std::string("OneDNN")) { + return Backend::ONEDNN; } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index cbc1faf94f07c..ead3e463c2803 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -39,10 +39,9 @@ enum class AllocationType : int8_t { const char* AllocationTypeStr(AllocationType type); -PADDLE_API size_t -GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); +size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); -PADDLE_API std::string GetGlobalDeviceType(size_t device_type_id_); +std::string GetGlobalDeviceType(size_t device_type_id_); /// \brief The place is used to specify where the data is stored. 
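One detail worth flagging in the C-API attribute plumbing earlier in this patch: PD_ListBoolAttr above packs the booleans as uint8_t, while the std::vector<bool> specialization of PD_Attr that consumes it appears to write through operator[] on a default-constructed vector (unless a resize was lost in extraction). A standalone sketch of that conversion with the destination sized up front; the function name and parameters here are illustrative, not from the patch.

#include <cstddef>
#include <cstdint>
#include <vector>

// Convert a uint8_t-packed bool list (a size plus a data pointer, the same
// shape that PD_List carries) into std::vector<bool>, allocating before the
// indexed writes.
std::vector<bool> BoolListToVector(size_t size, const uint8_t* data) {
  std::vector<bool> out(size);
  for (size_t i = 0; i < size; ++i) {
    out[i] = static_cast<bool>(data[i]);
  }
  return out;
}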
class PADDLE_API Place { diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 18c39bfae1d18..231aaeebaccce 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -66,7 +66,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #endif #ifdef PADDLE_WITH_MKLDNN - case phi::Backend::MKLDNN: + case phi::Backend::ONEDNN: return phi::CPUPlace(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index ce57f4f627baa..fc85fc32f62a8 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -14,6 +14,10 @@ #include "paddle/phi/core/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" @@ -58,6 +62,26 @@ struct DeviceContext::Impl { pinned_allocator_ = allocator; } +#ifdef PADDLE_WITH_CUDA + void SetCUDAGraphAllocator(const Allocator* allocator) { + // NOTE (Yuang): cuda graph allocator can be set to nullptr, so don't check + // validation of the allocator here + cuda_graph_allocator_ = allocator; + } + + const Allocator& GetCUDAGraphAllocator() const { + PADDLE_ENFORCE_NOT_NULL(cuda_graph_allocator_, + phi::errors::InvalidArgument( + "Required cuda_graph_allocator_ shall not be " + "nullptr, but received nullptr.")); + return *cuda_graph_allocator_; + } + + bool IsCUDAGraphAllocatorValid() const { + return cuda_graph_allocator_ != nullptr; + } +#endif + const Allocator& GetAllocator() const { PADDLE_ENFORCE_NOT_NULL( device_allocator_, @@ -111,6 +135,17 @@ struct DeviceContext::Impl { auto* allocator = tensor->numel() == 0 ? zero_allocator_ : (pinned ? 
pinned_allocator_ : device_allocator_); +#ifdef PADDLE_WITH_CUDA + bool must_cuda_graph_allocator = (tensor->numel() != 0) && !pinned; + if (must_cuda_graph_allocator && paddle::platform::is_gpu_place(place) && + paddle::platform::CUDAGraph::IsThisThreadCapturing()) { + PADDLE_ENFORCE_NOT_NULL(cuda_graph_allocator_, + phi::errors::InvalidArgument( + "Required cuda_graph_allocator_ shall not be " + "nullptr, but received nullptr.")); + allocator = cuda_graph_allocator_; + } +#endif return tensor->AllocateFrom( const_cast(allocator), dtype, requested_size); } @@ -200,6 +235,9 @@ struct DeviceContext::Impl { const Allocator* host_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr}; const Allocator* pinned_allocator_{nullptr}; +#ifdef PADDLE_WITH_CUDA + const Allocator* cuda_graph_allocator_{nullptr}; +#endif Generator* device_generator_{nullptr}; Generator* host_generator_{nullptr}; }; @@ -213,6 +251,11 @@ DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetPinnedAllocator(&other.GetPinnedAllocator()); impl_->SetHostGenerator(other.GetHostGenerator()); impl_->SetGenerator(other.GetGenerator()); +#ifdef PADDLE_WITH_CUDA + if (other.IsCUDAGraphAllocatorValid()) { + impl_->SetCUDAGraphAllocator(&other.GetCUDAGraphAllocator()); + } +#endif } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -239,6 +282,20 @@ const Allocator& DeviceContext::GetHostAllocator() const { return impl_->GetHostAllocator(); } +#ifdef PADDLE_WITH_CUDA +void DeviceContext::SetCUDAGraphAllocator(const Allocator* allocator) { + impl_->SetCUDAGraphAllocator(allocator); +} + +const Allocator& DeviceContext::GetCUDAGraphAllocator() const { + return impl_->GetCUDAGraphAllocator(); +} + +bool DeviceContext::IsCUDAGraphAllocatorValid() const { + return impl_->IsCUDAGraphAllocatorValid(); +} +#endif + void DeviceContext::SetZeroAllocator(const Allocator* allocator) { impl_->SetZeroAllocator(allocator); } diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 45e4fbf64dc04..32dbb0c0a357c 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -106,6 +106,33 @@ class PADDLE_API DeviceContext { const Allocator& GetPinnedAllocator() const; +#ifdef PADDLE_WITH_CUDA + /** + * @brief Set the CUDA graph Allocator object. + * + * @param allocator + */ + void SetCUDAGraphAllocator(const Allocator*); + + /** + * @brief Get the const CUDA graph Allocator object. + * + * @return Allocator + */ + const Allocator& GetCUDAGraphAllocator() const; + + /** + * @brief Test whether the CUDA graph allocator is valid + * + * This method should be called before calling GetCUDAGraphAllocator(). + * Other unit can calls GetCUDAGraphAllocator() method, + * only when this method returns True! + * + * @return true if cuda_graph_allocator_ is valid, false otherwise + */ + bool IsCUDAGraphAllocatorValid() const; +#endif + /** * @brief Allocate device memory for tensor. */ diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index 0dd415d13130e..8074fbeb49180 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -14,13 +14,12 @@ limitations under the License. 
*/ #include "paddle/phi/core/enforce.h" -#include #include #include #include #include -#include "boost/blank.hpp" +#include "paddle/utils/blank.h" #include "paddle/utils/variant.h" namespace egr { @@ -29,7 +28,7 @@ class EagerVariable; namespace paddle { namespace framework { class BlockDesc; -using Attribute = paddle::variantsecond.find(kernel_key); + if (kernel_iter == iter->second.end() && + kernel_key.layout() != phi::DataLayout::ALL_LAYOUT) { + phi::KernelKey any_layout_kernel_key( + kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()); + kernel_iter = iter->second.find(any_layout_kernel_key); + } + if (kernel_iter == iter->second.end()) { return empty_kernel; } + return kernel_iter->second; } diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 65f655d50375c..010d5c2e0c379 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -56,6 +56,9 @@ struct KernelArgsParseFunctor { auto args_type = ParseArgType(Indices{}); for (auto arg_type : args_type) { if (arg_type == std::type_index(typeid(const CPUContext&)) +#if defined(PADDLE_WITH_MKLDNN) + || arg_type == std::type_index(typeid(const OneDNNContext&)) +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || arg_type == std::type_index(typeid(const GPUContext&))) { #elif defined(PADDLE_WITH_XPU) @@ -63,6 +66,7 @@ struct KernelArgsParseFunctor { #elif defined(PADDLE_WITH_CUSTOM_DEVICE) || arg_type == std::type_index(typeid(const CustomContext&))) { #else + ) { #endif // do nothing, skip context arg now diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 3b5fd0247a484..9206acfd51542 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -17,7 +17,10 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/onednn/onednn_context.h" +#ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/xpu/xpu_context.h" +#endif #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" @@ -257,7 +260,9 @@ struct KernelImpl { #ifdef PADDLE_WITH_CUSTOM_DEVICE PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext); #endif - +#ifdef PADDLE_WITH_MKLDNN + PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(OneDNNContext); +#endif /* Input Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h index 8049d027a77b8..e48f7342e456e 100644 --- a/paddle/phi/core/macros.h +++ b/paddle/phi/core/macros.h @@ -53,4 +53,10 @@ namespace phi { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x +#if defined(__NVCC__) || defined(__HIPCC__) +#define PADDLE_RESTRICT __restrict__ +#else +#define PADDLE_RESTRICT +#endif + } // namespace phi diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 2178855aa0fee..f0cd841235ef1 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/meta_tensor.h" +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 271759161868b..377d0e9bc4d6d 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "glog/logging.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index e2d8d5a03651c..45f6c00affe05 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -53,6 +53,10 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU } else if (paddle::platform::is_xpu_place(dst_place)) { dst_ptr = dev_ctx.Alloc(dst, src.dtype()); +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (paddle::platform::is_custom_place(dst_place)) { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); #endif } diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 9ef8e8a356c7a..975d55889c717 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -80,4 +80,21 @@ inline void VisitDataTypeTiny(phi::DataType type, Visitor visitor) { "Not supported phi::DataType(%d) as data type.", static_cast(type))); } +inline bool IsComplexType(const DataType& type) { + return (type == DataType::COMPLEX64 || type == DataType::COMPLEX128); +} + +inline DataType ToComplexType(const DataType& type) { + switch (type) { + case DataType::FLOAT32: + return DataType::COMPLEX64; + case DataType::FLOAT64: + return DataType::COMPLEX128; + default: + PADDLE_THROW(errors::Unimplemented( + "Can not transform data type (%s) to complex type, now only support " + "float32 and float64 real value.", + type)); + } +} } // namespace phi diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index f59ea5549bd71..dd2d1eb482c8e 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -83,6 +83,23 @@ void ChannelShuffleGradInferMeta(const MetaTensor& out_grad, x_grad->set_dtype(out_grad.dtype()); } +void ComplexGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy) { + auto x_dims = x.dims(); + if (dx) { + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + } + auto y_dims = y.dims(); + if (dy) { + dy->set_dims(y_dims); + dy->set_dtype(y.dtype()); + } +} + void ConvTransposeGradInferMeta(const MetaTensor& x, const MetaTensor& filter, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 0e7ed640d8ffb..6a4eba74b47be 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -42,6 +42,12 @@ void ChannelShuffleGradInferMeta(const MetaTensor& out_grad, const std::string& data_format, MetaTensor* x_grad); +void ComplexGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy); + void ConvTransposeGradInferMeta(const MetaTensor& x, const MetaTensor& filter, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 1ba025e2c6252..460b0a9e1bdc4 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -17,8 +17,11 @@ limitations under the License. 
*/ #include #include +#include "glog/logging.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/cpu/conv_util.h" @@ -356,6 +359,37 @@ void CompareAllInferMeta(const MetaTensor& x, out->set_dtype(DataType::BOOL); } +void ComplexInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + if (x.dims() == y.dims()) { + auto sizes = vectorize(x.dims()); + out->set_dims(phi::make_ddim(sizes)); + out->set_dtype(dtype::ToComplex(x.dtype())); + // NOTE(chenfeiyu): lod & broadcasting is intrinsically contradictory + // so tensors with lod are not supported here + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + int max_dim = std::max(x_dims.size(), y_dims.size()); + + // start align axis + int axis = std::abs(x_dims.size() - y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + phi::funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + out->set_dims(phi::make_ddim(out_dims_array)); + out->set_dtype(dtype::ToComplex(x.dtype())); + } +} + void ConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 9709edf63ccc0..12922ed536add 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -74,6 +74,10 @@ void CompareInferMeta(const MetaTensor& x, int axis, MetaTensor* out); +void ComplexInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); + void ConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 61c57981f94b5..3369b0c392ec3 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1528,6 +1528,43 @@ void LogspaceInferMeta(const MetaTensor& start, out->set_dtype(start.dtype()); } +void MergedAdamInferMeta( + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out) {} + +void MergedMomentumInferMeta( + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, + float mu, + bool use_nesterov, + const std::vector& regularization_method, + const std::vector& regularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out) {} + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs) { const size_t inputs_num = inputs.size(); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 54c6fccceb9c1..0ec71e86893c3 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -234,6 +234,43 @@ 
void LogspaceInferMeta(const MetaTensor& start, const MetaTensor& base, MetaTensor* out); +void MergedAdamInferMeta( + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out); + +void MergedMomentumInferMeta( + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, + float mu, + bool use_nesterov, + const std::vector& regularization_method, + const std::vector& regularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out); + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index a22f720b97e76..9f65de0f0aa70 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/infermeta/ternary.h" +#include "glog/logging.h" + #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 072ab6fd68a1a..5958f0e71e76a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -148,6 +148,14 @@ void ArgsortInferMeta(const MetaTensor& input, indices->share_lod(input); } +void AsRealInferMeta(const MetaTensor& input, MetaTensor* output) { + auto out_dims_v = phi::vectorize(input.dims()); + out_dims_v.push_back(2); + auto out_dims = phi::make_ddim(out_dims_v); + output->set_dims(out_dims); + output->share_lod(input); +} + void BatchSizeLikeInferMeta(const MetaTensor& x, const std::vector& shape, int x_batch_size_dim, @@ -399,6 +407,39 @@ void EighInferMeta(const MetaTensor& x, out_v->set_dims(input_dim); } +void EigvalsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config) { + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + errors::InvalidArgument( + "The dimensions of Input(X) for Eigvals operator " + "should be at least 2, " + "but received X's dimension = %d, X's shape = [%s].", + x_dims.size(), + x_dims)); + + if (config.is_runtime || !phi::contain_unknown_dim(x_dims)) { + int last_dim = x_dims.size() - 1; + PADDLE_ENFORCE_EQ(x_dims[last_dim], + x_dims[last_dim - 1], + errors::InvalidArgument( + "The last two dimensions of Input(X) for Eigvals " + "operator should be equal, " + "but received X's shape = [%s].", + x_dims)); + } + + auto out_dims = vectorize(x_dims); + out_dims.resize(x_dims.size() - 1); + + const DataType& x_dtype = x.dtype(); + const DataType& out_dtype = + IsComplexType(x_dtype) ? 
x_dtype : ToComplexType(x_dtype); + + out->set_dims(make_ddim(out_dims)); + out->set_dtype(out_dtype); +} + void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out, @@ -2999,6 +3040,66 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } +void UniqueConsecutiveInferMeta(const MetaTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + MetaTensor* out, + MetaTensor* index, + MetaTensor* counts) { + PADDLE_ENFORCE_NE(out, + nullptr, + phi::errors::InvalidArgument( + "unique_consecutive should have output tensor out.")); + + auto in_dims = x.dims(); + if (return_inverse) { + PADDLE_ENFORCE_NE( + index, + nullptr, + phi::errors::InvalidArgument("Tensor index should not be null if " + "return_inverse is set to True.")); + } + if (return_counts) { + PADDLE_ENFORCE_NE( + counts, + nullptr, + phi::errors::InvalidArgument("Tensor counts should not be null if " + "return_counts is set to True.")); + } + + if (axis.empty()) { + out->set_dims({-1}); + out->set_dtype(x.dtype()); + if (return_inverse) { + index->set_dims({phi::product(in_dims)}); + } + } else { + int axis_value = axis[0]; + if (axis_value < 0) { + axis_value += in_dims.size(); + } + PADDLE_ENFORCE_LT( + axis_value, + in_dims.size(), + phi::errors::InvalidArgument("The axis(%d) should be less than " + "the dimension size(%d) of x.", + axis_value, + in_dims.size())); + auto out_dims = in_dims; + out_dims[axis_value] = -1; + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + if (return_inverse) { + index->set_dims({in_dims[axis_value]}); + } + } + if (return_counts) { + counts->set_dims({-1}); + } +} + void UniqueInferMeta(const MetaTensor& x, bool return_index, bool return_inverse, @@ -3110,15 +3211,18 @@ void UnsqueezeInferMeta(const MetaTensor& x, } out->set_dtype(x.dtype()); } - // set xshape dims. - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; + if (xshape) { + // set xshape dims. 
+ std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + + xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->share_lod(x); + xshape->set_dtype(x.dtype()); } - xshape->set_dims(phi::make_ddim(xshape_dims)); - xshape->share_lod(x); - xshape->set_dtype(x.dtype()); } void UnStackInferMeta(const MetaTensor& x, @@ -3262,6 +3366,18 @@ void ChannelShuffleInferMeta(const MetaTensor& x, out->set_dims(output_dims); } +void IdentityLossInferMeta(const MetaTensor& x, + int reduction, + MetaTensor* out) { + if (reduction == 2) { + out->set_dtype(x.dtype()); + out->set_dims(x.dims()); + } else { + out->set_dims(phi::make_ddim({1})); + out->set_dtype(x.dtype()); + } +} + } // namespace phi PD_REGISTER_INFER_META_FN(flatten, phi::FlattenInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index f64d406e019ce..30db8dcae9882 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -48,6 +48,8 @@ void ArgsortInferMeta(const MetaTensor& input, MetaTensor* output, MetaTensor* indices); +void AsRealInferMeta(const MetaTensor& input, MetaTensor* output); + void BatchSizeLikeInferMeta(const MetaTensor& x, const std::vector& shape, int x_batch_size_dim, @@ -80,6 +82,10 @@ void EighInferMeta(const MetaTensor& x, MetaTensor* out_w, MetaTensor* out_v); +void EigvalsInferMeta(const MetaTensor& x, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out, @@ -420,6 +426,15 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void UniqueConsecutiveInferMeta(const MetaTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + MetaTensor* out, + MetaTensor* index, + MetaTensor* counts); + void UniqueInferMeta(const MetaTensor& x, bool return_index, bool return_inverse, @@ -469,4 +484,6 @@ void ChannelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void IdentityLossInferMeta(const MetaTensor& x, int reduction, MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index af6cfb8812de8..05abcbd0d1964 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -62,6 +62,7 @@ set(COMMON_KERNEL_DEPS pooling maxouting matrix_inverse + matrix_solve phi_dynload_warpctc sequence_padding sequence_scale) @@ -113,11 +114,13 @@ file( # file(GLOB kernel_cudnn "gpudnn/*.cu") # file(GLOB kernel_kps "kps/*.cu") file(GLOB kernel_xpu "xpu/*.cc") +file(GLOB kernel_onednn "onednn/*.cc") add_library(phi_cpu ${kernel_cc}) kernel_declare("${kernel_cc}") target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS}) -set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu) + +set(ADD_PHI_KERNELS phi_cpu) if(WITH_GPU OR WITH_ROCM) if(WITH_GPU) @@ -127,7 +130,7 @@ if(WITH_GPU OR WITH_ROCM) endif() kernel_declare("${kernel_cu}") target_link_libraries(phi_gpu ${COMMON_KERNEL_DEPS}) - set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu phi_gpu) + set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_gpu) endif() if(WITH_XPU) @@ -148,5 +151,15 @@ if(WITH_XPU) kernel_declare("${kernel_xpu}") kernel_declare("${kernel_xpu_kps}") target_link_libraries(phi_xpu ${COMMON_KERNEL_DEPS}) - set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu phi_xpu) + set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_xpu) +endif() + +if(WITH_MKLDNN) + 
add_library(phi_onednn ${kernel_onednn}) + kernel_declare(${kernel_onednn}) + set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} onednn_context) + target_link_libraries(phi_onednn ${COMMON_KERNEL_DEPS}) + set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_onednn) endif() + +set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS}) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 8e63a0fd22ade..4daa231437116 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -212,12 +212,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Square); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); @@ -233,9 +238,12 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b); +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, beta, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset); diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 5cc4357c937db..8e5913e10fdb7 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -40,12 +40,12 @@ namespace phi { float attr2, \ DenseTensor* out); +DECLARE_ACTIVATION_KERNEL(Sin) DECLARE_ACTIVATION_KERNEL(Cos) DECLARE_ACTIVATION_KERNEL(Tan) -DECLARE_ACTIVATION_KERNEL(Acos) -DECLARE_ACTIVATION_KERNEL(Sin) DECLARE_ACTIVATION_KERNEL(Asin) DECLARE_ACTIVATION_KERNEL(Atan) +DECLARE_ACTIVATION_KERNEL(Acos) DECLARE_ACTIVATION_KERNEL(Sinh) DECLARE_ACTIVATION_KERNEL(Cosh) DECLARE_ACTIVATION_KERNEL(Asinh) @@ -53,15 +53,14 @@ DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Tanh) +DECLARE_ACTIVATION_KERNEL(TanhShrink) +DECLARE_ACTIVATION_KERNEL(Silu) DECLARE_ACTIVATION_KERNEL(Exp) DECLARE_ACTIVATION_KERNEL(Expm1) DECLARE_ACTIVATION_KERNEL(Reciprocal) DECLARE_ACTIVATION_KERNEL(Square) DECLARE_ACTIVATION_KERNEL(Sqrt) DECLARE_ACTIVATION_KERNEL(Rsqrt) - -DECLARE_ACTIVATION_KERNEL(TanhShrink) -DECLARE_ACTIVATION_KERNEL(Silu) DECLARE_ACTIVATION_KERNEL(Sigmoid) DECLARE_ACTIVATION_KERNEL(LogSigmoid) DECLARE_ACTIVATION_KERNEL(Log) @@ -77,28 +76,18 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, 
lambda) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Mish, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Logit, eps) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b) -DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) - DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold) - -template -void LogitKernel(const Context& dev_ctx, - const DenseTensor& x, - float eps, - DenseTensor* out); - -template -void MishKernel(const Context& dev_ctx, - const DenseTensor& x, - float threshold, - DenseTensor* out); +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) template void HardSwishKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/adam_kernel.h b/paddle/phi/kernels/adam_kernel.h index 0bdf05f8e5123..b1a7f5a686530 100644 --- a/paddle/phi/kernels/adam_kernel.h +++ b/paddle/phi/kernels/adam_kernel.h @@ -44,4 +44,27 @@ void AdamDenseKernel(const Context& dev_ctx, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs); +template +void MergedAdamKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out); + } // namespace phi diff --git a/paddle/phi/kernels/as_real_kernel.h b/paddle/phi/kernels/as_real_kernel.h new file mode 100644 index 0000000000000..e600f3ce39a6d --- /dev/null +++ b/paddle/phi/kernels/as_real_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
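The activation headers above split the new grad declarations between DEPX and DEPOUT macros, and fold the previously hand-written LogitKernel/MishKernel templates into the one-attribute DECLARE macro. The DEPX/DEPOUT naming suggests the backward kernel reads either the forward input x or the forward output; that reading matches the calculus, since d/dx exp(x) equals exp(x), which the forward pass already produced, while d/dx x^2 = 2x still needs the input. A small standalone illustration of the two gradient forms (hypothetical free functions, not the phi functors):

#include <cassert>
#include <cmath>

// Exp is declared with the DEPOUT macro: dx = dout * out, because the
// derivative of exp(x) is exp(x), which is the saved forward output.
double ExpGradFromOut(double out, double dout) { return dout * out; }

// Square is declared with the DEPX macro: dx = dout * 2 * x, which still
// needs the forward input x.
double SquareGradFromX(double x, double dout) { return dout * 2.0 * x; }

int main() {
  const double x = 1.5, dout = 1.0;
  assert(std::abs(ExpGradFromOut(std::exp(x), dout) - std::exp(x)) < 1e-12);
  assert(std::abs(SquareGradFromX(x, dout) - 3.0) < 1e-12);
  return 0;
}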
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AsRealKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index a7a6c2f8e4dc0..9379c78c8d005 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -18,10 +18,7 @@ elseif(WITH_ROCM) DEPS gtest) endif() -cc_library( - cache - SRCS cache.cc - DEPS boost) +cc_library(cache SRCS cache.cc) cc_library( switch_autotune SRCS switch_autotune.cc diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index 95afa7f697b49..91685c2ed547c 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -14,12 +14,10 @@ #pragma once -#include #include - #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/autotune/gpu_timer.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" namespace phi { namespace autotune { @@ -51,33 +49,61 @@ class AutoTuneBase { public: AutoTuneBase() {} virtual ~AutoTuneBase() {} - explicit AutoTuneBase(KernelType kernel) { kernels_.push_back(kernel); } - template - void AddCallBack(Type kernel) { - static_assert(std::is_same::value, - "Type must be the same"); - kernels_.push_back(kernel); + explicit AutoTuneBase(KernelType kernel) { + kernels_.push_back(/*default=*/kernel); } - template - void RunBestKernel(const int idx, Args&&... args) { - kernels_[idx].Run(args...); + void AddCallBack(KernelType kernel) { + if (!is_init_) { + std::lock_guard lock(mutex_); + kernels_.push_back(kernel); + } } - template - void RunDefaultKernel(Args&&... args) { - kernels_[0].Run(args...); + template + void Run(const Context& ctx, + const AlgorithmType& algo, + const size_t key, + Args&&... args) { + PADDLE_ENFORCE_GT( + kernels_.size(), + 0, + paddle::platform::errors::InvalidArgument( + "kernel num must be greater than 0, now is %d", kernels_.size())); + is_init_ = true; + + auto& cache = AutoTuneCache::Instance().Get(algo); + if (cache.Find(key)) { + auto best_idx = cache.Get(key); + kernels_[best_idx].Run(args...); + } else { + bool use_autotune = AutoTuneStatus::Instance().UseAutoTune(); + if (use_autotune) { + // All avaliable kernels have ran while picking the best kernel, + // so there may be no need for another kernel run. + auto best_idx = PickBestKernel(ctx, args...); + cache.Set(key, best_idx); + } else { + kernels_[0].Run(args...); + } + } } + private: + bool is_init_{false}; + std::vector kernels_; + mutable std::mutex mutex_; + template - int PickBestKernel(const Context& ctx, Args&&... args) { + size_t PickBestKernel(const Context& ctx, Args&&... args) { + std::lock_guard lock(mutex_); PADDLE_ENFORCE_GT( kernels_.size(), 0, paddle::platform::errors::InvalidArgument( "kernel num must be greater than 0, now is %d", kernels_.size())); - int best_idx = 0; + size_t best_idx = 0; float min_time = std::numeric_limits::max(); // Time cost test estabulished in default stream. @@ -92,23 +118,15 @@ class AutoTuneBase { return best_idx; } - bool IsInit() { return is_init_; } - void Finalize() { is_init_ = true; } - - private: - bool is_init_{false}; - std::vector kernels_; - template float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) { + // Regard 1st run as warmup. Judge the result by the time cost of rest run + // cycles. 
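The reworked AutoTuneBase above merges the old PickBestKernel/RunBestKernel/RunDefaultKernel entry points into a single Run() that consults the cache, falls back to the default kernel when autotuning is off, and otherwise times every candidate once and caches the winner. A compact standalone sketch of that control flow, with hypothetical names, std::function candidates, a plain map in place of AutoTuneCache, and std::chrono in place of GpuTimer:

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <limits>
#include <unordered_map>
#include <vector>

struct TunerSketch {
  std::vector<std::function<void()>> kernels;        // candidate implementations
  std::unordered_map<std::size_t, std::size_t> cache;  // key -> best kernel index
  bool use_autotune = true;

  void Run(std::size_t key) {
    auto it = cache.find(key);
    if (it != cache.end()) {        // cache hit: reuse the recorded best index
      kernels[it->second]();
      return;
    }
    if (!use_autotune) {            // tuning disabled: always take the default kernel
      kernels[0]();
      return;
    }
    cache[key] = PickBest();        // time every candidate once and remember the winner
  }

  std::size_t PickBest() {
    std::size_t best = 0;
    double min_time = std::numeric_limits<double>::max();
    for (std::size_t i = 0; i < kernels.size(); ++i) {
      auto t0 = std::chrono::steady_clock::now();
      kernels[i]();                 // candidates do their real work while being measured
      auto t1 = std::chrono::steady_clock::now();
      double cost = std::chrono::duration<double>(t1 - t0).count();
      if (cost < min_time) {
        min_time = cost;
        best = i;
      }
    }
    return best;
  }
};

int main() {
  TunerSketch tuner;
  tuner.kernels = {[] { std::puts("kernel 0"); }, [] { std::puts("kernel 1"); }};
  tuner.Run(/*key=*/42);  // first call tunes and fills the cache
  tuner.Run(/*key=*/42);  // second call reuses the cached best index
  return 0;
}

As in the real implementation, the candidates run their actual work while being timed, so a cache miss with autotuning enabled does not require an extra kernel launch afterwards.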
+ constexpr int repeats = 3; phi::GpuTimer timer; float time_cost = 0; const auto& stream = ctx.stream(); - // Treat 1st run as warm up. Judge the result with - // the sum of 2nd and 3rd run. - constexpr int repeats = 3; - ctx.Wait(); for (int i = 0; i < repeats; ++i) { timer.Start(stream); @@ -151,7 +169,7 @@ std::once_flag TransposeAutoTuner::init_flag_; template static AutoTuneBase>* - MakeTransposeTuner(RetureType (*func)(Args...)) { +MakeTransposeTuner(RetureType (*func)(Args...)) { auto obj = MakeCallback(func); return TransposeAutoTuner::Instance(obj); } diff --git a/paddle/phi/kernels/autotune/auto_tune_test.cu b/paddle/phi/kernels/autotune/auto_tune_test.cu index d80790dbf2c15..2ac7b0b8b7509 100644 --- a/paddle/phi/kernels/autotune/auto_tune_test.cu +++ b/paddle/phi/kernels/autotune/auto_tune_test.cu @@ -131,24 +131,5 @@ TEST(AutoTune, sum) { timer.Stop(0); VLOG(3) << "kernel[" << i << "]: time cost is " << timer.ElapsedTime(); } - - // 2. Test call_back tune. - VLOG(3) << ">>> [AutoTune]: Test case."; - auto tuner = tune::MakeAutoTuner(Algo<4>); - tuner.AddCallBack(tune::MakeCallback(Algo<2>)); - tuner.AddCallBack(tune::MakeCallback(Algo<1>)); - - /* The 1st ctx works for ctx.Wait(), - the 2nd is just the param of call_back. */ - auto best_index = tuner.PickBestKernel( - *dev_ctx, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); - - dev_ctx->Wait(); - phi::GpuTimer timer; - timer.Start(0); - tuner.RunBestKernel( - best_index, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); - timer.Stop(0); - VLOG(3) << "Best CallBackKernel time cost is " << timer.ElapsedTime(); #endif } diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index 5e2c9e1c742ff..838f2dd265eb3 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -36,6 +36,13 @@ size_t ConvKey(const std::vector& x_dims, static_cast(dtype)); } +size_t TransposeKey(const std::vector& x_dims, + const std::vector& perm, + phi::DataType dtype) { + const auto rank = perm.size(); + return GetKey(x_dims, perm, rank, static_cast(dtype)); +} + std::string AlgorithmTypeString(int64_t algo_type) { if (algo_type == static_cast(AlgorithmType::kConvForward)) { return "conv_forward"; diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 8de0695ede40c..1263cf40e567e 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -68,6 +68,10 @@ size_t ConvKey(const std::vector& x_dims, const std::vector& dilations, phi::DataType dtype); +size_t TransposeKey(const std::vector& x_dims, + const std::vector& perm, + phi::DataType dtype); + template class AlgorithmsCache { public: diff --git a/paddle/phi/kernels/autotune/switch_autotune.cc b/paddle/phi/kernels/autotune/switch_autotune.cc index 6fda24ef3c860..3742749b3bf03 100644 --- a/paddle/phi/kernels/autotune/switch_autotune.cc +++ b/paddle/phi/kernels/autotune/switch_autotune.cc @@ -29,6 +29,7 @@ void AutoTuneStatus::EnableAutoTune() { void AutoTuneStatus::DisableAutoTune() { FLAGS_use_autotune = false; + use_autotune_ = false; Init(); } diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h index be13e2826ea81..91c47538e958d 100644 --- a/paddle/phi/kernels/complex_grad_kernel.h +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -28,4 +28,12 @@ void ImagGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx); +template +void ComplexGradKernel(const Context& dev_ctx, + const 
DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy); + } // namespace phi diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 07f93f9b926f1..ad66b890b3d5a 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -30,6 +30,12 @@ void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); template void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +void ComplexKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + // If T is complex template < typename T, diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc index 03e2a539640ea..03a75bd36156f 100644 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -167,7 +167,111 @@ void AdamDenseKernel(const Context& dev_ctx, } } +template +void MergedAdamKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out) { + size_t param_num = param.size(); + PADDLE_ENFORCE_EQ( + param_num, + grad.size(), + errors::InvalidArgument("The size of Input(grad) must be equal to " + "Input(param), but got the size of Input(grad) " + "is %d, the size of Input(param) is %d.", + grad.size(), + param_num)); + PADDLE_ENFORCE_EQ( + param_num, + learning_rate.size(), + errors::InvalidArgument( + "The size of Input(learning_rate) must be equal to " + "Input(param), but got the size of Input(learning_rate) " + "is %d, the size of Input(param) is %d.", + learning_rate.size(), + param_num)); + PADDLE_ENFORCE_EQ(param_num, + moment1.size(), + errors::InvalidArgument( + "The size of Input(moment1) must be equal to " + "Input(param), but got the size of Input(moment1) " + "is %d, the size of Input(param) is %d.", + moment1.size(), + param_num)); + PADDLE_ENFORCE_EQ(param_num, + moment2.size(), + errors::InvalidArgument( + "The size of Input(moment2) must be equal to " + "Input(param), but got the size of Input(moment2) " + "is %d, the size of Input(param) is %d.", + moment2.size(), + param_num)); + PADDLE_ENFORCE_EQ(param_num, + beta1_pow.size(), + errors::InvalidArgument( + "The size of Input(beta1_pow) must be equal to " + "Input(param), but got the size of Input(beta1_pow) " + "is %d, the size of Input(param) is %d.", + beta1_pow.size(), + param_num)); + PADDLE_ENFORCE_EQ(param_num, + beta2_pow.size(), + errors::InvalidArgument( + "The size of Input(beta2_pow) must be equal to " + "Input(param), but got the size of Input(beta2_pow) " + "is %d, the size of Input(param) is %d.", + beta2_pow.size(), + param_num)); + T beta1_ = beta1.to(); + T beta2_ = beta2.to(); + T epsilon_ = epsilon.to(); + + for (size_t idx = 0; idx < param_num; idx++) { + phi::funcs::AdamFunctor functor( + beta1_, + beta2_, + epsilon_, + beta1_pow[idx]->data(), + beta2_pow[idx]->data(), + moment1[idx]->data(), + dev_ctx.template Alloc(moment1_out[idx]), + moment2[idx]->data(), + 
dev_ctx.template Alloc(moment2_out[idx]), + learning_rate[idx]->data(), + grad[idx]->data(), + param[idx]->data(), + dev_ctx.template Alloc(param_out[idx])); + functor(param[idx]->numel()); + if (!use_global_beta_pow) { + dev_ctx.template Alloc(beta1_pow_out[idx])[0] = + beta1_ * beta1_pow[idx]->data()[0]; + dev_ctx.template Alloc(beta2_pow_out[idx])[0] = + beta2_ * beta2_pow[idx]->data()[0]; + } + } +} + } // namespace phi PD_REGISTER_KERNEL(adam, CPU, ALL_LAYOUT, phi::AdamDenseKernel, float, double) { } + +PD_REGISTER_KERNEL( + merged_adam, CPU, ALL_LAYOUT, phi::MergedAdamKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dist_kernel.cc b/paddle/phi/kernels/cpu/as_real_kernel.cc similarity index 80% rename from paddle/phi/kernels/cpu/dist_kernel.cc rename to paddle/phi/kernels/cpu/as_real_kernel.cc index 0c7b5db64b38f..c4f6ec87af414 100644 --- a/paddle/phi/kernels/cpu/dist_kernel.cc +++ b/paddle/phi/kernels/cpu/as_real_kernel.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/dist_kernel.h" +#include "paddle/phi/kernels/as_real_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/dist_kernel_impl.h" +#include "paddle/phi/kernels/impl/as_real_impl.h" -PD_REGISTER_KERNEL(dist, CPU, ALL_LAYOUT, phi::DistKernel, float, double) {} +PD_REGISTER_KERNEL(as_real, CPU, ALL_LAYOUT, phi::AsRealKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/complex_grad_kernel.cc b/paddle/phi/kernels/cpu/complex_grad_kernel.cc index 11b7a05834607..049022f01e7c0 100644 --- a/paddle/phi/kernels/cpu/complex_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_grad_kernel.cc @@ -31,3 +31,8 @@ PD_REGISTER_KERNEL(imag_grad, phi::ImagGradKernel, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL( + complex_grad, CPU, ALL_LAYOUT, phi::ComplexGradKernel, float, double) { + kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index bef0b7b747a42..9e6c72ae7c16a 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -49,3 +49,8 @@ PD_REGISTER_KERNEL(imag, phi::dtype::complex) { kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } + +PD_REGISTER_KERNEL( + complex, CPU, ALL_LAYOUT, phi::ComplexKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/eigvals_kernel.cc b/paddle/phi/kernels/cpu/eigvals_kernel.cc new file mode 100644 index 0000000000000..e99aa42fbdb29 --- /dev/null +++ b/paddle/phi/kernels/cpu/eigvals_kernel.cc @@ -0,0 +1,260 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
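MergedAdamKernel above checks that all the per-parameter lists have the same length, applies AdamFunctor to each entry, and updates beta1_pow/beta2_pow itself when use_global_beta_pow is false. AdamFunctor's element-wise math is not part of this diff; the sketch below uses the textbook Adam update on scalar parameters purely to illustrate what one pass of the merged loop does, so treat the update formula as an assumption rather than a statement about the functor's internals:

#include <cmath>
#include <cstdio>
#include <vector>

struct ParamState {  // one entry of the merged per-parameter lists
  double param, grad, moment1, moment2, beta1_pow, beta2_pow, lr;
};

// Textbook Adam step per parameter (illustration only; the real element-wise
// math lives in phi::funcs::AdamFunctor, which this diff does not show).
void MergedAdamSketch(std::vector<ParamState>& states, double beta1,
                      double beta2, double epsilon, bool use_global_beta_pow) {
  for (auto& s : states) {
    s.moment1 = beta1 * s.moment1 + (1.0 - beta1) * s.grad;
    s.moment2 = beta2 * s.moment2 + (1.0 - beta2) * s.grad * s.grad;
    const double lr_t = s.lr * std::sqrt(1.0 - s.beta2_pow) / (1.0 - s.beta1_pow);
    s.param -= lr_t * s.moment1 / (std::sqrt(s.moment2) + epsilon);
    if (!use_global_beta_pow) {  // mirrors the explicit beta_pow update in the kernel
      s.beta1_pow *= beta1;
      s.beta2_pow *= beta2;
    }
  }
}

int main() {
  std::vector<ParamState> states = {{1.0, 0.1, 0.0, 0.0, 0.9, 0.999, 0.01}};
  MergedAdamSketch(states, 0.9, 0.999, 1e-8, /*use_global_beta_pow=*/false);
  std::printf("updated param: %f\n", states[0].param);
  return 0;
}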
+ +#include "paddle/phi/kernels/eigvals_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" + +namespace phi { + +template +struct PaddleComplex; + +template +struct PaddleComplex< + T, + typename std::enable_if::value>::type> { + using type = dtype::complex; +}; + +template +struct PaddleComplex< + T, + typename std::enable_if< + std::is_same>::value || + std::is_same>::value>::type> { + using type = T; +}; + +template +using PaddleCType = typename PaddleComplex::type; +template +using Real = typename dtype::Real; + +inline void CheckLapackEigResult(const int info, const std::string& name) { + PADDLE_ENFORCE_LE( + info, + 0, + errors::PreconditionNotMet("The QR algorithm failed to compute all the " + "eigenvalues in function %s.", + name.c_str())); + PADDLE_ENFORCE_GE( + info, + 0, + errors::InvalidArgument( + "The %d-th argument has an illegal value in function %s.", + -info, + name.c_str())); +} + +template +typename std::enable_if::value>::type LapackEigvals( + const Context& ctx, + const DenseTensor& input, + DenseTensor* output, + DenseTensor* work, + DenseTensor* rwork /*unused*/) { + DenseTensor a; // will be overwritten when lapackEig exit + Copy(ctx, input, input.place(), /*blocking=*/true, &a); + + DenseTensor w; + int64_t n_dim = input.dims()[1]; + w.Resize(make_ddim({n_dim << 1})); + T* w_data = ctx.template Alloc(&w); + + int64_t work_mem = work->memory_size(); + int64_t required_work_mem = 3 * n_dim * sizeof(T); + PADDLE_ENFORCE_GE( + work_mem, + 3 * n_dim * sizeof(T), + errors::InvalidArgument( + "The memory size of the work tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received work\'s memory size = %" PRId64 " bytes.", + required_work_mem, + work_mem)); + + int info = 0; + phi::funcs::lapackEig('N', + 'N', + static_cast(n_dim), + a.template data(), + static_cast(n_dim), + w_data, + NULL, + 1, + NULL, + 1, + work->template data(), + static_cast(work_mem / sizeof(T)), + static_cast(NULL), + &info); + + std::string name = "phi::backend::dynload::dgeev_"; + if (input.dtype() == DataType::FLOAT64) { + name = "phi::backend::dynload::sgeev_"; + } + CheckLapackEigResult(info, name); + + funcs::ForRange for_range(ctx, n_dim); + funcs::RealImagToComplexFunctor> functor( + w_data, w_data + n_dim, output->template data>(), n_dim); + for_range(functor); +} + +template +typename std::enable_if>::value || + std::is_same>::value>::type +LapackEigvals(const Context& ctx, + const DenseTensor& input, + DenseTensor* output, + DenseTensor* work, + DenseTensor* rwork) { + DenseTensor a; // will be overwritten when lapackEig exit + Copy(ctx, input, input.place(), /*blocking=*/true, &a); + + int64_t work_mem = work->memory_size(); + int64_t n_dim = input.dims()[1]; + int64_t required_work_mem = 3 * n_dim * sizeof(T); + PADDLE_ENFORCE_GE( + work_mem, + 3 * n_dim * sizeof(T), + errors::InvalidArgument( + "The memory size of the work tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received work\'s memory size = %" PRId64 " bytes.", + required_work_mem, + work_mem)); + + int64_t rwork_mem = rwork->memory_size(); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(dtype::Real); + PADDLE_ENFORCE_GE( + rwork_mem, + 
required_rwork_mem, + errors::InvalidArgument( + "The memory size of the rwork tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received rwork\'s memory size = %" PRId64 " bytes.", + required_rwork_mem, + rwork_mem)); + + int info = 0; + phi::funcs::lapackEig>( + 'N', + 'N', + static_cast(n_dim), + a.template data(), + static_cast(n_dim), + output->template data(), + NULL, + 1, + NULL, + 1, + work->template data(), + static_cast(work_mem / sizeof(T)), + rwork->template data>(), + &info); + + std::string name = "phi::backend::dynload::cgeev_"; + if (input.dtype() == DataType::COMPLEX128) { + name = "phi::backend::dynload::zgeev_"; + } + CheckLapackEigResult(info, name); +} + +void SpiltBatchSquareMatrix(const DenseTensor& input, + std::vector* output) { + DDim input_dims = input.dims(); + int last_dim = input_dims.size() - 1; + int n_dim = input_dims[last_dim]; + + DDim flattened_input_dims, flattened_output_dims; + if (input_dims.size() > 2) { + flattened_input_dims = + phi::flatten_to_3d(input_dims, last_dim - 1, last_dim); + } else { + flattened_input_dims = phi::make_ddim({1, n_dim, n_dim}); + } + + DenseTensor flattened_input; + flattened_input.ShareDataWith(input); + flattened_input.Resize(flattened_input_dims); + (*output) = flattened_input.Split(1, 0); +} + +template +void EigvalsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc>(out); + + std::vector x_matrices; + SpiltBatchSquareMatrix(x, /*->*/ &x_matrices); + + int64_t n_dim = x_matrices[0].dims()[1]; + int64_t n_batch = x_matrices.size(); + DDim out_dims = out->dims(); + out->Resize(make_ddim({n_batch, n_dim})); + std::vector out_vectors = out->Split(1, 0); + + // query workspace size + T qwork; + int info; + funcs::lapackEig>('N', + 'N', + static_cast(n_dim), + x_matrices[0].template data(), + static_cast(n_dim), + NULL, + NULL, + 1, + NULL, + 1, + &qwork, + -1, + static_cast*>(NULL), + &info); + int64_t lwork = static_cast(qwork); + + DenseTensor work, rwork; + + work.Resize(make_ddim({lwork})); + ctx.template Alloc(&work); + + if (IsComplexType(x.dtype())) { + rwork.Resize(make_ddim({n_dim << 1})); + ctx.template Alloc>(&rwork); + } + + for (int64_t i = 0; i < n_batch; ++i) { + LapackEigvals( + ctx, x_matrices[i], &out_vectors[i], &work, &rwork); + } + out->Resize(out_dims); +} + +} // namespace phi + +PD_REGISTER_KERNEL(eigvals, + CPU, + ALL_LAYOUT, + phi::EigvalsKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc index 401d2fd158a5d..901c1fed628d3 100644 --- a/paddle/phi/kernels/cpu/einsum_kernel.cc +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL( - einsum, CPU, ALL_LAYOUT, phi::EinsumKernelRaw, float, double) {} +PD_REGISTER_KERNEL(einsum, + CPU, + ALL_LAYOUT, + phi::EinsumKernelRaw, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/identity_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/identity_loss_grad_kernel.cc new file mode 100644 index 0000000000000..f26195b5069b6 --- /dev/null +++ b/paddle/phi/kernels/cpu/identity_loss_grad_kernel.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
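The CPU eigvals kernel above first reshapes the input into a batch of square matrices (SpiltBatchSquareMatrix), then issues the standard LAPACK workspace query by calling lapackEig with lwork = -1 before solving each matrix in turn. The batching step is just shape arithmetic; a standalone sketch of it, assuming the usual convention that every leading dimension is a batch dimension:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// A [..., n, n] input is treated as `batch` independent n x n matrices, where
// batch is the product of all leading dimensions (1 for a plain 2-D input).
struct BatchedSquareView {
  int64_t batch;
  int64_t n;
};

BatchedSquareView FlattenSquareBatch(const std::vector<int64_t>& dims) {
  assert(dims.size() >= 2);
  const int64_t n = dims.back();
  assert(dims[dims.size() - 2] == n);  // last two dims must form a square matrix
  const int64_t batch = std::accumulate(
      dims.begin(), dims.end() - 2, int64_t{1}, std::multiplies<int64_t>());
  return {batch, n};
}

int main() {
  const BatchedSquareView v = FlattenSquareBatch({2, 5, 3, 3});
  assert(v.batch == 10 && v.n == 3);
  return 0;
}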
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/identity_loss_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/mean_all_grad_kernel.h" +#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" + +namespace phi { + +template +void IdentityLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const int reduction, + DenseTensor* x_grad) { + switch (reduction) { + case 0: + // sum + phi::ReduceSumGradKernel( + dev_ctx, x, out_grad, std::vector{0}, false, true, x_grad); + break; + case 1: + // mean + phi::MeanAllGradKernel(dev_ctx, x, out_grad, x_grad); + break; + case 2: + // none + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + break; + default: + // error + PADDLE_THROW(phi::errors::InvalidArgument( + "reduction should be 0, 1 and 2. But get %d", reduction)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(identity_loss_grad, + CPU, + ALL_LAYOUT, + phi::IdentityLossGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/identity_loss_kernel.cc b/paddle/phi/kernels/cpu/identity_loss_kernel.cc new file mode 100644 index 0000000000000..941174eb5b0bd --- /dev/null +++ b/paddle/phi/kernels/cpu/identity_loss_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/identity_loss_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/mean_all_kernel.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" + +namespace phi { + +template +void IdentityLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const int reduction, + DenseTensor* out) { + switch (reduction) { + case 0: + // sum + phi::SumRawKernel( + dev_ctx, x, std::vector{0}, false, true, out->dtype(), out); + break; + case 1: + // mean + phi::MeanAllKernel(dev_ctx, x, out); + break; + case 2: + // none + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + break; + default: + // error + PADDLE_THROW(phi::errors::InvalidArgument( + "reduction should be 0, 1 and 2. 
But get %d", reduction)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + identity_loss, CPU, ALL_LAYOUT, phi::IdentityLossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc index edd41b2c7a31d..dee6e9149ca2d 100644 --- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -1041,28 +1041,43 @@ PD_REGISTER_KERNEL(bilinear_interp_v2_grad, ALL_LAYOUT, phi::BilinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(nearest_interp_v2_grad, CPU, ALL_LAYOUT, phi::NearestInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(trilinear_interp_v2_grad, CPU, ALL_LAYOUT, phi::TrilinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(linear_interp_v2_grad, CPU, ALL_LAYOUT, phi::LinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(bicubic_interp_v2_grad, CPU, ALL_LAYOUT, phi::BicubicInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc index 5259a770568e4..3649185a0c7ee 100644 --- a/paddle/phi/kernels/cpu/interpolate_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc @@ -1193,7 +1193,10 @@ PD_REGISTER_KERNEL(bilinear_interp_v2, phi::BilinearInterpKernel, float, double, - uint8_t) {} + uint8_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(nearest_interp_v2, CPU, ALL_LAYOUT, @@ -1202,24 +1205,36 @@ PD_REGISTER_KERNEL(nearest_interp_v2, double, int, int64_t, - uint8_t) {} + uint8_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(trilinear_interp_v2, CPU, ALL_LAYOUT, phi::TrilinearInterpKernel, float, double, - uint8_t) {} + uint8_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(linear_interp_v2, CPU, ALL_LAYOUT, phi::LinearInterpKernel, float, double, - uint8_t) {} + uint8_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(bicubic_interp_v2, CPU, ALL_LAYOUT, phi::BicubicInterpKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc index 510eb7a6ca97a..0ba4aea78c3ca 100644 --- a/paddle/phi/kernels/cpu/log_softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -116,5 +116,7 @@ void LogSoftmaxKernel(const Context& dev_ctx, } // namespace phi +// TODO(YuanRisheng): The 
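The identity_loss kernels above map reduction = 0/1/2 to sum, mean, and a plain copy, which is also why IdentityLossInferMeta keeps the input shape only for reduction = 2 and returns a single element otherwise. A tiny standalone version of the same dispatch for reference:

#include <cassert>
#include <numeric>
#include <stdexcept>
#include <vector>

// reduction = 0 -> sum, 1 -> mean, 2 -> none (copy through), matching the
// switch in IdentityLossKernel above.
std::vector<double> IdentityLossSketch(const std::vector<double>& x, int reduction) {
  const double sum = std::accumulate(x.begin(), x.end(), 0.0);
  switch (reduction) {
    case 0: return {sum};
    case 1: return {sum / static_cast<double>(x.size())};
    case 2: return x;
    default: throw std::invalid_argument("reduction should be 0, 1 or 2");
  }
}

int main() {
  const std::vector<double> x = {1.0, 2.0, 3.0};
  assert(IdentityLossSketch(x, 0)[0] == 6.0);
  assert(IdentityLossSketch(x, 1)[0] == 2.0);
  assert(IdentityLossSketch(x, 2) == x);
  return 0;
}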
layout of mkldnn kernel should be MKLDNN, we should +// support specifying the exact layout when the kernel is registered PD_REGISTER_KERNEL( log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dist_grad_kernel.cc b/paddle/phi/kernels/cpu/merged_momentum_kernel.cc similarity index 74% rename from paddle/phi/kernels/cpu/dist_grad_kernel.cc rename to paddle/phi/kernels/cpu/merged_momentum_kernel.cc index c1aaa2adf7563..0751711ef64fe 100644 --- a/paddle/phi/kernels/cpu/dist_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/merged_momentum_kernel.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/dist_grad_kernel.h" - #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/merged_momentum_impl.h" -PD_REGISTER_KERNEL( - dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} +PD_REGISTER_KERNEL(merged_momentum, + CPU, + ALL_LAYOUT, + phi::MergedMomentumKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/dist_grad_kernel.cu b/paddle/phi/kernels/cpu/solve_grad_kernel.cc similarity index 65% rename from paddle/phi/kernels/gpu/dist_grad_kernel.cu rename to paddle/phi/kernels/cpu/solve_grad_kernel.cc index df422e8b2daf9..3b11d49259fd6 100644 --- a/paddle/phi/kernels/gpu/dist_grad_kernel.cu +++ b/paddle/phi/kernels/cpu/solve_grad_kernel.cc @@ -12,15 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/dist_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/solve_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/solve_grad_kernel_impl.h" -#ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL(dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float) {} -#else PD_REGISTER_KERNEL( - dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#endif + solve_grad, CPU, ALL_LAYOUT, phi::SolveGradKernel, float, double) {} diff --git a/paddle/phi/ops/compat/digamma_sig.cc b/paddle/phi/kernels/cpu/solve_kernel.cc similarity index 64% rename from paddle/phi/ops/compat/digamma_sig.cc rename to paddle/phi/kernels/cpu/solve_kernel.cc index 6c14dd9bf1744..bde049bcc3ec0 100644 --- a/paddle/phi/ops/compat/digamma_sig.cc +++ b/paddle/phi/kernels/cpu/solve_kernel.cc @@ -12,15 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/kernels/solve_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/solve_kernel_impl.h" -namespace phi { - -KernelSignature DigammaGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("digamma_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(digamma_grad, phi::DigammaGradOpArgumentMapping); +PD_REGISTER_KERNEL(solve, CPU, ALL_LAYOUT, phi::SolveKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/unique_consecutive_functor.h b/paddle/phi/kernels/cpu/unique_consecutive_functor.h new file mode 100644 index 0000000000000..85081e5806933 --- /dev/null +++ b/paddle/phi/kernels/cpu/unique_consecutive_functor.h @@ -0,0 +1,261 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/tensor_util.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +template +static void UniqueConsecutiveFlattenedTensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) { + const InT* in_data = in.data(); + std::vector out_vec(in.numel()); + std::vector inverse_vec(in.numel()); + std::vector counts_vec(in.numel()); + memcpy(out_vec.data(), in_data, in.numel() * sizeof(InT)); + InT* p = out_vec.data(); + int64_t last = 0; + IndexT* q = counts_vec.data(); + for (int64_t i = 0; i < in.numel(); i++) { + if (in_data[i] != *p) { + *(++p) = in_data[i]; + if (return_counts) { + *(q++) = i - last; + last = i; + } + } + if (return_inverse) { + inverse_vec[i] = p - out_vec.data(); + } + } + + int64_t output_size = p - out_vec.data() + 1; + if (return_counts) { + *q = in.numel() - last; + counts_vec.resize(output_size); + } + out_vec.resize(output_size); + + out->Resize(phi::make_ddim({output_size})); + auto* out_data = context.template Alloc(out); + std::copy(out_vec.begin(), out_vec.end(), out_data); + + if (return_inverse) { + inverse->Resize(phi::make_ddim({in.numel()})); + auto* inverse_data = context.template Alloc(inverse); + std::copy(inverse_vec.begin(), inverse_vec.end(), inverse_data); + } + + if (return_counts) { + count->Resize(phi::make_ddim({out->numel()})); + auto* counts_data = context.template Alloc(count); + std::copy(counts_vec.begin(), counts_vec.end(), counts_data); + } +} + +template +struct UniqueConsecutiveFlattenedTensorFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + const bool return_inverse_; + const bool return_counts_; + DenseTensor* inverse_; + DenseTensor* count_; + + UniqueConsecutiveFlattenedTensorFunctor(const Context& context, + const DenseTensor& in, + 
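UniqueConsecutiveFlattenedTensor above is a single pass that keeps the first element of each run of equal values, records the run lengths, and maps every input position to the run it belongs to. The same algorithm on a plain std::vector, stripped of the tensor plumbing, for readers who want to check the expected outputs:

#include <cassert>
#include <cstddef>
#include <vector>

// Single-pass run-length dedup: `out` holds one value per run, `counts` the
// run lengths, and `inverse` the run index of every input element.
void UniqueConsecutiveSketch(const std::vector<int>& in,
                             std::vector<int>* out,
                             std::vector<std::size_t>* inverse,
                             std::vector<std::size_t>* counts) {
  out->clear();
  inverse->clear();
  counts->clear();
  for (std::size_t i = 0; i < in.size(); ++i) {
    if (out->empty() || in[i] != out->back()) {  // a new run starts here
      out->push_back(in[i]);
      counts->push_back(0);
    }
    ++counts->back();
    inverse->push_back(out->size() - 1);  // run that this element belongs to
  }
}

int main() {
  std::vector<int> out;
  std::vector<std::size_t> inv, cnt;
  UniqueConsecutiveSketch({1, 1, 2, 2, 2, 1}, &out, &inv, &cnt);
  assert((out == std::vector<int>{1, 2, 1}));
  assert((cnt == std::vector<std::size_t>{2, 3, 1}));
  assert((inv == std::vector<std::size_t>{0, 0, 1, 1, 1, 2}));
  return 0;
}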
DenseTensor* out, + bool return_inverse, + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) + : ctx_(context), + in_(in), + out_(out), + return_inverse_(return_inverse), + return_counts_(return_counts), + inverse_(inverse), + count_(count) {} + + template + void apply() const { + UniqueConsecutiveFlattenedTensor( + ctx_, in_, out_, return_inverse_, return_counts_, inverse_, count_); + } +}; + +template +static ForwardIt UniqueConsecutiveDimImpl( + const Context& context, + ForwardIt first, + ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, + std::vector* counts_vec) { + if (first == last) { + return last; + } + + (*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!phi::funcs::Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueConsecutiveDim(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + int axis, + DenseTensor* inverse, + DenseTensor* count) { + // transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + DenseTensor in_trans; + DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + phi::funcs::TransCompute( + in.dims().size(), context, in, &in_trans, permute); + // reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2] + DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const InT* in_trans_data = in_trans.data(); + + // sort tensor according to indices + DenseTensor input_sorted; + input_sorted.Resize(in_trans_dims); + context.template Alloc(&input_sorted); + InT* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + static_cast(sorted_indices_vec[i]) * col, + col * sizeof(InT)); + } + std::vector input_unbind = phi::funcs::Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + auto last = UniqueConsecutiveDimImpl::iterator, + InT>(context, + input_unbind.begin(), + input_unbind.end(), + sorted_indices_vec, + &inverse_vec, + &counts_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + + phi::funcs::ConcatFunctor concat_functor; + DenseTensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + concat_functor(context, input_unbind, 0, &out_trans); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); + if (return_inverse) { + paddle::framework::TensorFromVector(inverse_vec, context, inverse); + } + if (return_counts) { + paddle::framework::TensorFromVector(counts_vec, context, count); + } +} + +template +struct UniqueConsecutiveDimFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + const int axis_; + const bool return_inverse_; + const bool return_counts_; + DenseTensor* inverse_; + DenseTensor* count_; + + UniqueConsecutiveDimFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + const int axis, + bool return_inverse, + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) + : ctx_(context), + in_(in), + out_(out), + axis_(axis), + return_inverse_(return_inverse), + return_counts_(return_counts), + inverse_(inverse), + count_(count) {} + + template + void apply() const { + UniqueConsecutiveDim(ctx_, + in_, + out_, + return_inverse_, + return_counts_, + axis_, + inverse_, + count_); + } +}; + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc new file mode 100644 index 0000000000000..86fe53b72c985 --- /dev/null +++ b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unique_consecutive_kernel.h" +#include "paddle/phi/kernels/cpu/unique_consecutive_functor.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/data_type.h" + +namespace phi { + +template +void UniqueConsecutiveKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + DenseTensor* out, + DenseTensor* index, + DenseTensor* counts) { + auto data_type = static_cast(dtype); + if (data_type == paddle::framework::proto::VarType::INT32) { + PADDLE_ENFORCE_LE( + x.numel(), + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + + if (axis.empty()) { + paddle::framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveFlattenedTensorFunctor( + dev_ctx, x, out, return_inverse, return_counts, index, counts)); + } else { + int valid_axis = axis[0]; + paddle::framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveDimFunctor(dev_ctx, + x, + out, + valid_axis, + return_inverse, + return_counts, + index, + counts)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(unique_consecutive, + CPU, + ALL_LAYOUT, + phi::UniqueConsecutiveKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/diag_kernel.h b/paddle/phi/kernels/diag_kernel.h index 8dc919fa63360..3168aea54e697 100644 --- a/paddle/phi/kernels/diag_kernel.h +++ b/paddle/phi/kernels/diag_kernel.h @@ -18,6 +18,26 @@ namespace phi { +/** + * @brief If ``x`` is a vector (1-D tensor), a 2-D square tensor with the + * elements of ``x`` as the diagonal is returned. + * If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal + * elements of ``x`` is returned. + * + * The argument ``offset`` controls the diagonal offset: + * If ``offset`` = 0, it is the main diagonal. + * If ``offset`` > 0, it is superdiagonal. If ``offset`` < 0, + * it is subdiagonal. + * @param ctx device context + * @param x The input tensor. Its shape is either 1-D or 2-D. + * @param offset The diagonal offset. A positive value represents + * superdiagonal, 0 represents the main diagonal, and a + * negative value represents subdiagonal. + * @param padding_value Use this value to fill the area outside the specified + * diagonal band. Only takes effect when the input is a + * 1-D Tensor. The default value is 0. + * @param out The output tensor. A square matrix or a vector. + */ template void DiagKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/digamma_kernel.h b/paddle/phi/kernels/digamma_kernel.h index 3cf1eae67cc3e..b45b7070d2dee 100644 --- a/paddle/phi/kernels/digamma_kernel.h +++ b/paddle/phi/kernels/digamma_kernel.h @@ -18,6 +18,13 @@ namespace phi { +/** + * @brief This kernrel is used to perform elementwise digamma for x. 
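The DiagKernel comment above describes the 1-D case: the vector is placed on the diagonal selected by offset, and padding_value fills everything else. A short standalone sketch of that placement; the square output size of len + |offset| follows the usual diag convention and is an assumption here, not something the comment states:

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <vector>

// Place v on the diagonal chosen by `offset` (0 = main, >0 = superdiagonal,
// <0 = subdiagonal) and fill the remaining entries with padding_value.
std::vector<std::vector<double>> DiagFromVector(const std::vector<double>& v,
                                                int offset,
                                                double padding_value) {
  const std::size_t n = v.size() + static_cast<std::size_t>(std::abs(offset));
  std::vector<std::vector<double>> m(n, std::vector<double>(n, padding_value));
  for (std::size_t i = 0; i < v.size(); ++i) {
    const std::size_t row = offset >= 0 ? i : i + static_cast<std::size_t>(-offset);
    const std::size_t col = offset >= 0 ? i + static_cast<std::size_t>(offset) : i;
    m[row][col] = v[i];
  }
  return m;
}

int main() {
  const auto m = DiagFromVector({1.0, 2.0}, 1, 0.0);  // 3 x 3, values on the first superdiagonal
  assert(m.size() == 3 && m[0][1] == 1.0 && m[1][2] == 2.0 && m[0][0] == 0.0);
  return 0;
}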
+ * $$out = \Psi(x) = \frac{ \Gamma^{'}(x) }{ \Gamma(x) }$$ + * @param ctx device context + * @param x the input tensor of digamma + * @param out the output tensor of digamma + */ template void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc new file mode 100644 index 0000000000000..ba468ad299e4c --- /dev/null +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/dist_grad_kernel.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" +#include "paddle/phi/kernels/p_norm_grad_kernel.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" +#include "paddle/phi/kernels/scale_kernel.h" + +namespace phi { + +std::pair, std::vector> GetReduceDims( + const DDim& src_dim, const DDim& dst_dim) { + std::vector reduce_dims, new_dims; + auto pre_dims = src_dim.size() - dst_dim.size(); + for (auto i = 0; i < pre_dims; ++i) { + reduce_dims.push_back(i); + } + + for (auto i = pre_dims; i < src_dim.size(); ++i) { + if (dst_dim[i - pre_dims] == 1 && src_dim[i] != 1) { + reduce_dims.push_back(i); + } else { + new_dims.push_back(dst_dim[i - pre_dims]); + } + } + return {reduce_dims, new_dims}; +} + +template +void DistGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& out_grad, + float p, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto t = Subtract(dev_ctx, x, y); + DenseTensor x_grad_tmp; + x_grad_tmp.Resize(t.dims()); + DenseTensor y_grad_tmp; + y_grad_tmp.Resize(t.dims()); + PNormGradKernel( + dev_ctx, t, out, out_grad, p, -1, 1e-12, false, true, &x_grad_tmp); + ScaleKernel(dev_ctx, x_grad_tmp, -1.0, 0.0, false, &y_grad_tmp); + // do reduce, the implemetation of cpu SumKernel has bug, it changes + // the dims of output iternally, so we Resize x/y_grad twice. 
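GetReduceDims above decides which axes of the broadcasted gradient must be summed away so that x_grad and y_grad come back in the shapes of x and y: every leading extra axis, plus every axis where the target dimension is 1 but the gradient dimension is not. A standalone copy of that rule with a small check:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// src: dims of the broadcasted gradient; dst: dims of the original input.
// Returns the axes to reduce over and the remaining (pre-Resize) dims.
std::pair<std::vector<int>, std::vector<int64_t>> ReduceDimsSketch(
    const std::vector<int64_t>& src, const std::vector<int64_t>& dst) {
  std::vector<int> reduce_dims;
  std::vector<int64_t> new_dims;
  const int pre = static_cast<int>(src.size() - dst.size());
  for (int i = 0; i < pre; ++i) reduce_dims.push_back(i);  // leading extra axes
  for (int i = pre; i < static_cast<int>(src.size()); ++i) {
    if (dst[i - pre] == 1 && src[i] != 1) {
      reduce_dims.push_back(i);  // broadcast along this axis: sum it away
    } else {
      new_dims.push_back(dst[i - pre]);
    }
  }
  return {reduce_dims, new_dims};
}

int main() {
  const auto r = ReduceDimsSketch({4, 3, 5}, {3, 1});  // grad is [4,3,5], input is [3,1]
  assert((r.first == std::vector<int>{0, 2}));
  assert((r.second == std::vector<int64_t>{3}));
  return 0;
}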
+ auto res_x = GetReduceDims(x_grad_tmp.dims(), x.dims()); + if (!std::get<0>(res_x).empty()) { + x_grad->Resize(phi::make_ddim(std::get<1>(res_x))); + SumKernel( + dev_ctx, x_grad_tmp, std::get<0>(res_x), x.dtype(), false, x_grad); + x_grad->Resize(x.dims()); + } else { + x_grad->ShareBufferWith(x_grad_tmp); + } + auto res_y = GetReduceDims(y_grad_tmp.dims(), y.dims()); + if (!std::get<0>(res_y).empty()) { + y_grad->Resize(phi::make_ddim(std::get<1>(res_y))); + SumKernel( + dev_ctx, y_grad_tmp, std::get<0>(res_y), y.dtype(), false, y_grad); + y_grad->Resize(y.dims()); + } else { + y_grad->ShareBufferWith(y_grad_tmp); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} +#endif diff --git a/paddle/phi/kernels/dist_kernel.cc b/paddle/phi/kernels/dist_kernel.cc new file mode 100644 index 0000000000000..ed1fa0dafe741 --- /dev/null +++ b/paddle/phi/kernels/dist_kernel.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/dist_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" +#include "paddle/phi/kernels/p_norm_kernel.h" + +namespace phi { + +template +void DistKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + float p, + DenseTensor* out) { + auto t = Subtract(dev_ctx, x, y); + PNormKernel(dev_ctx, t, p, -1, 1e-12, false, true, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(dist, CPU, ALL_LAYOUT, phi::DistKernel, float, double) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float, double) {} +#endif diff --git a/paddle/phi/kernels/dist_kernel.h b/paddle/phi/kernels/dist_kernel.h index 6cb3d6e0e8bef..8c1f6674aa5b5 100644 --- a/paddle/phi/kernels/dist_kernel.h +++ b/paddle/phi/kernels/dist_kernel.h @@ -18,6 +18,37 @@ namespace phi { +/** + * @brief Given two tensors x and y, compute Lp-norm of (x-y). + * It is not a norm in a strict sense, only as a measure of distance. + * The shapes of x and y must be broadcastable. Where, z = x - y, + * + * When p = 0, defining $0^0 = 0$, the zero-norm of z is simply + * the number of non-zero elements of z. + * $$ + * ||z||_{0} = \lim_{p \rightarrow 0} \sum_{i=1}^{m} |z_i|^p + * $$ + * + * When p = inf, the inf-norm of z is the maximum element of z. + * $$ + * ||z||_\infty=\max_i |z_i| + * $$ + * + * When p = -inf, the negative-inf-norm of z is the minimum element of z. 
+ * $$ + * ||z||_{-\infty}=\min_i |z_i| + * $$ + * + * Otherwise, the p-norm of z follows the formula, + * $$ + * ||z||_{p} = (\sum_{i=i}^{m} |z_i|^p)^{1/p} + * $$ + * @param ctx device context + * @param x the input Tensor of Dist + * @param y the Right-hand-side input Tensor of Dist + * @param p the norm to be computed + * @param out the output of Dist, which is the p-norm of (x - y) + */ template void DistKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/eigvals_kernel.h b/paddle/phi/kernels/eigvals_kernel.h new file mode 100644 index 0000000000000..dd9f3370bd08e --- /dev/null +++ b/paddle/phi/kernels/eigvals_kernel.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void EigvalsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/erfinv_kernel.h b/paddle/phi/kernels/erfinv_kernel.h index 8380a62971ba4..3ddb1ecbdfd80 100644 --- a/paddle/phi/kernels/erfinv_kernel.h +++ b/paddle/phi/kernels/erfinv_kernel.h @@ -18,6 +18,18 @@ namespace phi { +/** + * @brief This kernel is used to compute inverse error function of x. + * + * The equation is: + * $$erfinv(x) = {ndtri({x \over 2} + 0.5)} \over {\sqrt{2}}$$ + * + * The input `x` can carry the LoD (Level of Details) information, + * or not. 
And the output shares the LoD information with `x` + * @param ctx device context + * @param x the input tensor of erfinv + * @param out the output tensor of erfinv + */ template void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 6d16fc8f81895..25696a34e3e03 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -14,3 +14,4 @@ math_library(matrix_inverse DEPS dense_tensor eigen3 blas) math_library(pooling DEPS dense_tensor) math_library(segment_pooling) math_library(sequence2batch) +math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index db4796b3f61ca..a18ec953d0abd 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1003,12 +1003,6 @@ struct CBlas { #ifdef PADDLE_WITH_MKLML template <> template -T *Blas::GEMM_ALLOC( - const CBLAS_IDENTIFIER id, const int M, const int N, const int K) const { - return CBlas::GEMM_ALLOC(id, M, N, K); -} -template <> -template T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N, @@ -1016,20 +1010,6 @@ T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, return CBlas::GEMM_ALLOC(id, M, N, K); } -template <> -template -void Blas::GEMM_PACK( - const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T *src, - const int ld, - T *dst) const { - CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); -} template <> template void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, @@ -1044,24 +1024,6 @@ void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); } -template <> -template -void Blas::GEMM_COMPUTE( - int transA, - int transB, - int M, - int N, - int K, - const T *A, - const int lda, - const T *B, - const int ldb, - T beta, - T *C, - const int ldc) const { - CBlas::GEMM_COMPUTE( - CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); -} template <> template void Blas::GEMM_COMPUTE(int transA, @@ -1080,11 +1042,6 @@ void Blas::GEMM_COMPUTE(int transA, CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); } -template <> -template -void Blas::GEMM_FREE(T *data) const { - CBlas::GEMM_FREE(data); -} template <> template void Blas::GEMM_FREE(T *data) const { @@ -1092,36 +1049,6 @@ void Blas::GEMM_FREE(T *data) const { } #endif -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1153,36 +1080,6 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, ldc); } -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA == false ? CblasNoTrans : CblasTrans, - transB == false ? 
CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(bool transA, @@ -1214,36 +1111,6 @@ void Blas::GEMM(bool transA, ldc); } -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1323,50 +1190,18 @@ void Blas::MatMul(const phi::DenseTensor &mat_a, mat_out->data()); } -template <> -template -void Blas::AXPY(int n, - T alpha, - const T *x, - T *y) const { - CBlas::AXPY(n, alpha, x, 1, y, 1); -} template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { CBlas::AXPY(n, alpha, x, 1, y, 1); } -template <> -template -void Blas::VCOPY(int n, - const T *x, - T *y) const { - CBlas::VCOPY(n, x, 1, y, 1); -} template <> template void Blas::VCOPY(int n, const T *x, T *y) const { CBlas::VCOPY(n, x, 1, y, 1); } -template <> -template -void Blas::VADD(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VADD(n, x, y, z); -#else - if (x == z) { - this->template AXPY(n, (T)(1.), y, z); - } else { - this->template VCOPY(n, y, z); - this->template AXPY(n, (T)(1.), x, z); - } -#endif -} template <> template void Blas::VADD(int n, const T *x, const T *y, T *z) const { @@ -1382,21 +1217,6 @@ void Blas::VADD(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VSUB(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSUB(n, x, y, z); -#else - // try to find if openblas support vsub - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } -#endif -} template <> template void Blas::VSUB(int n, const T *x, const T *y, T *z) const { @@ -1410,21 +1230,6 @@ void Blas::VSUB(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VMUL(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMUL(n, x, y, z); -#else - // try to find if openblas support vmul - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -#endif -} template <> template void Blas::VMUL(int n, const T *x, const T *y, T *z) const { @@ -1438,21 +1243,6 @@ void Blas::VMUL(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VDIV(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VDIV(n, x, y, z); -#else - // try to find if openblas support vdiv - for (int i = 0; i < n; ++i) { - z[i] = x[i] / y[i]; - } -#endif -} template <> template void Blas::VDIV(int n, const T *x, const T *y, T *z) const { @@ -1466,20 +1256,6 @@ void Blas::VDIV(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VEXP(int n, - const T *x, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VEXP(n, x, y); -#else - // try to find if openblas support vexp - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -#endif -} template <> template void Blas::VEXP(int n, const T *x, T *y) const { @@ -1493,19 +1269,6 @@ void Blas::VEXP(int n, const T *x, T *y) const { #endif } -template <> -template -void Blas::VSQUARE(int n, - const T *x, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSQUARE(n, x, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } 
-#endif -} template <> template void Blas::VSQUARE(int n, const T *x, T *y) const { @@ -1518,20 +1281,6 @@ void Blas::VSQUARE(int n, const T *x, T *y) const { #endif } -template <> -template -void Blas::VPOW(int n, - const T *x, - T a, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VPOW(n, x, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::pow(x[i], a); - } -#endif -} template <> template void Blas::VPOW(int n, const T *x, T a, T *y) const { @@ -1544,22 +1293,6 @@ void Blas::VPOW(int n, const T *x, T a, T *y) const { #endif } -template <> -template -T Blas::DOT(int n, - const T *x, - const T *y) const { -#ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, 1, y, 1); -#else - // try to find if openblas support cblas_dot - T sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] * y[i]; - } - return sum; -#endif -} template <> template T Blas::DOT(int n, const T *x, const T *y) const { @@ -1575,20 +1308,6 @@ T Blas::DOT(int n, const T *x, const T *y) const { #endif } -template <> -template -void Blas::SCAL(int n, - const T a, - T *x) const { -#ifdef PADDLE_WITH_MKLML - CBlas::SCAL(n, a, x, 1); -#else - // try to find if openblas support cblas_scal - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -#endif -} template <> template void Blas::SCAL(int n, const T a, T *x) const { @@ -1602,20 +1321,6 @@ void Blas::SCAL(int n, const T a, T *x) const { #endif } -template <> -template -T Blas::ASUM(int n, T *x, int inc) const { - auto sum = static_cast(0.0); -#ifdef PADDLE_WITH_MKLML - sum = CBlas::ASUM(n, x, inc); -#else - // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum - for (int c = 0; c < n; ++c) { - sum += x[c]; - } -#endif - return sum; -} template <> template T Blas::ASUM(int n, T *x, int inc) const { @@ -1631,19 +1336,6 @@ T Blas::ASUM(int n, T *x, int inc) const { return sum; } -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); -} template <> template void Blas::GEMV(bool trans_a, @@ -1658,66 +1350,6 @@ void Blas::GEMV(bool trans_a, CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - PADDLE_ENFORCE_NOT_NULL( - A, phi::errors::InvalidArgument("Pointer A should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - B, phi::errors::InvalidArgument("Pointer B should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - C, phi::errors::InvalidArgument("Pointer C should not be null.")); -#ifdef PADDLE_WITH_MKLML - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? 
N : K; - int ldc = N; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA]; - b_array[k] = &B[k * strideB]; - c_array[k] = &C[k * M * N]; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - auto *Ak = &A[k * strideA]; - auto *Bk = &B[k * strideB]; - auto *Ck = &C[k * M * N]; - this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); - } -#endif -} template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1778,47 +1410,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { -#ifdef PADDLE_WITH_MKLML - const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1); - const int ldb = (std::max)((transB == CblasNoTrans) ? N : K, 1); - const int ldc = (std::max)(N, 1); - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - A, - &lda, - B, - &ldb, - &beta, - C, - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -#endif -} template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1864,113 +1455,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template -void Blas::BatchedGEMMWithHead( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int W1, - int H1, - int W2, - int H2, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t head_number, - bool split_b_vertical) const { - int lda = (transA == CblasNoTrans) ? W1 : H1; - int ldb = (transB == CblasNoTrans) ? W2 : H2; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - - if (split_b_vertical) { - int ldc = W2; - int sub_width = W2 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W2 / head_number) - : i * (W2 / head_number) * H2; - int sub_matC_offset = i * W2 / head_number; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &sub_width, - &H2, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - - } else { - PADDLE_ENFORCE_EQ( - W1, - H2, - phi::errors::InvalidArgument( - "The fisrt matrix width should be same as second matrix height," - "but received fisrt matrix width %d" - ", second matrix height %d", - W1, - H2)); - int ldc = W2 * head_number; - int sub_width = W1 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? 
i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W1 / head_number) * W2 - : i * (W1 / head_number); - int sub_matC_offset = i * W2; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &W2, - &sub_width, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - } -} -template <> -template void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int W1, @@ -2097,43 +1581,6 @@ void Blas::MatMul( N); } -template <> -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - - // Since the matrix is very small, - // so the unit of calculation is already very fast, - // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, - // use xsmm directly. - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - const T alpha = static_cast(1); - const T beta = static_cast(0); - CBlas::SMM_GEMM( - &transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); - return; -#endif - - CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} template <> template void Blas::MatMul( @@ -2425,20 +1872,6 @@ void Blas::VINV(int n, const T *a, T *y) const { #endif } -template <> -template -void Blas::VMERF(int n, - const T *a, - T *y, - int64_t mode) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMERF(n, a, y, mode); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::erf(a[i]); - } -#endif -} template <> template void Blas::VMERF(int n, const T *a, T *y, int64_t mode) const { @@ -2454,39 +1887,6 @@ void Blas::VMERF(int n, const T *a, T *y, int64_t mode) const { #ifdef PADDLE_WITH_MKLML template <> template -void Blas::CSRMM(const char *transa, - const int *m, - const int *n, - const int *k, - const T *alpha, - const char *matdescra, - const T *val, - const int *indx, - const int *pntrb, - const int *pntre, - const T *b, - const int *ldb, - const T *beta, - T *c, - const int *ldc) const { - CBlas::CSRMM(transa, - m, - n, - k, - alpha, - matdescra, - val, - indx, - pntrb, - pntre, - b, - ldb, - beta, - c, - ldc); -} -template <> -template void Blas::CSRMM(const char *transa, const int *m, const int *n, @@ -2520,22 +1920,6 @@ void Blas::CSRMM(const char *transa, } #endif -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - CBlas::TRSM( - CblasRowMajor, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); -} template <> template void Blas::TRSM(CBLAS_SIDE side, diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index dbcd4016170d5..01701ee287385 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/data_type_transform.h b/paddle/phi/kernels/funcs/data_type_transform.h index ad7f2aa192ce4..72fa94d4ed23d 100644 --- a/paddle/phi/kernels/funcs/data_type_transform.h +++ b/paddle/phi/kernels/funcs/data_type_transform.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/cast_kernel.h" diff --git a/paddle/phi/kernels/funcs/eigen/eigen_function.h b/paddle/phi/kernels/funcs/eigen/eigen_function.h index b971b4f95ef57..1e81256e79e14 100644 --- a/paddle/phi/kernels/funcs/eigen/eigen_function.h +++ b/paddle/phi/kernels/funcs/eigen/eigen_function.h @@ -118,6 +118,18 @@ struct EigenSub { const InType& right); }; +template +struct EigenDiv { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, + OutType out, + const InType& in, + const T value); +}; + template struct EigenSlice { using Array = Eigen::DSizes; diff --git a/paddle/phi/kernels/funcs/eigen/elementwise.cc b/paddle/phi/kernels/funcs/eigen/elementwise.cc index 507a0116c3c20..713513757ad8c 100644 --- a/paddle/phi/kernels/funcs/eigen/elementwise.cc +++ b/paddle/phi/kernels/funcs/eigen/elementwise.cc @@ -55,5 +55,22 @@ struct EigenSub { template struct EigenSub; +template +struct EigenDiv { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, + OutType out, + const InType& in, + const T value) { + out.device(dev) = in / value; + } +}; + +template struct EigenDiv; +template struct EigenDiv; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/eigen/elementwise.cu b/paddle/phi/kernels/funcs/eigen/elementwise.cu index 3855ba8ccf945..1fb3b8a376efa 100644 --- a/paddle/phi/kernels/funcs/eigen/elementwise.cu +++ b/paddle/phi/kernels/funcs/eigen/elementwise.cu @@ -55,5 +55,22 @@ struct EigenSub { template struct EigenSub; +template +struct EigenDiv { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, + OutType out, + const InType& in, + const T value) { + out.device(dev) = in / value; + } +}; + +template struct EigenDiv; +template struct EigenDiv; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h old mode 100644 new mode 100755 index 3e68462c88a5c..ddbbe4b1718f1 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -558,6 +558,9 @@ struct VecSizeGetter { template int GetVectorizedSizeForTensors(const std::vector &ins, const std::vector &outs) { +#ifdef PADDLE_WITH_XPU_KP + int vec_size = 256; +#else using Traits = paddle::platform::FunctionTraits; using ArgsT = typename Traits::ArgsTuple; const int Arity = Traits::arity; @@ -569,6 +572,7 @@ int GetVectorizedSizeForTensors(const std::vector &ins, vec_size = std::min(vec_size, phi::GetVectorizedSize((*iter)->data())); } +#endif 
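+ // In short: XPU KP uses a fixed read length of 256 elements, while other
+ // devices take the largest vector width that every input and output pointer
+ // supports (the minimum of the per-tensor vectorized sizes computed above).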
return vec_size; } @@ -784,7 +788,6 @@ template void LaunchElementwiseCudaKernel(const KPDevice &ctx, const std::vector &ins, std::vector *outs, - int read_lens, Functor func) { // There are at least 1 output, but maybe 0 input (ins.size() == 0). // For large tensor numel * sizeof(T) > 2^31, we must use int64_t as index @@ -800,6 +803,7 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx, #ifdef PADDLE_WITH_XPU_KP int block_size = 64; int grid_size = 8; + int read_lens = kps::details::GetXpuReadLens(numel, block_size, grid_size); auto stream = ctx.x_context()->xpu_stream; int64_t main_offset = (numel / (read_lens * block_size)) * read_lens * block_size; @@ -853,32 +857,20 @@ void ElementwiseKernel(const KPDevice &ctx, } } -#ifdef PADDLE_WITH_XPU_KP - const int buf_size = 256; - int numel = (*outs)[0]->numel(); - int block_size = 64; - int grid_size = 8; - int nthreads = block_size * grid_size; - int read_lens = - std::min(buf_size, kps::details::RoundUpDiv(numel, 32 * nthreads) * 32); - int vec_size = buf_size; -#else // calculate the max vec_size for all ins and outs int vec_size = GetVectorizedSizeForTensors(ins, *outs); - int read_lens = vec_size; -#endif switch (vec_size) { case VecSizeL: LaunchElementwiseCudaKernel( - ctx, ins, outs, read_lens, func); + ctx, ins, outs, func); break; case VecSizeM: LaunchElementwiseCudaKernel( - ctx, ins, outs, read_lens, func); + ctx, ins, outs, func); break; case VecSizeS: LaunchElementwiseCudaKernel( - ctx, ins, outs, read_lens, func); + ctx, ins, outs, func); break; default: { PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/phi/kernels/funcs/fc_functor.cc b/paddle/phi/kernels/funcs/fc_functor.cc index 0fb38c971abf5..0434483be1326 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cc +++ b/paddle/phi/kernels/funcs/fc_functor.cc @@ -96,8 +96,6 @@ void FCFunctor::operator()(const DeviceContext& context, } } -template class FCFunctor; -template class FCFunctor; template class FCFunctor; template class FCFunctor; diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h index bf0888c301fe7..78066ce5b2f5f 100644 --- a/paddle/phi/kernels/funcs/for_range.h +++ b/paddle/phi/kernels/funcs/for_range.h @@ -41,22 +41,6 @@ struct ForRange { size_t limit_; }; -// NOTE: After the pten kernel is migrated, it needs to be deleted. 
-template <> -struct ForRange { - ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(limit) {} - - template - void operator()(Function func) const { - phi::funcs::ForRange for_range(dev_ctx_, limit_); - for_range(func); - } - - const paddle::platform::CPUDeviceContext& dev_ctx_; - size_t limit_; -}; - #if defined(__NVCC__) || defined(__HIPCC__) template diff --git a/paddle/phi/kernels/funcs/gru_compute.cc b/paddle/phi/kernels/funcs/gru_compute.cc index 8cda2e9062ae1..f0c946134906b 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cc +++ b/paddle/phi/kernels/funcs/gru_compute.cc @@ -19,8 +19,8 @@ namespace phi { namespace funcs { template -struct GRUUnitFunctor { - static void compute(const paddle::platform::CPUDeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::CPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -28,8 +28,7 @@ struct GRUUnitFunctor { const phi::funcs::detail::ActivationType active_gate, bool origin_mode) { #if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -46,7 +45,7 @@ struct GRUUnitFunctor { frame_size * 3); } - detail::forward_reset_output( + detail::forward_reset_output( phi::funcs::detail::forward::gru_resetOutput(), value, frame_size, @@ -71,7 +70,7 @@ struct GRUUnitFunctor { frame_size * 3); } - detail::forward_final_output( + detail::forward_final_output( phi::funcs::detail::forward::gru_finalOutput(), value, frame_size, @@ -85,8 +84,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const paddle::platform::CPUDeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::CPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -103,8 +102,7 @@ struct GRUUnitGradFunctor { batch_size, active_node, origin_mode); - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, true, @@ -179,60 +177,6 @@ struct GRUUnitGradFunctor { } }; -template -struct GRUUnitFunctorV2 { - static void compute(const paddle::platform::CPUDeviceContext &context, - GRUMetaValue value, - int frame_size, - int batch_size, - const phi::funcs::detail::ActivationType active_node, - const phi::funcs::detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = - phi::funcs::GetBlas(context); - if (value.prev_out_value) { - blas.GEMM(CblasNoTrans, - CblasTrans, - batch_size, - frame_size, - frame_size, - 1, - value.prev_out_value, - value.state_weight, - 0, - value.reset_output_value); - } - detail::forward_reset_output( - phi::funcs::detail::forward::gru_resetOutput(), - value, - frame_size, - batch_size, - active_gate, - false, - &context); - - T *cell_state_value = value.gate_value + 2 * frame_size; - T *reset_output_value = value.reset_output_value; - for (int b = 0; b < batch_size; ++b) { - blas.VADD( - frame_size, cell_state_value, reset_output_value, cell_state_value); - cell_state_value += frame_size * 3; - reset_output_value += frame_size; - } - - detail::forward_final_output( - phi::funcs::detail::forward::gru_finalOutput(), - value, - frame_size, - batch_size, - active_node, - true, - false, - &context); -#endif - } -}; - template struct GRUUnitFunctorV2 { static void compute(const CPUContext &context, @@ -286,131 +230,6 @@ 
struct GRUUnitFunctorV2 { } }; -template -struct GRUUnitGradFunctorV2 { - static void compute(const paddle::platform::CPUDeviceContext &context, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - const phi::funcs::detail::ActivationType active_node, - const phi::funcs::detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) - // calculate grad_update_gate, grad_frame_state, - // grad_reset_output, grad_reset_gate - detail::cpu_gru_backward(context, - phi::funcs::detail::backward::gru(), - value, - grad, - frame_size, - batch_size, - active_node, - active_gate); - auto blas = - phi::funcs::GetBlas(context); - if (grad.prev_out_grad && value.prev_out_value) { - // update prev_out_grad - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.gate_grad, - frame_size * 3, - value.gate_weight, - frame_size, - 1, - grad.prev_out_grad, - frame_size); - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.gate_grad + frame_size, - frame_size * 3, - value.gate_weight + frame_size * frame_size, - frame_size, - 1, - grad.prev_out_grad, - frame_size); - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.reset_output_grad, - frame_size, - value.state_weight, - frame_size, - 1, - grad.prev_out_grad, - frame_size); - // update weight_hh_grad - if (grad.gate_weight_grad) { - // reset gate - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.gate_grad, - frame_size * 3, - value.prev_out_value, - frame_size, - 1, - grad.gate_weight_grad, - frame_size); - // update gate - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.gate_grad + frame_size, - frame_size * 3, - value.prev_out_value, - frame_size, - 1, - grad.gate_weight_grad + frame_size * frame_size, - frame_size); - // cell state - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.reset_output_grad, - frame_size, - value.prev_out_value, - frame_size, - 1, - grad.state_weight_grad, - frame_size); - } - } - // update bias_hh_grad - T *gate_grad = grad.gate_grad; - T *bias_hh_grad = grad.bias_hh_grad; - T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size; - T *reset_output_grad = grad.reset_output_grad; - for (int b = 0; b < batch_size; ++b) { - blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad); - blas.VADD( - frame_size, state_bias_grad, reset_output_grad, state_bias_grad); - gate_grad += 3 * frame_size; - reset_output_grad += frame_size; - } -#endif - } -}; - template struct GRUUnitGradFunctorV2 { static void compute(const CPUContext &context, @@ -535,16 +354,10 @@ struct GRUUnitGradFunctorV2 { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; - -template struct GRUUnitFunctorV2; -template struct GRUUnitFunctorV2; -template struct GRUUnitGradFunctorV2; -template struct GRUUnitGradFunctorV2; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; template struct GRUUnitFunctorV2; template struct GRUUnitFunctorV2; diff --git a/paddle/phi/kernels/funcs/lstm_compute.cc b/paddle/phi/kernels/funcs/lstm_compute.cc index 45d0b2e40b4f3..e4b8a6961fd7e 100644 --- a/paddle/phi/kernels/funcs/lstm_compute.cc +++ b/paddle/phi/kernels/funcs/lstm_compute.cc @@ -21,38 +21,6 @@ limitations under the License. 
*/ namespace phi { namespace funcs { -template -struct LstmUnitFunctor { - static void compute(const paddle::platform::CPUDeviceContext& context, - LstmMetaValue value, - int frame_size, - int batch_size, - T cell_clip, - const phi::funcs::detail::ActivationType& gate_act, - const phi::funcs::detail::ActivationType& cell_act, - const phi::funcs::detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_forward(context, - phi::funcs::detail::forward::lstm(), - value, - frame_size, - cell_clip, - cand_act, - gate_act, - cell_act, - old_api_version); - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - } - } -}; - template struct LstmUnitFunctor { static void compute(const CPUContext& context, @@ -85,49 +53,6 @@ struct LstmUnitFunctor { } }; -template -struct LstmUnitGradFunctor { - static void compute(const paddle::platform::CPUDeviceContext& context, - LstmMetaValue value, - LstmMetaGrad grad, - int frame_size, - int batch_size, - T cell_clip, - const phi::funcs::detail::ActivationType& gate_act, - const phi::funcs::detail::ActivationType& cell_act, - const phi::funcs::detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_backward(context, - phi::funcs::detail::backward::lstm(), - value, - grad, - frame_size, - cell_clip, - cand_act, - gate_act, - cell_act, - old_api_version); - - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - - grad.gate_grad += frame_size * 4; - grad.state_grad += frame_size; - grad.state_active_grad += frame_size; - grad.output_grad += frame_size; - if (grad.prev_state_grad) { - grad.prev_state_grad += frame_size; - } - } - } -}; - template struct LstmUnitGradFunctor { static void compute(const CPUContext& context, @@ -171,11 +96,6 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - template class LstmUnitFunctor; template class LstmUnitFunctor; template class LstmUnitGradFunctor; diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 033c50e537da6..15a708f02f497 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -39,22 +39,6 @@ namespace funcs { using float16 = phi::dtype::float16; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -85,46 +69,20 @@ template struct SetConstant>; #endif -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct 
Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ template struct Transpose, RANK>; DEFINE_CPU_TRANS(1); @@ -163,8 +121,7 @@ void TransposeNormal::operator()( } // define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal; \ +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ template struct TransposeNormal DEFINE_CPU_TRANS_NORMAL(phi::dtype::float16); @@ -257,7 +214,8 @@ void set_constant_with_place( phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); } -struct TensorSetConstantWithPlace : public boost::static_visitor { +struct TensorSetConstantWithPlace + : public std::unary_function { TensorSetConstantWithPlace(const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) @@ -291,9 +249,32 @@ void set_constant(const paddle::platform::DeviceContext& context, #endif } +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(phi::CPUContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst) { + auto in = paddle::framework::EigenVector::Flatten(src); + auto out = paddle::framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; +template struct ElementwiseAddTo; + template -struct RowwiseAdd { - void operator()(const paddle::platform::CPUDeviceContext& context, +struct RowwiseAdd { + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& input, const paddle::framework::Tensor& vector, paddle::framework::Tensor* output) { @@ -330,44 +311,8 @@ struct RowwiseAdd { } }; -template struct RowwiseAdd; -template struct RowwiseAdd; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -template struct RowwiseMean; -template struct RowwiseMean; - -template -struct ElementwiseAddTo { - void operator()(paddle::platform::CPUDeviceContext* ctx, - const paddle::framework::Tensor& src, - paddle::framework::Tensor* dst) { - auto in = paddle::framework::EigenVector::Flatten(src); - auto out = paddle::framework::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } -}; - -template struct ElementwiseAddTo; -template struct ElementwiseAddTo; +template 
struct RowwiseAdd; +template struct RowwiseAdd; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/math_function_impl.h b/paddle/phi/kernels/funcs/math_function_impl.h index 7c337e6c0dba9..f9055fb56c913 100644 --- a/paddle/phi/kernels/funcs/math_function_impl.h +++ b/paddle/phi/kernels/funcs/math_function_impl.h @@ -92,9 +92,9 @@ void ColwiseSum::operator()( // colwise-sum can be easily implemented. General reduce has a huge overhead in // CPU template -class ColwiseSum { +class ColwiseSum { public: - void operator()(const paddle::platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto& in_dims = input.dims(); @@ -155,9 +155,9 @@ void RowwiseMean::operator()( // rowwise-sum can be easily implemented. General reduce has a huge overhead in // CPU template -class RowwiseMean { +class RowwiseMean { public: - void operator()(const paddle::platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto& in_dims = input.dims(); @@ -222,9 +222,9 @@ void RowwiseSum::operator()( // rowwise-sum can be easily implemented. General reduce has a huge overhead in // CPU template -class RowwiseSum { +class RowwiseSum { public: - void operator()(const paddle::platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto& in_dims = input.dims(); diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc index c95e97f8ea81a..c316970e6a560 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cc @@ -29,9 +29,5 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, template class MatrixInverseFunctor; template class MatrixInverseFunctor; -// TODO(chenweihang): remove these instantiations later -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_solve.cc b/paddle/phi/kernels/funcs/matrix_solve.cc new file mode 100644 index 0000000000000..31baedb3c314d --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_solve.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/funcs/matrix_solve.h" + +namespace phi { +namespace funcs { + +template +void MatrixSolveFunctor::operator()(const Context& dev_ctx, + const DenseTensor& a, + const DenseTensor& b, + DenseTensor* out) { + compute_solve_eigen(dev_ctx, a, b, out); +} + +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_solve.cu b/paddle/phi/kernels/funcs/matrix_solve.cu new file mode 100644 index 0000000000000..fccceb7e20d2d --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_solve.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/funcs/matrix_solve.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +template +void MatrixSolveFunctor::operator()(const Context& context, + const DenseTensor& a, + const DenseTensor& b, + DenseTensor* out) { +#ifndef PADDLE_WITH_HIP + + // solve the equation: Ax = B, + // use cuBlas cublasgetrfBatched funcion to performs the LU + // factorization of each matrix A, + // and then use cuBlas cublasgetriBatched function to solve the + // equation after LU factorization. + // ref: + // https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched + const auto& a_dims = a.dims(); + const int a_rank = a_dims.size(); + int n = a_dims[a_rank - 1]; + int lda = n; + int batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; + + const auto& b_dims = b.dims(); + const int b_rank = b_dims.size(); + int nrhs = b_dims[b_rank - 1]; + int ldb = b_dims[b_rank - 2]; + + // make sure the out dims is right + out->Resize(b_dims); + + context.template Alloc(out); + + // copy input A to a temporary tensor tmp_a, + // LU factorization, written back to original matrix A, so in the beginning, + // it's necessary to create a temporary tensor tmp_a. + DenseTensor tmp_a(a.dtype()); + tmp_a.Resize(a.dims()); + + context.template Alloc(&tmp_a); + paddle::framework::TensorCopy(a, context.GetPlace(), &tmp_a); + + // copy input B to a temporary tensor tmp_b, and transpose tmp_b, + // because cuBlas assumes column-major while Paddle uses row-majar. + DenseTensor tmp_b(b.type()); + const auto& new_dims_vec = getNewDimsVec(b_dims); + tmp_b.Resize(phi::make_ddim(new_dims_vec)); + context.template Alloc(&tmp_b); + phi::funcs::TransposeNormal trans; + std::vector new_axis = getNewAxis(b_rank); + trans(context, b, &tmp_b, new_axis); + + const T* a_data_in_gpu = tmp_a.data(); + const T* b_data_in_gpu = tmp_b.data(); + + std::vector cpu_ptrs(batch_size * 2); + for (int i = 0; i < batch_size; ++i) { + cpu_ptrs[i] = a_data_in_gpu + i * n * n; + cpu_ptrs[i + batch_size] = b_data_in_gpu + i * n * nrhs; + } + + // Copy the addresses of A and tmp_b from host to device. 
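+ // cuBLAS batched routines take a device-resident array of per-matrix
+ // pointers, so the pointer table built in cpu_ptrs is staged into one device
+ // allocation and handed to BatchedGETRF/BatchedGETRS below.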
+ paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Copy(context.GetPlace(), + tmp_gpu_ptrs_data->ptr(), + phi::CPUPlace(), + static_cast(cpu_ptrs.data()), + cpu_ptrs.size() * sizeof(T*), + context.stream()); + + T** gpu_tmp_b_ptrs = + reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; + + // Allocate device memory for BatchedGETRF's info and pivots. + int num_ints = n < 32 ? batch_size : batch_size * (n + 1); + paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = + paddle::memory::Alloc(context, num_ints * sizeof(int)); + int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); + + auto blas = phi::funcs::GetBlas(context); + + // only for singular checking + std::vector info; + info.resize(batch_size); + + int* gpu_pivot_ptr = + reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; + + // This function performs the LU factorization of each matrix A by the + // equation A = L * U. L and U are written back to original matrix A, + // and diagonal elements of L are discarded. + blas.BatchedGETRF(n, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + gpu_pivot_ptr, + gpu_info_ptr, + batch_size); + + // check whether BatchedGETRF is executed successfully or not + paddle::memory::Copy(phi::CPUPlace(), + info.data(), + context.GetPlace(), + gpu_info_ptr, + sizeof(int) * batch_size, + context.stream()); + for (int i = 0; i < batch_size; ++i) { + PADDLE_ENFORCE_EQ(info[i], + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: U(%d, %d) is zero, singular U. " + "Please check the matrix value and change it to a " + "non-singular matrix", + i, + info[i], + info[i])); + } + + // hold the result code from BatchedGETRS + int host_info = 0; + + // to solve the equation after LU factorization + CBLAS_TRANSPOSE transA = CblasTrans; + blas.BatchedGETRS(transA, + n, + nrhs, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + lda, + gpu_pivot_ptr, + gpu_tmp_b_ptrs, + ldb, + &host_info, + batch_size); + + // check whether BatchedGETRS is executed successfully or not + PADDLE_ENFORCE_EQ(host_info, + 0, + phi::errors::InvalidArgument( + "The [%d]'th argument to cublas*getrsBatched had " + "an illegal value.", + -host_info)); + + // transpose tmp_b to get the final result in row-major form. + phi::funcs::TransposeNormal trans2; + trans2(context, tmp_b, out, new_axis); + +#else + compute_solve_eigen(context, a, b, out); +#endif +} + +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; + +// TODO(wuweilong): remove these instantiations later +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/phi/kernels/funcs/matrix_solve.h similarity index 61% rename from paddle/fluid/operators/math/matrix_solve.h rename to paddle/phi/kernels/funcs/matrix_solve.h index 6852d04e5a7e9..3856c06c1b25f 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/phi/kernels/funcs/matrix_solve.h @@ -18,18 +18,79 @@ limitations under the License. 
*/ #include "Eigen/Core" #include "Eigen/LU" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { -namespace math { - -template -void compute_solve_eigen(const DeviceContext& context, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* out) { +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace funcs { + +// for TransposeNormal +static std::vector getNewAxis(const int b_rank) { + std::vector axis_1 = {0}; + std::vector axis_2 = {1, 0}; + std::vector axis_3 = {0, 2, 1}; + std::vector axis_4 = {0, 1, 3, 2}; + std::vector axis_5 = {0, 1, 2, 4, 3}; + std::vector axis_6 = {0, 1, 2, 3, 5, 4}; + std::vector axis_7 = {0, 1, 2, 3, 4, 6, 5}; + std::vector axis_8 = {0, 1, 2, 3, 4, 5, 7, 6}; + std::vector axis_9 = {0, 1, 2, 3, 4, 5, 6, 8, 7}; + switch (b_rank) { + case 1: + return axis_1; + break; + case 2: + return axis_2; + break; + case 3: + return axis_3; + break; + case 4: + return axis_4; + break; + case 5: + return axis_5; + break; + case 6: + return axis_6; + break; + case 7: + return axis_7; + break; + case 8: + return axis_8; + break; + default: + return axis_9; + } +} + +// for Resize +static std::vector getNewDimsVec(const DDim& b_dims) { + std::vector b_dims_vec = phi::vectorize(b_dims); + int size = b_dims_vec.size(); + if (size >= 2) { + // swap the last 2 elements in b_dims_vec + int64_t temp = b_dims_vec[size - 1]; + b_dims_vec[size - 1] = b_dims_vec[size - 2]; + b_dims_vec[size - 2] = temp; + return b_dims_vec; + } + PADDLE_ENFORCE_NE( + b_dims_vec.empty(), + true, + phi::errors::PreconditionNotMet( + "The size of tensor b must not be %d after getting new dims", 0)); + // if b_dims_vec.size() == 1, just retun original vec + return b_dims_vec; +} + +template +void compute_solve_eigen(const Context& context, + const DenseTensor& a, + const DenseTensor& b, + DenseTensor* out) { using Matrix = Eigen::Matrix; using EigenMatrixMap = Eigen::Map; @@ -51,7 +112,7 @@ void compute_solve_eigen(const DeviceContext& context, const T* b_ptr = b.data(); out->Resize(b_mat_dims); // make sure the out dims is right - T* out_ptr = out->mutable_data(context.GetPlace()); + T* out_ptr = context.template Alloc(out); if (a_batch_size == b_batch_size) { for (int i = 0; i < a_batch_size; ++i) { ConstEigenMatrixMap a_mat(a_ptr + i * n * n, n, n); @@ -63,13 +124,13 @@ void compute_solve_eigen(const DeviceContext& context, PADDLE_ENFORCE_GT( min_abs_pivot, static_cast(0), - platform::errors::InvalidArgument("Input is not invertible.")); + phi::errors::InvalidArgument("Input is not invertible.")); out_mat.noalias() = lu.solve(b_mat); } } else { PADDLE_ENFORCE_EQ(a_batch_size, b_batch_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "All input tensors must have the same rank.")); } } @@ -114,22 +175,21 @@ void SolveLinearSystem(T* matrix_data, lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff(); PADDLE_ENFORCE_GT(min_abs_piv, Treal(0), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Something's wrong with SolveLinearSystem. 
")); output = lu_decomposition.solve(input_rhs); } } -template +template class MatrixSolveFunctor { public: - void operator()(const DeviceContext& context, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* out); + void operator()(const Context& context, + const DenseTensor& a, + const DenseTensor& b, + DenseTensor* out); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index a036f27cc2b80..831e0ca907b3c 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -395,7 +395,6 @@ void SelectKernel(const KPDevice &dev_ctx, paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); // 1.1 get stored data num of per block - int total_true_num = 0; // init const int kVecSize = 4; #ifdef PADDLE_WITH_XPU_KP int block = 64; @@ -424,6 +423,7 @@ void SelectKernel(const KPDevice &dev_ctx, DenseTensor cumsum_mem = phi::Empty(dev_ctx, dims_array); CT *cumsum_data = cumsum_mem.data(); // 2.2 get prefix of count_data for real out_index + CT total_true_num = static_cast(0); // init const int kCumVesize = 2; const int block_c = 256; const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c)); @@ -448,7 +448,7 @@ void SelectKernel(const KPDevice &dev_ctx, if (SelectData == 1) { out->Resize(phi::make_ddim(out_dim)); } else if (SelectData == 0) { // == 0 where_index - out_dim.push_back(rank); + out_dim.push_back(static_cast(rank)); out->Resize(phi::make_ddim(out_dim)); } auto out_data = out->mutable_data(cuda_place); diff --git a/paddle/phi/kernels/funcs/sequence2batch.cc b/paddle/phi/kernels/funcs/sequence2batch.cc index 0d75ba877db5e..7cad5b6c0b929 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.cc +++ b/paddle/phi/kernels/funcs/sequence2batch.cc @@ -18,9 +18,9 @@ namespace phi { namespace funcs { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const paddle::platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& src, paddle::framework::Vector index_lod, paddle::framework::Tensor* dst, @@ -68,18 +68,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 3d92674c92d6e..9f7be26857bdb 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -298,6 +298,7 @@ class CuSparseDnVecDescriptor { cusparseDnVecDescr_t descriptor_; }; +/************* SPARSE*DENSE->DENSE MATMUL ************/ template <> template void SparseBlas::SPMM(bool transa, @@ -345,6 +346,7 @@ void SparseBlas::SPMM(bool transa, }); } +/************* SPARSE*DENSE->DENSE MV ************/ template <> template void SparseBlas::SPMV(bool transa, @@ -389,6 +391,7 @@ void 
SparseBlas::SPMV(bool transa, }); } +/************* DENSE*DENSE->SPARSE MATMUL ************/ #if CUDA_VERSION >= 11030 template <> template diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 59aa4cf597e86..b20e8610fefaf 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -265,6 +265,106 @@ void AdamDenseKernel(const Context& dev_ctx, } } +template +void MergedAdamKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + MPDType beta1_ = beta1.to(); + MPDType beta2_ = beta2.to(); + MPDType epsilon_ = epsilon.to(); + + size_t param_num = param.size(); + + for (size_t idx = 0; idx < param_num; idx++) { + const MPDType* master_in_data = + multi_precision ? master_param.get()[idx]->data() : nullptr; + MPDType* master_out_data = + multi_precision ? dev_ctx.template Alloc(master_param_out[idx]) + : nullptr; + + // update param and moment + int threads = 512; + int blocks = (param[idx]->numel() + threads - 1) / threads; + + if (beta1_pow[idx]->place() == CPUPlace() && + beta2_pow[idx]->place() == CPUPlace()) { + // Compute with betapow in REG + AdamKernelREG<<>>( + beta1_, + beta2_, + epsilon_, + *beta1_pow[idx]->data(), + *beta2_pow[idx]->data(), + moment1[idx]->data(), + dev_ctx.template Alloc(moment1_out[idx]), + moment2[idx]->data(), + dev_ctx.template Alloc(moment2_out[idx]), + learning_rate[idx]->data(), + grad[idx]->data(), + param[idx]->data(), + dev_ctx.template Alloc(param_out[idx]), + master_in_data, + master_out_data, + param[idx]->numel()); + if (!use_global_beta_pow) { + // Cpu update + dev_ctx.template HostAlloc(beta1_pow_out[idx])[0] = + beta1_ * beta1_pow[idx]->data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out[idx])[0] = + beta2_ * beta2_pow[idx]->data()[0]; + } + } else { + AdamKernelMEM<<>>( + beta1_, + beta2_, + epsilon_, + beta1_pow[idx]->data(), + beta2_pow[idx]->data(), + moment1[idx]->data(), + dev_ctx.template Alloc(moment1_out[idx]), + moment2[idx]->data(), + dev_ctx.template Alloc(moment2_out[idx]), + learning_rate[idx]->data(), + grad[idx]->data(), + param[idx]->data(), + dev_ctx.template Alloc(param_out[idx]), + master_in_data, + master_out_data, + param[idx]->numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1_, + beta2_, + beta1_pow[idx]->data(), + beta2_pow[idx]->data(), + dev_ctx.template Alloc(beta1_pow_out[idx]), + dev_ctx.template Alloc(beta2_pow_out[idx])); + } + } + } +} + } // namespace phi PD_REGISTER_KERNEL(adam, @@ -279,3 +379,15 @@ PD_REGISTER_KERNEL(adam, kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); } + +PD_REGISTER_KERNEL(merged_adam, + GPU, + ALL_LAYOUT, + phi::MergedAdamKernel, + float, + double, + phi::dtype::float16) { + // Skip beta1_pow, beta2_pow data transform + 
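+ // (these are inputs 5 and 6 of merged_adam, so they are allowed to stay on
+ // the CPU; the kernel branches on their place and updates them either on the
+ // host or on the device)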
kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu index 8abc6b272c511..fa6a8fce0bf86 100644 --- a/paddle/phi/kernels/gpu/allclose_kernel.cu +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/allclose_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/as_real_kernel.cu similarity index 63% rename from paddle/phi/kernels/gpu/dist_kernel.cu rename to paddle/phi/kernels/gpu/as_real_kernel.cu index 095110c252978..63227e7f0b1d8 100644 --- a/paddle/phi/kernels/gpu/dist_kernel.cu +++ b/paddle/phi/kernels/gpu/as_real_kernel.cu @@ -12,16 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/dist_kernel.h" +#include "paddle/phi/kernels/as_real_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/dist_kernel_impl.h" +#include "paddle/phi/kernels/impl/as_real_impl.h" -#ifdef PADDLE_WITH_HIP -// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 -// do not support double in HIPCC platform (Eigen3 to be fixed) -PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float) {} -#else -PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float, double) {} -#endif +PD_REGISTER_KERNEL(as_real, GPU, ALL_LAYOUT, phi::AsRealKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index b23b119342d68..0f028f42a956c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -591,10 +591,12 @@ void BatchNormGradRawKernel(const Context &ctx, // ctx.GetPlace()), // epsilon, saved_mean_data, saved_var_data)); #else - // CUDNN PER_ACTIVATION mode only support small batch size + // CUDNN only support small batch size const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; const bool use_native_kernel = - (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); if (use_native_kernel) { if (compute_format == DataLayout::kNCHW) { BNBackward diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 702722591553f..61694db7e8ed3 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -31,6 +31,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/gpu/batch_norm_utils.h" #ifdef __HIPCC__ @@ -137,6 +138,398 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } +template +__device__ __forceinline__ void merge_block_vertical( + BatchNormParamType x_sum, + BatchNormParamType x_square_sum, + BatchNormParamType *smem_sum, + BatchNormParamType *smem_square_sum, + BatchNormParamType *x_sum_out, + BatchNormParamType *x_square_sum_out) { + int 
tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.y == 0) { + *x_sum_out = x_sum; + *x_square_sum_out = x_square_sum; + } +} + +template +__device__ __forceinline__ void merge_block_horizonal( + BatchNormParamType x_sum, + BatchNormParamType x_square_sum, + BatchNormParamType *smem_sum, + BatchNormParamType *smem_square_sum, + BatchNormParamType *x_sum_out, + BatchNormParamType *x_square_sum_out) { + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.x < offset) { + int pair_tid = tid + offset; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.x == 0) { + *x_sum_out = x_sum; + *x_square_sum_out = x_square_sum; + } +} + +template +static __global__ void BNForwardTraining2DChannelLastCompStat( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *global_mean, + BatchNormParamType *global_variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var, + BatchNormParamType *block_data_ptr, + int *flag_ptr) { + int outer_size = C; + int inner_size = N * HxW; + + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + int outer_loop_stride = gridDim.x * blockDim.x; + int inner_loop_stride = gridDim.y * blockDim.y; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += inner_loop_stride) { + const int index = j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + + // vertical block sum + merge_block_vertical(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); + + if (gridDim.y > 1) { + volatile BatchNormParamType *staging_sum = block_data_ptr; + volatile BatchNormParamType *staging_square_sum = + &block_data_ptr[C * gridDim.y]; + // write block data to global memory + if (threadIdx.y == 0) { + staging_sum[i + blockIdx.y * C] = x_sum; + staging_square_sum[i + blockIdx.y * C] = x_square_sum; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.x], 1); + is_last_block_done = (old == (gridDim.y - 1)); + } + + __syncthreads(); + + if (is_last_block_done) { + x_sum = static_cast>(0); + x_square_sum = static_cast>(0); + // thread sum + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + x_sum += staging_sum[i + y * C]; + x_square_sum += 
staging_square_sum[i + y * C]; + } + + // vertical block sum + merge_block_vertical(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); + + // final compute + if (threadIdx.y == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } else { + if (blockIdx.y == 0 && threadIdx.y == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } +} + +template +static __global__ void BNForwardTraining2DChannelLastWriteRes( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + T *y, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var) { + int outer_size = C; + int inner_size = N * HxW; + + int outer_loop_stride = gridDim.x * blockDim.x; + int inner_loop_stride = gridDim.y * blockDim.y; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType mean_val = compute_mean[i]; + BatchNormParamType inv_var_val = compute_inv_var[i]; + BatchNormParamType scale_val = scale[i]; + BatchNormParamType bias_val = bias[i]; + + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += inner_loop_stride) { + const int index = j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale_val * x_sub_mean * inv_var_val + bias_val; + } + } +} + +template +static __global__ void BNForwardTraining2DCompStat( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *global_mean, + BatchNormParamType *global_variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var, + BatchNormParamType *block_data_ptr, + int *flag_ptr) { + int outer_size = C; + int inner_size = N * HxW; + + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + int outer_loop_stride = gridDim.y * blockDim.y; + int inner_loop_stride = gridDim.x * blockDim.x; + + for (int i = blockIdx.y * blockDim.y + 
threadIdx.y; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; + j += inner_loop_stride) { + const int index = (j / HxW * C + i) * HxW + j % HxW; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + + // horizonal block sum + merge_block_horizonal(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); + + if (gridDim.x > 1) { + volatile BatchNormParamType *staging_sum = block_data_ptr; + volatile BatchNormParamType *staging_square_sum = + &block_data_ptr[C * gridDim.x]; + // write block data to global memory + if (threadIdx.x == 0) { + staging_sum[i + blockIdx.x * C] = x_sum; + staging_square_sum[i + blockIdx.x * C] = x_square_sum; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.y], 1); + is_last_block_done = (old == (gridDim.x - 1)); + } + + __syncthreads(); + + if (is_last_block_done) { + x_sum = static_cast>(0); + x_square_sum = static_cast>(0); + // thread sum + for (int x = threadIdx.x; x < gridDim.x; x += blockDim.x) { + x_sum += staging_sum[i + x * C]; + x_square_sum += staging_square_sum[i + x * C]; + } + + // horizonal block sum + merge_block_horizonal(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); + + // final compute + if (threadIdx.x == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } else { + if (blockIdx.x == 0 && threadIdx.x == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } +} + +template +static __global__ void BNForwardTraining2DWriteRes( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + T *y, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var) { + int outer_size = C; + int inner_size = N * HxW; + + int outer_loop_stride = gridDim.y * blockDim.y; + int inner_loop_stride = gridDim.x * blockDim.x; + + for 
(int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType mean_val = compute_mean[i]; + BatchNormParamType inv_var_val = compute_inv_var[i]; + BatchNormParamType scale_val = scale[i]; + BatchNormParamType bias_val = bias[i]; + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; + j += inner_loop_stride) { + const int index = (j / HxW * C + i) * HxW + j % HxW; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale_val * x_sub_mean * inv_var_val + bias_val; + } + } +} + template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -515,17 +908,63 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - // CUDNN PER_ACTIVATION mode only support small batch size const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; const bool use_native_kernel = - (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); if (use_native_kernel) { - const int block = 512; - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining + dim3 block; + dim3 grid; + const int block_size = 512; + const int MAX_GRID_SIZE = 128; + const int WARP_SIZE = 32; + + // init intermediate storage + DenseTensor block_data_tensor; + DenseTensor flag_tensor; + DenseTensor compute_mean_tensor = + phi::Empty, Context>(ctx, {C}); + DenseTensor compute_inv_var_tensor = + phi::Empty, Context>(ctx, {C}); + + BatchNormParamType *block_data_ptr = nullptr; + int *flag_ptr = nullptr; + + if (x_dims.size() != 2 && compute_format == DataLayout::kNCHW) { + // init block&grid config + int block_x = + std::min(phi::funcs::details::GetLastPow2(H * W * D), block_size); + int block_y = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_x); + + if (block_x * block_y != block_size) { + block_x = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_y); + } + + int grid_x = + std::min((N * H * W * D + block_x * 16 - 1) / (block_x * 16), + MAX_GRID_SIZE); + int grid_y = (C + block_y - 1) / block_y; + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + if (grid.x > 1) { + block_data_tensor = phi::Empty, Context>( + ctx, {2 * C * grid.x}); + flag_tensor = phi::Empty(ctx, {grid.y}); + + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); + } + BNForwardTraining2DCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -539,9 +978,54 @@ void BatchNormKernel(const Context &ctx, mean_out->template data>(), variance_out->template data>(), saved_mean->template data>(), - saved_variance->template data>()); + saved_variance->template data>(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>(), + block_data_ptr, + flag_ptr); + + BNForwardTraining2DWriteRes<<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } else { - BNForwardTraining + // init block&grid 
config + int block_x = + std::min(phi::funcs::details::GetLastPow2(C), WARP_SIZE); + int block_y = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_x); + if (block_x * block_y != block_size) { + block_x = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_y); + } + int grid_x = (C + block_x - 1) / block_x; + int grid_y = + std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), + MAX_GRID_SIZE); + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + if (grid.y > 1) { + block_data_tensor = phi::Empty, Context>( + ctx, {2 * C * grid.y}); + flag_tensor = phi::Empty(ctx, {grid.x}); + + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); + } + BNForwardTraining2DChannelLastCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -555,7 +1039,23 @@ void BatchNormKernel(const Context &ctx, mean_out->template data>(), variance_out->template data>(), saved_mean->template data>(), - saved_variance->template data>()); + saved_variance->template data>(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>(), + block_data_ptr, + flag_ptr); + + BNForwardTraining2DChannelLastWriteRes + <<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } } else { #if CUDNN_VERSION_MIN(7, 4, 1) diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu index 450b32291c4bc..e9fd5e1fa5834 100644 --- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu @@ -31,3 +31,8 @@ PD_REGISTER_KERNEL(real_grad, phi::RealGradKernel, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL( + complex_grad, GPU, ALL_LAYOUT, phi::ComplexGradKernel, float, double) { + kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index d0ee78202b060..5c5bf104128d3 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -50,3 +50,8 @@ PD_REGISTER_KERNEL(imag, phi::dtype::complex) { kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } + +PD_REGISTER_KERNEL( + complex, GPU, ALL_LAYOUT, phi::ComplexKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/cross_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_grad_kernel.cu index ada78adb77fc9..a6399ba39dcae 100644 --- a/paddle/phi/kernels/gpu/cross_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_grad_kernel.cu @@ -22,8 +22,6 @@ namespace phi { -using funcs::IndexCalculator; - template __global__ void CrossGrad(const T* x, const T* y, @@ -32,7 +30,7 @@ __global__ void CrossGrad(const T* x, T* out_dy, const int stride, const int N, - IndexCalculator index_calculator) { + phi::funcs::IndexCalculator index_calculator) { CUDA_KERNEL_LOOP(i, N) { int offset = index_calculator(i); @@ -107,32 +105,52 @@ void CrossGradKernel(const Context& dev_ctx, std::vector cal_dims; std::vector left_strides; std::vector full_strides; + std::vector merged_dims; + + for (int i = 0; i < dim; i++) { + if (i == 0) { + merged_dims.push_back(input_x_dims[i]); + } else { + merged_dims[0] 
*= input_x_dims[i]; + } + } + int merge_axis = merged_dims.size(); + merged_dims.push_back(input_x_dims[dim]); + for (int i = dim + 1; i < input_x_dims.size(); i++) { + if (i == dim + 1) { + merged_dims.push_back(input_x_dims[i]); + } else { + merged_dims[merge_axis + 1] *= input_x_dims[i]; + } + } int full_dim = 1; - int left_dim = 1; - for (auto i = 0; i < input_x_dims.size(); i++) { + for (int i = 0; i < merged_dims.size(); i++) { full_strides.insert(full_strides.begin(), full_dim); - full_dim *= input_x_dims[input_x_dims.size() - i - 1]; - if (i == dim) { + full_dim *= merged_dims[merged_dims.size() - i - 1]; + if (i == merge_axis) { continue; } cal_dims.push_back(i); + } + int left_dim = 1; + for (int i = merged_dims.size() - 1; i >= 0; i--) { + if (i == merge_axis) { + continue; + } left_strides.insert(left_strides.begin(), left_dim); - left_dim *= input_x_dims[input_x_dims.size() - i - 1]; + left_dim *= merged_dims[i]; } const auto* input_x_data = input_x.data(); const auto* input_y_data = input_y.data(); const auto* input_out_grad_data = input_out_grad.data(); - auto* output_x_grad_data = dev_ctx.template Alloc(x_grad); auto* output_y_grad_data = dev_ctx.template Alloc(y_grad); - - auto index_calculator = IndexCalculator( - input_x_dims.size() - 1, cal_dims, left_strides, full_strides); + auto index_calculator = phi::funcs::IndexCalculator( + merged_dims.size() - 1, cal_dims, left_strides, full_strides); int64_t numel = x.numel(); - backends::gpu::GpuLaunchConfig config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel / 3); @@ -144,7 +162,7 @@ void CrossGradKernel(const Context& dev_ctx, input_out_grad_data, output_x_grad_data, output_y_grad_data, - full_strides[dim], + full_strides[merge_axis], numel / 3, index_calculator); } diff --git a/paddle/phi/kernels/gpu/cross_kernel.cu b/paddle/phi/kernels/gpu/cross_kernel.cu index 44173f4fbe62d..0e1e7b3a42568 100644 --- a/paddle/phi/kernels/gpu/cross_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_kernel.cu @@ -22,15 +22,13 @@ namespace phi { -using funcs::IndexCalculator; - template __global__ void Cross(const T* x, const T* y, T* out, const int stride, const int N, - IndexCalculator index_calculator) { + phi::funcs::IndexCalculator index_calculator) { CUDA_KERNEL_LOOP(i, N) { int offset = index_calculator(i); @@ -96,30 +94,50 @@ void CrossKernel(const Context& dev_ctx, std::vector cal_dims; std::vector left_strides; std::vector full_strides; + std::vector merged_dims; + + for (int i = 0; i < dim; i++) { + if (i == 0) { + merged_dims.push_back(input_x_dims[i]); + } else { + merged_dims[0] *= input_x_dims[i]; + } + } + int merge_axis = merged_dims.size(); + merged_dims.push_back(input_x_dims[dim]); + for (int i = dim + 1; i < input_x_dims.size(); i++) { + if (i == dim + 1) { + merged_dims.push_back(input_x_dims[i]); + } else { + merged_dims[merge_axis + 1] *= input_x_dims[i]; + } + } - int dims0 = 1; - int dims1 = 1; - for (auto i = 0; i < input_x_dims.size(); i++) { - full_strides.insert(full_strides.begin(), dims0); - dims0 *= input_x_dims[input_x_dims.size() - i - 1]; - if (i == dim) { + int full_dim = 1; + for (int i = 0; i < merged_dims.size(); i++) { + full_strides.insert(full_strides.begin(), full_dim); + full_dim *= merged_dims[merged_dims.size() - i - 1]; + if (i == merge_axis) { continue; } cal_dims.push_back(i); - left_strides.insert(left_strides.begin(), dims1); - dims1 *= input_x_dims[input_x_dims.size() - i - 1]; + } + int left_dim = 1; + for (int i = merged_dims.size() - 1; i >= 0; i--) { + if (i == merge_axis) { + 
continue; + } + left_strides.insert(left_strides.begin(), left_dim); + left_dim *= merged_dims[i]; } const auto* input_x_data = input_x.data(); const auto* input_y_data = input_y.data(); - auto* out_data = dev_ctx.template Alloc(out); - - auto index_calculator = IndexCalculator( - input_x_dims.size() - 1, cal_dims, left_strides, full_strides); + auto index_calculator = phi::funcs::IndexCalculator( + merged_dims.size() - 1, cal_dims, left_strides, full_strides); int64_t numel = x.numel(); - backends::gpu::GpuLaunchConfig config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel / 3); @@ -129,7 +147,7 @@ void CrossKernel(const Context& dev_ctx, dev_ctx.stream()>>>(input_x_data, input_y_data, out_data, - full_strides[dim], + full_strides[merge_axis], numel / 3, index_calculator); } diff --git a/paddle/phi/kernels/gpu/dropout_kernel.cu b/paddle/phi/kernels/gpu/dropout_kernel.cu index 2fa3c7639e396..f973bb8e15fc7 100644 --- a/paddle/phi/kernels/gpu/dropout_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_kernel.cu @@ -84,7 +84,9 @@ PD_REGISTER_KERNEL(dropout, float, double, phi::dtype::bfloat16, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(dropout_nd, GPU, @@ -93,4 +95,6 @@ PD_REGISTER_KERNEL(dropout_nd, float, double, phi::dtype::bfloat16, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu index d1f4c6590387a..b3706710c40e3 100644 --- a/paddle/phi/kernels/gpu/einsum_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_kernel.cu @@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(einsum, float, double, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index 175f09fccfa30..047b4ff69a784 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -1574,28 +1574,43 @@ PD_REGISTER_KERNEL(bilinear_interp_v2_grad, ALL_LAYOUT, phi::BilinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(nearest_interp_v2_grad, GPU, ALL_LAYOUT, phi::NearestInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(trilinear_interp_v2_grad, GPU, ALL_LAYOUT, phi::TrilinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(linear_interp_v2_grad, GPU, ALL_LAYOUT, phi::LinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(bicubic_interp_v2_grad, GPU, ALL_LAYOUT, phi::BicubicInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index 7bc331c52a015..c05514236e091 100644 --- 
a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -1446,7 +1446,10 @@ PD_REGISTER_KERNEL(bilinear_interp_v2, phi::BilinearInterpKernel, float, double, - int) {} + int) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(nearest_interp_v2, GPU, ALL_LAYOUT, @@ -1454,25 +1457,37 @@ PD_REGISTER_KERNEL(nearest_interp_v2, float, double, int, - int64_t) {} + int64_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(trilinear_interp_v2, GPU, ALL_LAYOUT, phi::TrilinearInterpKernel, float, double, - int) {} + int) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(linear_interp_v2, GPU, ALL_LAYOUT, phi::LinearInterpKernel, float, double, - int) {} + int) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(bicubic_interp_v2, GPU, ALL_LAYOUT, phi::BicubicInterpKernel, float, double, - int) {} + int) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/merged_momentum_kernel.cu b/paddle/phi/kernels/gpu/merged_momentum_kernel.cu new file mode 100644 index 0000000000000..c6883caecd1a6 --- /dev/null +++ b/paddle/phi/kernels/gpu/merged_momentum_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/merged_momentum_impl.h" + +PD_REGISTER_KERNEL(merged_momentum, + GPU, + ALL_LAYOUT, + phi::MergedMomentumKernel, + phi::dtype::float16, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index 8f4af0a450890..e9f820a318482 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -503,5 +503,10 @@ void Pad3dGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - pad3d_grad, GPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {} +PD_REGISTER_KERNEL(pad3d_grad, + GPU, + ALL_LAYOUT, + phi::Pad3dGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/ops/compat/dot_sig.cc b/paddle/phi/kernels/gpu/solve_grad_kernel.cu similarity index 64% rename from paddle/phi/ops/compat/dot_sig.cc rename to paddle/phi/kernels/gpu/solve_grad_kernel.cu index 2187a7eb4fca0..c13c3b6545c44 100644 --- a/paddle/phi/ops/compat/dot_sig.cc +++ b/paddle/phi/kernels/gpu/solve_grad_kernel.cu @@ -12,15 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
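Looking back at the batch-norm changes earlier in this patch: the BNForwardTraining2D* kernels first reduce per-channel sums and sums of squares (within a block, then across blocks through the block_data_ptr / flag_ptr staging buffers) and only afterwards derive the statistics. The snippet below is a hedged, host-side restatement of that final per-channel step; names such as finalize_channel are illustrative.

```cpp
#include <cmath>
#include <cstdint>

// Illustrative only: the per-channel "final compute" step of the new
// BNForwardTraining2D kernels, once x_sum and x_square_sum have been reduced
// over the N*H*W elements of one channel.
struct ChannelStats {
  double mean, inv_std, running_mean, running_var;
};

inline ChannelStats finalize_channel(double x_sum, double x_square_sum,
                                     int64_t count, double epsilon,
                                     double momentum,  // exponentialAverageFactor
                                     double running_mean, double running_var) {
  ChannelStats s;
  s.mean = x_sum / count;
  const double var = x_square_sum / count - s.mean * s.mean;  // E[x^2] - E[x]^2
  s.inv_std = 1.0 / std::sqrt(var + epsilon);
  // Running statistics are blended exactly as in the kernels above.
  s.running_mean = (1.0 - momentum) * s.mean + momentum * running_mean;
  s.running_var = (1.0 - momentum) * var + momentum * running_var;
  return s;
  // The separate WriteRes kernels then apply
  // y = scale * (x - mean) * inv_std + bias for every element of the channel.
}
```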
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/kernels/solve_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/solve_grad_kernel_impl.h" -namespace phi { - -KernelSignature DotGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "dot_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(dot_grad, phi::DotGradOpArgumentMapping); +PD_REGISTER_KERNEL( + solve_grad, GPU, ALL_LAYOUT, phi::SolveGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/solve_kernel.cu b/paddle/phi/kernels/gpu/solve_kernel.cu new file mode 100644 index 0000000000000..59bc77ca0b975 --- /dev/null +++ b/paddle/phi/kernels/gpu/solve_kernel.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/solve_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/solve_kernel_impl.h" + +PD_REGISTER_KERNEL(solve, GPU, ALL_LAYOUT, phi::SolveKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index 62e29950e2d89..3f3760a4890a2 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -31,12 +31,11 @@ void TransposeKernel(const Context& ctx, const DenseTensor& x, const std::vector& axis, DenseTensor* out) { - int rank = axis.size(); ctx.template Alloc(out); if (out->numel() == 0) { return; } - paddle::operators::TransposeGPUKernelDriver(ctx, rank, x, axis, out); + paddle::operators::TransposeGPUKernelDriver(ctx, x, axis, out); } } // namespace phi diff --git a/paddle/fluid/operators/unique_consecutive_op.cu b/paddle/phi/kernels/gpu/unique_consecutive_functor.h similarity index 53% rename from paddle/fluid/operators/unique_consecutive_op.cu rename to paddle/phi/kernels/gpu/unique_consecutive_functor.h index b96499cdb20e8..e603f695039c0 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cu +++ b/paddle/phi/kernels/gpu/unique_consecutive_functor.h @@ -1,16 +1,18 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
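The solve and solve_grad registrations above expose a dense linear solve A x = b. As a standalone point of reference (not the kernels' actual code path), the same operation can be written with Eigen's partial-pivoting LU, the kind of factorization the lu_decomposition.solve call in the CPU MatrixSolveFunctor path relies on:

```cpp
#include <Eigen/Dense>
#include <iostream>

// Standalone illustration of the dense linear solve behind solve / solve_grad:
// factor A once, then solve A * x = b. Eigen's partialPivLu is used purely as a
// reference here; the GPU kernels go through their own impl headers.
int main() {
  Eigen::Matrix3d A;
  A << 4, 1, 0,
       1, 3, 1,
       0, 1, 2;
  Eigen::Vector3d b(1.0, 2.0, 3.0);

  Eigen::Vector3d x = A.partialPivLu().solve(b);  // LU with partial pivoting
  std::cout << "x = " << x.transpose() << "\n";
  std::cout << "residual = " << (A * x - b).norm() << "\n";
  return 0;
}
```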
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once #include #include #include @@ -22,13 +24,204 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() -#include "paddle/fluid/operators/unique_consecutive_op.h" // TransComute() +#include "paddle/fluid/framework/tensor_util.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +// The core logic of computing Unique Consecutive for a flattend Tensor +template +static void UniqueConsecutiveFlattenedCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t num_input, + DenseTensor* inverse, + DenseTensor* counts) { + // 0. Preparation + DenseTensor in_hat; + phi::Copy(context, in, context.GetPlace(), false, &in_hat); + auto in_data_hat = context.template Alloc(&in_hat); + + DenseTensor sorted_indices; + sorted_indices.Resize(phi::make_ddim({num_input})); + auto sorted_indices_data = context.template Alloc(&sorted_indices); + thrust::sequence( + thrust::device, sorted_indices_data, sorted_indices_data + num_input); + // 1. Calculate op result: 'out' + DenseTensor range; + range.Resize(phi::make_ddim({num_input + 1})); + auto range_data_ptr = context.template Alloc(&range); + thrust::sequence( + thrust::device, range_data_ptr, range_data_ptr + num_input + 1); + phi::Copy(context, in_hat, context.GetPlace(), false, out); + int num_out; + auto out_data = context.template Alloc(out); + num_out = + thrust::unique_by_key( + thrust::device, out_data, out_data + num_input, range_data_ptr, equal) + .first - + out_data; + out->Resize(phi::make_ddim({num_out})); + + // 2. Calculate inverse index: 'inverse' + if (return_inverse) { + inverse->Resize(phi::make_ddim({num_input})); + auto inverse_data = context.template Alloc(inverse); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({num_input})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + in_data_hat, + in_data_hat + num_input, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + sorted_indices_data, + inverse_data); + } + // 3. 
Calculate 'counts' + if (return_counts) { + counts->Resize(phi::make_ddim({num_out})); + auto count_data = context.template Alloc(counts); + // init 'count_data' as 0 + thrust::fill(thrust::device, count_data, count_data + num_out, 0); + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = num_input; + thrust::adjacent_difference(thrust::device, + range_data_ptr + 1, + range_data_ptr + num_out + 1, + count_data); + } +} + +// functor for processing a flattend Tensor +template +struct UniqueConsecutiveFlattenedCUDAFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + const bool return_inverse_; + const bool return_counts_; + DenseTensor* inverse_; + DenseTensor* count_; + + UniqueConsecutiveFlattenedCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) + : ctx_(context), + in_(in), + out_(out), + return_inverse_(return_inverse), + return_counts_(return_counts), + inverse_(inverse), + count_(count) {} + + template + void apply() const { + UniqueConsecutiveFlattenedCUDATensor( + ctx_, + in_, + out_, + return_inverse_, + return_counts_, + thrust::equal_to(), + thrust::not_equal_to(), + in_.numel(), + inverse_, + count_); + } +}; -namespace paddle { -namespace operators { +// The logic of compute unique with axis required, it's a little different +// from above function +template +static void ComputeUniqueConsecutiveDims(const Context& context, + DenseTensor* sorted_indices, + IndexT* sorted_indices_data, + DenseTensor* out, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t row, + DenseTensor* inverse, + DenseTensor* counts) { + // 1. inverse indices: 'inverse' + inverse->Resize(phi::make_ddim({row})); + auto inverse_data = context.template Alloc(inverse); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({row})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + sorted_indices_data, + inverse_data); + + // 2. sorted indices + DenseTensor range; + range.Resize(phi::make_ddim({row + 1})); + auto range_data_ptr = context.template Alloc(&range); + thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); + int num_out; + num_out = thrust::unique_by_key(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + range_data_ptr, + equal) + .first - + sorted_indices_data; + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = row; + sorted_indices->Resize(phi::make_ddim({num_out})); -using Tensor = framework::Tensor; + // 3. 
counts: 'counts' + counts->Resize(phi::make_ddim({num_out})); + auto count_data = context.template Alloc(counts); + thrust::fill(thrust::device, count_data, count_data + row, 0); + thrust::adjacent_difference( + thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); +} // Binary function 'equal_to' template @@ -73,11 +266,11 @@ struct BinaryNotEqual { }; // index_select() function for Tensor -template -void IndexSelect(const framework::ExecutionContext& context, - const Tensor& input, - const Tensor& index, - Tensor* output, +template +void IndexSelect(const Context& context, + const DenseTensor& input, + const DenseTensor& index, + DenseTensor* output, int dim) { auto input_dim = input.dims(); auto input_dim_size = input_dim.size(); @@ -100,17 +293,15 @@ void IndexSelect(const framework::ExecutionContext& context, std::vector input_vec; std::vector index_vec; - paddle::framework::TensorToVector( - input, context.device_context(), &input_vec); - paddle::framework::TensorToVector( - index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); std::vector out_vec(output->numel()); for (int i = 0; i < index_size; i++) { PADDLE_ENFORCE_GE( index_vec[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (index) of OP(index_select) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -119,7 +310,7 @@ void IndexSelect(const framework::ExecutionContext& context, PADDLE_ENFORCE_LT( index_vec[i], input_dim[dim], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (index) of OP(index_select) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -139,162 +330,21 @@ void IndexSelect(const framework::ExecutionContext& context, } } } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); + context.template Alloc(output); + paddle::framework::TensorFromVector(out_vec, context, output); output->Resize(output_dim); } -// The core logic of computing Unique Consecutive for a flattend Tensor -template -static void UniqueConsecutiveFlattendCUDATensor( - const framework::ExecutionContext& context, - const Tensor& in, - Tensor* out, - bool return_inverse, - bool return_counts, - equal_T equal, - not_equal_T not_equal, - int64_t num_input) { - // 0. Prepration - Tensor in_hat; - framework::TensorCopy(in, context.GetPlace(), &in_hat); - auto in_data_hat = in_hat.mutable_data(context.GetPlace()); - - Tensor sorted_indices; - sorted_indices.Resize(phi::make_ddim({num_input})); - auto sorted_indices_data = - sorted_indices.mutable_data(context.GetPlace()); - thrust::sequence( - thrust::device, sorted_indices_data, sorted_indices_data + num_input); - // 1. Calculate op result: 'out' - Tensor range; - range.Resize(phi::make_ddim({num_input + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence( - thrust::device, range_data_ptr, range_data_ptr + num_input + 1); - framework::TensorCopy(in_hat, context.GetPlace(), out); - int num_out; - auto out_data = out->mutable_data(context.GetPlace()); - num_out = - thrust::unique_by_key( - thrust::device, out_data, out_data + num_input, range_data_ptr, equal) - .first - - out_data; - out->Resize(phi::make_ddim({num_out})); - - // 2. 
Calculate inverse index: 'inverse' - if (return_inverse) { - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({num_input})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({num_input})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, - in_data_hat, - in_data_hat + num_input, - inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault - thrust::inclusive_scan(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + num_input, - inv_loc_data_ptr); - thrust::scatter(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + num_input, - sorted_indices_data, - inverse_data); - } - // 3. Calculate 'counts' - if (return_counts) { - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - // init 'count_data' as 0 - thrust::fill(thrust::device, count_data, count_data + num_out, 0); - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = num_input; - thrust::adjacent_difference(thrust::device, - range_data_ptr + 1, - range_data_ptr + num_out + 1, - count_data); - } -} - -// The logic of compute unique with axis required, it's a little different -// from above function -template -static void ComputeUniqueConsecutiveDims( - const framework::ExecutionContext& context, - Tensor* sorted_indices, - IndexT* sorted_indices_data, - Tensor* out, - bool return_inverse, - bool return_counts, - equal_T equal, - not_equal_T not_equal, - int64_t row) { - // 1. inverse indices: 'inverse' - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({row})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({row})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, - sorted_indices_data, - sorted_indices_data + row, - inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; - thrust::inclusive_scan(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + row, - inv_loc_data_ptr); - thrust::scatter(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + row, - sorted_indices_data, - inverse_data); - - // 2. sorted indices - Tensor range; - range.Resize(phi::make_ddim({row + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); - int num_out; - num_out = thrust::unique_by_key(thrust::device, - sorted_indices_data, - sorted_indices_data + row, - range_data_ptr, - equal) - .first - - sorted_indices_data; - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = row; - sorted_indices->Resize(phi::make_ddim({num_out})); - - // 3. 
counts: 'counts' - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - thrust::fill(thrust::device, count_data, count_data + row, 0); - thrust::adjacent_difference( - thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); -} - // Calculate unique consecutive when 'axis' is set -template -static void UniqueConsecutiveDimsCUDATensor( - const framework::ExecutionContext& context, - const Tensor& in, - Tensor* out, - bool return_inverse, - bool return_counts, - int axis) { +template +static void UniqueConsecutiveDimsCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + int axis, + DenseTensor* inverse, + DenseTensor* counts) { // 1. Transpose & reshape // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] std::vector permute(in.dims().size()); @@ -304,19 +354,18 @@ static void UniqueConsecutiveDimsCUDATensor( std::vector in_trans_dims_vec(phi::vectorize(in.dims())); in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[0] = in.dims()[axis]; - framework::Tensor in_trans; - framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + DenseTensor in_trans; + DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); - in_trans.mutable_data(context.GetPlace()); - auto& dev_ctx = context.cuda_device_context(); - TransCompute(in.dims().size(), // num of dims - dev_ctx, // device - in, // original Tensor - &in_trans, // Tensor after reshape - permute); // index of axis + context.template Alloc(&in_trans); + phi::funcs::TransCompute(in.dims().size(), // num of dims + context, // device + in, // original Tensor + &in_trans, // Tensor after reshape + permute); // index of axis // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); // now 'in_trans' is 2D @@ -324,16 +373,15 @@ static void UniqueConsecutiveDimsCUDATensor( int64_t row = in_trans.dims()[0]; const InT* in_trans_data = in_trans.data(); - Tensor sorted_indices; + DenseTensor sorted_indices; sorted_indices.Resize(phi::make_ddim({row})); - auto sorted_indices_data = - sorted_indices.mutable_data(context.GetPlace()); + auto sorted_indices_data = context.template Alloc(&sorted_indices); // 2. Calculate 'inverse', 'counts' // Init index thrust::sequence( thrust::device, sorted_indices_data, sorted_indices_data + row); - ComputeUniqueConsecutiveDims( + ComputeUniqueConsecutiveDims( context, &sorted_indices, sorted_indices_data, @@ -342,143 +390,70 @@ static void UniqueConsecutiveDimsCUDATensor( return_counts, BinaryEqual(col, in_trans_data), BinaryNotEqual(col, in_trans_data), - row); + row, + inverse, + counts); // 3. 
Select indices and reshape back to get 'out' - Tensor out_trans; + DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = sorted_indices.numel(); out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - out_trans.mutable_data(context.GetPlace()); + context.template Alloc(&out_trans); - IndexSelect(context, in_trans, sorted_indices, &out_trans, 0); + IndexSelect( + context, in_trans, sorted_indices, &out_trans, 0); std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); out->Resize(phi::make_ddim(out_trans_dims_vec)); - out->mutable_data(context.GetPlace()); - std::vector out_trans_unbind = Unbind(out_trans); - math::ConcatFunctor concat_functor; - concat_functor(dev_ctx, out_trans_unbind, 0, &out_trans); - TransCompute( - out_trans.dims().size(), dev_ctx, out_trans, out, permute); + context.template Alloc(out); + std::vector out_trans_unbind = phi::funcs::Unbind(out_trans); + phi::funcs::ConcatFunctor concat_functor; + concat_functor(context, out_trans_unbind, 0, &out_trans); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); } -// functor for processing a flattend Tensor -template -struct UniqueConsecutiveFlattendCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; - const bool return_inverse_; - const bool return_counts_; - - UniqueConsecutiveFlattendCUDAFunctor( - const framework::ExecutionContext& context, - const Tensor& in, - Tensor* out, - bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueConsecutiveFlattendCUDATensor( - ctx_, - in_, - out_, - return_inverse_, - return_counts_, - thrust::equal_to(), - thrust::not_equal_to(), - in_.numel()); - } -}; - // functor for processing a multi-dimentional Tensor -template +template struct UniqueConsecutiveDimsCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; const int axis_; const bool return_inverse_; const bool return_counts_; + DenseTensor* inverse_; + DenseTensor* count_; - UniqueConsecutiveDimsCUDAFunctor(const framework::ExecutionContext& context, - const Tensor& in, - Tensor* out, + UniqueConsecutiveDimsCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, const int axis, bool return_inverse, - bool return_counts) + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) : ctx_(context), in_(in), out_(out), axis_(axis), return_inverse_(return_inverse), - return_counts_(return_counts) {} + return_counts_(return_counts), + inverse_(inverse), + count_(count) {} template void apply() const { - UniqueConsecutiveDimsCUDATensor( - ctx_, in_, out_, return_inverse_, return_counts_, axis_); + UniqueConsecutiveDimsCUDATensor(ctx_, + in_, + out_, + return_inverse_, + return_counts_, + axis_, + inverse_, + count_); } }; -// Unique_Consecutive_op CUDA implementation. 
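When axis is given, the code above transposes that axis to the front, flattens the remaining dimensions into columns, and collapses consecutive identical rows with thrust::unique_by_key, using BinaryEqual / BinaryNotEqual to compare whole rows. A minimal CPU sketch of that row-level pass (ignoring the transpose bookkeeping and the device-side scan/scatter) might look like this:

```cpp
#include <cstdint>
#include <vector>

// Hedged CPU sketch of the row-wise unique-consecutive pass used when 'axis'
// is given: consecutive identical rows collapse to one entry, while 'inverse'
// maps every original row to its representative and 'counts' records run lengths.
struct UniqueConsecutiveRows {
  std::vector<std::vector<float>> out;
  std::vector<int64_t> inverse;
  std::vector<int64_t> counts;
};

inline UniqueConsecutiveRows unique_consecutive_rows(
    const std::vector<std::vector<float>>& rows) {
  UniqueConsecutiveRows r;
  for (size_t i = 0; i < rows.size(); ++i) {
    if (i == 0 || rows[i] != rows[i - 1]) {  // same role as BinaryEqual on columns
      r.out.push_back(rows[i]);
      r.counts.push_back(0);
    }
    r.inverse.push_back(static_cast<int64_t>(r.out.size()) - 1);
    ++r.counts.back();
  }
  return r;
}
```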
-template -class UniqueConsecutiveKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto data_type = static_cast( - context.Attr("dtype")); - if (data_type == framework::proto::VarType::INT32) { - PADDLE_ENFORCE_LE( - x->numel() + 1, - INT_MAX, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be less than or " - "equal to INT_MAX, but received num is %d. Please set `dtype` to " - "int64.", - x->numel())); - } - - std::vector axis_vec = context.Attr>("axis"); - bool return_inverse = context.Attr("return_inverse"); - bool return_counts = context.Attr("return_counts"); - - // if 'axis' is not required, flatten the Tensor. - if (axis_vec.empty()) { - framework::VisitDataTypeTiny( - data_type, - UniqueConsecutiveFlattendCUDAFunctor( - context, *x, out, return_inverse, return_counts)); - } else { - // 'axis' is required. - int axis = axis_vec[0]; - framework::VisitDataTypeTiny( - data_type, - UniqueConsecutiveDimsCUDAFunctor( - context, *x, out, axis, return_inverse, return_counts)); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unique_consecutive, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel); +} // namespace phi diff --git a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu new file mode 100644 index 0000000000000..4ce91a0dd66b4 --- /dev/null +++ b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/unique_consecutive_kernel.h" +#include "paddle/phi/kernels/gpu/unique_consecutive_functor.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/data_type.h" + +namespace phi { + +template +void UniqueConsecutiveKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + DenseTensor* out, + DenseTensor* index, + DenseTensor* counts) { + auto data_type = static_cast(dtype); + if (data_type == paddle::framework::proto::VarType::INT32) { + PADDLE_ENFORCE_LE( + x.numel() + 1, + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + + // if 'axis' is not required, flatten the Tensor. 
+ if (axis.empty()) { + paddle::framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveFlattenedCUDAFunctor( + dev_ctx, x, out, return_inverse, return_counts, index, counts)); + } else { + // 'axis' is required. + int valid_axis = axis[0]; + paddle::framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveDimsCUDAFunctor(dev_ctx, + x, + out, + valid_axis, + return_inverse, + return_counts, + index, + counts)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(unique_consecutive, + GPU, + ALL_LAYOUT, + phi::UniqueConsecutiveKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/identity_loss_grad_kernel.h b/paddle/phi/kernels/identity_loss_grad_kernel.h new file mode 100644 index 0000000000000..02422fd936bda --- /dev/null +++ b/paddle/phi/kernels/identity_loss_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void IdentityLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const int reduction, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/identity_loss_kernel.h b/paddle/phi/kernels/identity_loss_kernel.h new file mode 100644 index 0000000000000..895b565894b22 --- /dev/null +++ b/paddle/phi/kernels/identity_loss_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void IdentityLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const int reduction, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/as_real_impl.h b/paddle/phi/kernels/impl/as_real_impl.h new file mode 100644 index 0000000000000..0534b836e3732 --- /dev/null +++ b/paddle/phi/kernels/impl/as_real_impl.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/as_real_kernel.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +template +void AsRealKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc(out); + auto out_dims_original = out->dims(); + Copy(ctx, x, ctx.GetPlace(), false, out); + out->Resize(out_dims_original); // restored the shape. + out->set_type( + paddle::experimental::CppTypeToDataType::Type()); // restored the + // dtype. +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index 03896a2353dda..f7366b32e1105 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -47,4 +48,51 @@ void ImagGradKernel(const Context& dev_ctx, for_range(functor); } +template +struct ComplexGradForRealFunctor { + inline HOSTDEVICE T operator()(const T x, + const T y, + const phi::dtype::complex out, + const phi::dtype::complex dout) { + return dout.real; + } +}; + +template +struct ComplexGradForImagFunctor { + inline HOSTDEVICE T operator()(const T x, + const T y, + const phi::dtype::complex out, + const phi::dtype::complex dout) { + return dout.imag; + } +}; + +template +void ComplexGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy) { + using C = phi::dtype::complex; + + // skip out in a hacky way + auto out = dout; + phi::funcs::ElemwiseGradCompute, + ComplexGradForImagFunctor, + C>(dev_ctx, + x, + y, + out, + dout, + /*axis*/ -1, + dx, + dy, + ComplexGradForRealFunctor(), + ComplexGradForImagFunctor()); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index 72b1328833979..8bd7823411964 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -15,7 +15,9 @@ #pragma once // See Note [ Why still include the fluid headers? 
] +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -61,4 +63,45 @@ void ImagKernel(const Context& dev_ctx, for_range(functor); } +// functors to use with ElementwiseComputeEx +template +struct RealAndImagToComplexFunctor { + inline HOSTDEVICE phi::dtype::complex operator()(const T x, const T y) { + return phi::dtype::complex(x, y); + } +}; + +template +struct ImagAndRealToComplexFunctor { + inline HOSTDEVICE phi::dtype::complex operator()(const T y, const T x) { + return phi::dtype::complex(x, y); + } +}; + +template +void ComplexKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + using C = phi::dtype::complex; + dev_ctx.template Alloc(out); + +// NOTE(chenfeiyu): be careful of the caveats of calling elementwise-related +// facility functions +#if defined(__NVCC__) || defined(__HIPCC__) + phi::funcs::ElementwiseCompute, T, C>( + dev_ctx, x, y, /*axis*/ -1, RealAndImagToComplexFunctor(), out); +#else + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + phi::funcs::ElementwiseCompute, T, C>( + dev_ctx, x, y, /*axis*/ -1, RealAndImagToComplexFunctor(), out); + } else { + phi::funcs::ElementwiseCompute, T, C>( + dev_ctx, x, y, /*axis*/ -1, ImagAndRealToComplexFunctor(), out); + } +#endif +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/dist_grad_kernel_impl.h b/paddle/phi/kernels/impl/dist_grad_kernel_impl.h deleted file mode 100644 index fc118a832dc9f..0000000000000 --- a/paddle/phi/kernels/impl/dist_grad_kernel_impl.h +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -template -using ETensor = phi::EigenTensor; - -template -static void GetBraodcastDims(const phi::DDim& x_dims, - const phi::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, - 0, - phi::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, - y_dims)); -} - -static phi::DDim GetNewDims(const phi::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistGradFunction(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& out_grad, - float p, - DenseTensor* x_grad, - DenseTensor* y_grad) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - auto out_dims = out.dims(); - - phi::DDim x_new_dims = GetNewDims(x_dims, Rank); - phi::DDim y_new_dims = GetNewDims(y_dims, Rank); - phi::DDim out_new_dims = GetNewDims(out_dims, Rank); - auto x_t = ETensor::From(x, x_new_dims); - auto y_t = ETensor::From(y, y_new_dims); - auto out_t = ETensor::From(out, out_new_dims); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes out_bcast_dims; - - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - std::vector new_dims_vec(Rank); - for (int i = 0; i < Rank; ++i) { - new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); - out_bcast_dims[i] = new_dims_vec[i]; - } - phi::DDim new_dims = phi::make_ddim(new_dims_vec); - - auto& place = *dev_ctx.eigen_device(); - auto out_grad_t = ETensor::From(out_grad, out_new_dims); - DenseTensor grad; - grad.Resize(new_dims); - dev_ctx.template Alloc(&grad); - auto grad_t = ETensor::From(grad); - - auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); - auto x_minux_y_abs = x_minux_y.abs(); - auto sign = - (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + - (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); - T epsilon = static_cast(1.0e-10f); - - // 1: Lp-norm(z), z = x-y, compute dz - if (p == 0) { - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, &grad, static_cast(0)); - } else if (p == INFINITY || p == -INFINITY) { - // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if - // j!=i, or equals to sign(z_i) * dout if j=i. 
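Editorial note on the dist kernels deleted here (they are re-implemented under paddle/phi elsewhere in this PR): with z = x - y broadcast to a common shape, the forward result is the p-norm of z and the branches below compute its gradient. In LaTeX form, assuming the same conventions as the code:

\[
\text{out} = \lVert z \rVert_p =
\begin{cases}
\#\{i : z_i \neq 0\}, & p = 0,\\
\max_i |z_i|, & p = +\infty,\\
\min_i |z_i|, & p = -\infty,\\
\bigl(\textstyle\sum_i |z_i|^{p}\bigr)^{1/p}, & \text{otherwise,}
\end{cases}
\qquad
\frac{\partial\,\text{out}}{\partial z_i} =
\begin{cases}
0, & p = 0,\\
\operatorname{sign}(z_i)\,\mathbf{1}\!\left[\,|z_i| = \text{out}\,\right], & p = \pm\infty,\\
\left(\tfrac{|z_i|}{\text{out}}\right)^{p-1}\operatorname{sign}(z_i), & \text{otherwise.}
\end{cases}
\]

The gradient with respect to x is dz = (d out / d z) * dout summed over the broadcast dimensions, and the gradient with respect to y is -dz, which is what the reshape/sum code further down performs.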
- if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } else { - // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } - - Eigen::DSizes x_reshape_dims; - Eigen::DSizes y_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < x_new_dims.size(); ++i) { - x_reshape_dims[2 * i] = x_bcast_dims[i]; - x_reshape_dims[2 * i + 1] = x_new_dims[i]; - y_reshape_dims[2 * i] = y_bcast_dims[i]; - y_reshape_dims[2 * i + 1] = y_new_dims[i]; - reduce_dims[i] = 2 * i; - } - - // 2: if x or y is broadcasted in forward function, - // the grad need to be sum along the broadcasted dimensions - if (x_grad) { - dev_ctx.template Alloc(x_grad); - auto x_grad_t = ETensor::From(*x_grad, x_new_dims); - x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) - .sum(reduce_dims) - .reshape(x_grad_t.dimensions()); - } - if (y_grad) { - dev_ctx.template Alloc(y_grad); - auto y_grad_t = ETensor::From(*y_grad, y_new_dims); - y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) - .sum(reduce_dims) - .reshape(y_grad_t.dimensions()); - } -} - -template -void DistGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& out_grad, - float p, - DenseTensor* x_grad, - DenseTensor* y_grad) { - auto x_rank = x.dims().size(); - auto y_rank = y.dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, - 6, - phi::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, - y_rank)); - switch (rank) { - case 1: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 2: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 3: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 4: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 5: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 6: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - } -} - -} // namespace phi diff --git a/paddle/phi/kernels/impl/dist_kernel_impl.h b/paddle/phi/kernels/impl/dist_kernel_impl.h deleted file mode 100644 index c4ee7cec34750..0000000000000 --- a/paddle/phi/kernels/impl/dist_kernel_impl.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include -#include - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -template -using ETensor = phi::EigenTensor; - -template -static void GetBraodcastDims(const phi::DDim& x_dims, - const phi::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, - 0, - phi::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, - y_dims)); -} - -static phi::DDim GetNewDims(const phi::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistFunction(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - float p, - DenseTensor* out) { - if (out) { - dev_ctx.template Alloc(out); - } - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - // new dims with same size as rank, e.g. 
(rank=3, (4, 3) => (1, 4, 3)) - phi::DDim x_new_dims = GetNewDims(x_dims, Rank); - phi::DDim y_new_dims = GetNewDims(y_dims, Rank); - - auto x_t = ETensor::From(x, x_new_dims); - auto y_t = ETensor::From(y, y_new_dims); - auto out_t = ETensor::From(*out); - auto& place = *dev_ctx.eigen_device(); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - // p=0 means number of non-zero elements of (x-y) - // p=inf means the maximum of |x-y| - // p=-inf means the minimum of |x-y| - // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) - if (p == 0) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) - .template cast() - .sum(); - } else if (p == INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .maximum(); - } else if (p == -INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .minimum(); - } else { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .pow(p) - .sum() - .pow(1.0 / p); - } -} - -template -void DistKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - float p, - DenseTensor* out) { - auto x_rank = x.dims().size(); - auto y_rank = y.dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, - 6, - phi::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, - y_rank)); - switch (rank) { - case 1: - DistFunction(dev_ctx, x, y, p, out); - break; - case 2: - DistFunction(dev_ctx, x, y, p, out); - break; - case 3: - DistFunction(dev_ctx, x, y, p, out); - break; - case 4: - DistFunction(dev_ctx, x, y, p, out); - break; - case 5: - DistFunction(dev_ctx, x, y, p, out); - break; - case 6: - DistFunction(dev_ctx, x, y, p, out); - break; - } -} - -} // namespace phi diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 43b2760b404f9..b5bc826881af8 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -15,6 +15,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h new file mode 100644 index 0000000000000..2972a93d10858 --- /dev/null +++ b/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -0,0 +1,400 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
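Editorial note: the merged_momentum implementation introduced below fuses many parameter updates into a single launch, and its fast path (single learning rate, no Nesterov, no regularization) applies the plain heavy-ball rule seen in MergedMomentumKernelParam::operator(). Written out, with the gradient rescaled by rescale_grad:

\[
\hat g = \text{rescale\_grad}\cdot g, \qquad
v_{t+1} = \mu\, v_t + \hat g, \qquad
\theta_{t+1} = \theta_t - \eta\, v_{t+1},
\]

where \eta is the learning rate and, under multi_precision, \theta is read from and written back to the FP32 master copy.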
+ +#pragma once + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/macros.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" +#include "paddle/phi/kernels/merged_momentum_kernel.h" + +namespace phi { + +template +using MultiPrecisionType = typename phi::dtype::MPTypeTrait::Type; + +template +struct MergedMomentumMasterParams { + MT *PADDLE_RESTRICT master_params[kParamNum]; + + HOSTDEVICE MT *MasterParam(size_t idx) const { return master_params[idx]; } + HOSTDEVICE void SetMasterParam(size_t idx, MT *p) { master_params[idx] = p; } +}; + +template +struct MergedMomentumMasterParams { + HOSTDEVICE constexpr MT *MasterParam(size_t) const { return nullptr; } + HOSTDEVICE constexpr void SetMasterParam(size_t, MT *) {} +}; + +template +struct MergedMomentumKernelParam + : public MergedMomentumMasterParams { + static constexpr auto N = kParamNum; + size_t sizes[N]; + T *PADDLE_RESTRICT params[N]; + const T *PADDLE_RESTRICT grads[N]; + MT *PADDLE_RESTRICT velocitys[N]; + const MultiPrecisionType *PADDLE_RESTRICT lr; + MT mu; + MT rescale_grad; + uint32_t param_num; + + HOSTDEVICE void operator()(size_t i) const { + const MT lr_val = static_cast(*lr); + for (uint32_t idx = 0; idx < param_num; ++idx) { + auto size = sizes[idx]; + if (i >= size) continue; + + auto param_p = params[idx]; + auto grad_p = grads[idx]; + auto velocity_p = velocitys[idx]; + auto master_param_p = this->MasterParam(idx); + + const MT param = + master_param_p ? master_param_p[i] : static_cast(param_p[i]); + const MT grad = static_cast(grad_p[i]) * rescale_grad; + const MT velocity = velocity_p[i]; + const MT velocity_out = velocity * mu + grad; + const MT param_out = param - lr_val * velocity_out; + velocity_p[i] = velocity_out; + param_p[i] = static_cast(param_out); + if (master_param_p) { + master_param_p[i] = param_out; + } + } + } +}; + +template +void MergedMomentumInnerCompute( + const Context &ctx, + const std::vector ¶ms, + const std::vector &grads, + const std::vector &velocitys, + const std::vector &lrs, + const paddle::optional> &master_params_opt, + float mu, + bool use_nesterov, + const std::vector ®ularization_methods, + const std::vector ®ularization_coeffs, + float rescale_grad, + const bool multi_precision, + std::vector params_out, + std::vector velocitys_out, + std::vector master_params_out) { + size_t n = params.size(); + PADDLE_ENFORCE_EQ(n, + params_out.size(), + phi::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), + n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ( + params[i], + params_out[i], + phi::errors::InvalidArgument("Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); + } + + PADDLE_ENFORCE_EQ( + n, + grads.size(), + phi::errors::InvalidArgument( + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), + n)); + + PADDLE_ENFORCE_EQ(n, + velocitys.size(), + phi::errors::InvalidArgument( + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocitys.size(), + n)); + + PADDLE_ENFORCE_EQ( + n, + velocitys_out.size(), + phi::errors::InvalidArgument( + "The size of 
Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + velocitys_out.size(), + n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(velocitys[i], + velocitys_out[i], + phi::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + + if (multi_precision) { + auto master_params = master_params_opt.get(); + PADDLE_ENFORCE_EQ( + n, + master_params.size(), + phi::errors::InvalidArgument( + "The size of Input(MasterParam) must be " + "equal to Input(Param), but got the size of Input(MasterParam) " + "is %d, the size of Input(Param) is %d.", + master_params.size(), + n)); + PADDLE_ENFORCE_EQ( + n, + master_params_out.size(), + phi::errors::InvalidArgument( + "The size of Output(MasterParamOut) must be equal to " + "Input(MasterParam), but got the size of Output(MasterParamOut) " + "is %d, the size of Input(Param) is %d.", + master_params_out.size(), + n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(master_params[i], + master_params_out[i], + phi::errors::InvalidArgument( + "Input(MasterParam) and Output(MasterParamOut) " + "must be the same Tensors.")); + PADDLE_ENFORCE_NOT_NULL(master_params[i], + phi::errors::InvalidArgument( + "Input(MasterParam) must be provided when " + "multi_precision=True.")); + } + } else { + master_params_out.clear(); + } + + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, + lrs.size(), + phi::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), + n)); + } + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, + regularization_methods.size(), + phi::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), + n)); + PADDLE_ENFORCE_EQ( + n, + regularization_coeffs.size(), + phi::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), + n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << regularization_coeffs.size(); + + if (lrs.size() == 1 && use_nesterov == false && + regularization_methods.size() == 0) { +#define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ + MergedMomentumKernelParam kernel_params; \ + constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ + size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ + kernel_params.mu = static_cast(mu); \ + kernel_params.rescale_grad = static_cast(rescale_grad); \ + kernel_params.lr = lrs[0]->data(); \ + for (size_t i = 0; i < kernel_num; ++i) { \ + size_t start = i * kMaxMergedNum; \ + size_t end = std::min((i + 1) * kMaxMergedNum, n); \ + kernel_params.param_num = static_cast(end - start); \ + size_t max_size = 0; \ + for (size_t j = 0; j < kernel_params.param_num; ++j) { \ + auto size = static_cast(params_out[j + start]->numel()); \ + max_size = std::max(max_size, size); \ + kernel_params.sizes[j] = size; \ + kernel_params.params[j] = params_out[j + start]->data(); \ 
+ kernel_params.grads[j] = grads[j + start]->data(); \ + kernel_params.velocitys[j] = velocitys_out[j + start]->data(); \ + kernel_params.SetMasterParam( \ + j, \ + kMultiPrecision ? master_params_out[j + start]->data() \ + : nullptr); \ + } \ + phi::funcs::ForRange for_range(ctx, max_size); \ + for_range(kernel_params); \ + VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ + << kernel_params.param_num; \ + } + if (multi_precision) { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + } else { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); + } +#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL + } else { + for (size_t idx = 0; idx < n; idx++) { + phi::RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? phi::RegularizationType::kL2DECAY + : phi::RegularizationType::kNONE; + + MT regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = static_cast(regularization_coeffs[idx]); + } + auto lr_temp = lrs.size() > 1 ? lrs[idx] : lrs[0]; + + const MT *master_in_data = + multi_precision ? master_params_opt.get()[idx]->data() : nullptr; + MT *master_out_data = + multi_precision ? master_params_out[idx]->data() : nullptr; + if (paddle::platform::is_cpu_place(ctx.GetPlace())) { + phi::CPUDenseMomentumFunctor functor; + functor(params[idx], + grads[idx], + velocitys[idx], + lr_temp, + static_cast(mu), + use_nesterov, + regularization_flag, + regularization_coeff, + params_out[idx], + velocitys_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; + } else if (paddle::platform::is_gpu_place(ctx.GetPlace())) { + phi::funcs::ForRange for_range( + static_cast(ctx), params[idx]->numel()); +#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type) \ + phi::DenseMomentumFunctor functor( \ + params[idx]->data(), \ + grads[idx]->data(), \ + velocitys[idx]->data(), \ + lr_temp->data(), \ + master_in_data, \ + static_cast(mu), \ + static_cast(rescale_grad), \ + params[idx]->numel(), \ + regularization_coeff, \ + params_out[idx]->data(), \ + velocitys_out[idx]->data(), \ + master_out_data); \ + for_range(functor); + if (use_nesterov) { + if (regularization_flag == phi::RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + phi::UseNesterov, phi::RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel use_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + phi::UseNesterov, phi::RegularizationType::kNONE); + VLOG(10) << "Launch MergedMomentum gpu kernel use_nesterov kNONE."; + } + } else { + if (regularization_flag == phi::RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + phi::NoNesterov, phi::RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel no_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + phi::NoNesterov, phi::RegularizationType::kNONE); + VLOG(10) << "Launch MergedMomentum gpu kernel no_nesterov kNONE."; + } + } + } + } + VLOG(10) + << "Launch MergedMomentum kernel with multi_lr and regularization."; + } +} + +template +void MergedMomentumKernel( + const Context &dev_ctx, + const std::vector ¶m, + const std::vector &grad, + const std::vector &velocity, + const std::vector &learning_rate, + const paddle::optional> &master_param, + float mu, + bool use_nesterov, + const std::vector ®ularization_method, + const std::vector ®ularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector param_out, + 
std::vector velocity_out, + std::vector master_param_out) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + if (multi_precision) { + MergedMomentumInnerCompute( + dev_ctx, + param, + grad, + velocity, + learning_rate, + master_param, + mu, + use_nesterov, + regularization_method, + regularization_coeff, + rescale_grad, + multi_precision, + param_out, + velocity_out, + master_param_out); + } else { + MergedMomentumInnerCompute(dev_ctx, + param, + grad, + velocity, + learning_rate, + master_param, + mu, + use_nesterov, + regularization_method, + regularization_coeff, + rescale_grad, + multi_precision, + param_out, + velocity_out, + master_param_out); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h new file mode 100644 index 0000000000000..55ee023cb5caa --- /dev/null +++ b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h @@ -0,0 +1,267 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/expand_as_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/solve_kernel_impl.h" +#include "paddle/phi/kernels/squeeze_kernel.h" +#include "paddle/phi/kernels/unsqueeze_kernel.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/gpu/reduce.h" +#endif + +namespace phi { + +template +struct ReduceSumForSolvelGrad { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims, + bool keep_dims); +}; + +template +struct ReduceSumForSolvelGrad { + void operator()(const CPUContext& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims, + bool keep_dims) { + std::vector reduce_dims_tmp(reduce_dims.begin(), + reduce_dims.end()); + phi::ReduceKernelImpl( + dev_ctx, input, output, reduce_dims_tmp, keep_dims, false); + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +struct ReduceSumForSolvelGrad { + void operator()(const GPUContext& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims, + bool keep_dims) { + phi::funcs::ReduceKernel>( + dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims); + } +}; +#endif + +template +void SolveGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + const DenseTensor& out, + DenseTensor* dx, + DenseTensor* dy) { + bool is_vector = false; + is_vector = is_vector_rhs(x, y); + DenseTensor tmp_y; + if (is_vector) { + dev_ctx.Alloc(&tmp_y, y.dtype()); + phi::Unsqueeze(dev_ctx, y, {-1}, &tmp_y, nullptr); + } else { 
+ tmp_y.Resize(y.dims()); + dev_ctx.Alloc(&tmp_y, y.dtype()); + phi::Copy(dev_ctx, y, dev_ctx.GetPlace(), false, &tmp_y); + } + DenseTensor tmp_x; + tmp_x.Resize(x.dims()); + dev_ctx.Alloc(&tmp_x, x.dtype()); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &tmp_x); + + std::vector x_broadcast_dims; + std::vector y_broadcast_dims; + std::tie(x_broadcast_dims, y_broadcast_dims) = + get_broadcast_dims(tmp_x, tmp_y); + // tmp_dx + DenseTensor tmp_dx; + tmp_dx.Resize(phi::make_ddim(x_broadcast_dims)); + dev_ctx.template Alloc(&tmp_dx); + + // tmp_dy + DenseTensor tmp_dy; + tmp_dy.Resize(phi::make_ddim(y_broadcast_dims)); + dev_ctx.template Alloc(&tmp_dy); + + DenseTensor tmp_input(x.dtype()); + const auto& new_dims_vec = phi::funcs::getNewDimsVec(x.dims()); + tmp_input.Resize(phi::make_ddim(new_dims_vec)); + dev_ctx.template Alloc(&tmp_input); + + phi::funcs::TransposeNormal trans; + std::vector new_axis = phi::funcs::getNewAxis(x.dims().size()); + trans(dev_ctx, x, &tmp_input, new_axis); + + if (dy) { + dev_ctx.template Alloc(dy); + linalg_solve(dev_ctx, tmp_input, dout, &tmp_dy); + } + + if (dx) { + dev_ctx.template Alloc(dx); + + // to get dx + auto blas = phi::funcs::GetBlas(dev_ctx); + if (x.dims().size() == 2 && y.dims().size() == 2) { + auto mat_dim_a1 = + phi::funcs::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); + auto mat_dim_b1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + blas.MatMul(tmp_dy, mat_dim_a1, out, mat_dim_b1, T(-1), &tmp_dx, T(0)); + + } else if (is_vector_rhs(x, y)) { + DenseTensor tmp_dy_; + dev_ctx.Alloc(&tmp_dy_, y.dtype()); + + phi::Unsqueeze(dev_ctx, + tmp_dy, + paddle::experimental::IntArray({-1}), + &tmp_dy_, + nullptr); + + DenseTensor tmp_out_; + dev_ctx.Alloc(&tmp_out_, out.dtype()); + + phi::Unsqueeze(dev_ctx, + out, + paddle::experimental::IntArray({-1}), + &tmp_out_, + nullptr); + + auto mat_dim_a1 = + phi::funcs::CreateMatrixDescriptor(tmp_dy_.dims(), 0, false); + auto mat_dim_b1 = + phi::funcs::CreateMatrixDescriptor(tmp_out_.dims(), 0, true); + blas.MatMul( + tmp_dy_, mat_dim_a1, tmp_out_, mat_dim_b1, T(-1), &tmp_dx, T(0)); + + } else { + auto mat_dim_a1 = + phi::funcs::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); + auto mat_dim_b1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + blas.MatMul(tmp_dy, mat_dim_a1, out, mat_dim_b1, T(-1), &tmp_dx, T(0)); + } + } + if (y.dims() != tmp_dy.dims()) { + DenseTensor dy_help; + dy_help.Resize(tmp_dy.dims()); + dev_ctx.Alloc(&dy_help, tmp_dy.dtype()); + + phi::Copy(dev_ctx, tmp_dy, dev_ctx.GetPlace(), false, &dy_help); + + // get dims + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + std::vector dout_dims = vectorize(dout.dims()); + + if (is_vector_rhs(x, y)) { + dout_dims.push_back(1); + } + + int y_ndim = y_dims.size(); + int ndim = dout_dims.size(); + + const std::vector dy_help_dims = vectorize(dy_help.dims()); + std::vector dy_broadcast_dims(ndim); + + std::fill( + dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); + std::copy(y_dims.data(), + y_dims.data() + y_ndim, + dy_broadcast_dims.data() + ndim - y_ndim); + + std::vector dy_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { + dy_reduce_dims.push_back(idx); + } + } + // reduce sum to get grad by ReduceSum + if (dy) { + if (dy_reduce_dims.empty()) { + *dy = std::move(dy_help); + } else { + bool keep_dim = true; + if (dy_help.dims().size() != dy->dims().size()) { + keep_dim = false; + } + 
ReduceSumForSolvelGrad()( + dev_ctx, dy_help, dy, dy_reduce_dims, keep_dim); + } + dy->Resize(y.dims()); + } + } else { + phi::Copy(dev_ctx, tmp_dy, dev_ctx.GetPlace(), false, dy); + } + + if (x.dims() != tmp_dx.dims()) { + DenseTensor dx_help; + dx_help.Resize(tmp_dx.dims()); + dev_ctx.Alloc(&dx_help, tmp_dx.dtype()); + phi::Copy(dev_ctx, tmp_dx, dev_ctx.GetPlace(), false, &dx_help); + // get dims + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + + int x_ndim = x_dims.size(); + int ndim = x_broadcast_dims.size(); + + const std::vector dx_help_dims = vectorize(dx_help.dims()); + std::vector dx_broadcast_dims(ndim); + std::fill( + dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); + std::copy(x_dims.data(), + x_dims.data() + x_ndim, + dx_broadcast_dims.data() + ndim - x_ndim); + + std::vector dx_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { + dx_reduce_dims.push_back(idx); + } + } + // reduce sum to get grad by ReduceSum + if (dx) { + dev_ctx.template Alloc(dx); + + if (dx_reduce_dims.empty()) { + *dx = std::move(dx_help); + } else { + bool keep_dim = true; + if (dx_help.dims().size() != dx->dims().size()) { + keep_dim = false; + } + ReduceSumForSolvelGrad()( + dev_ctx, dx_help, dx, dx_reduce_dims, keep_dim); + } + dx->Resize(x.dims()); + } + } else { + phi::Copy(dev_ctx, tmp_dx, dev_ctx.GetPlace(), false, dx); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/solve_kernel_impl.h b/paddle/phi/kernels/impl/solve_kernel_impl.h new file mode 100644 index 0000000000000..09c9e74dd207a --- /dev/null +++ b/paddle/phi/kernels/impl/solve_kernel_impl.h @@ -0,0 +1,199 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/expand_as_kernel.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/squeeze_kernel.h" +#include "paddle/phi/kernels/unsqueeze_kernel.h" + +namespace phi { + +using Tensor = DenseTensor; + +// check the input other is vector_case or not +static inline bool is_vector_rhs(const DenseTensor& input, + const DenseTensor& other) { + auto x_dim = input.dims(); + auto y_dim = other.dims(); + auto x_dim_size = x_dim.size(); + auto y_dim_size = y_dim.size(); + std::vector x_dims_vec = phi::vectorize(x_dim); + std::vector y_dims_vec = phi::vectorize(y_dim); + + std::vector::const_iterator f = x_dims_vec.begin(); + std::vector::const_iterator l = x_dims_vec.end() - 1; + std::vector x_dims_vec_cut(f, l); // input.shape[:-1] + + std::vector expected_batched_rhs_shape(x_dims_vec_cut); + bool vector_case = + y_dim_size == 1 || (x_dim_size - 1 == y_dim_size && + y_dims_vec == (expected_batched_rhs_shape)); + + return vector_case; +} + +// Prepared for the broadcast operation +static std::vector get_broadcast_batch_portion( + std::vector x, std::vector y) { + size_t size_x = x.size(); + size_t size_y = y.size(); + size_t size = std::max(size_x, size_y); + std::vector batchPortion(size); + ptrdiff_t i = (ptrdiff_t)size - 1; + for (; i >= 0; --i) { + ptrdiff_t offset = size - i - 1; + ptrdiff_t dim_x = size_x - offset - 1; + ptrdiff_t dim_y = size_y - offset - 1; + int64_t x_size = (dim_x >= 0) ? x[dim_x] : 1; + int64_t y_size = (dim_y >= 0) ? y[dim_y] : 1; + PADDLE_ENFORCE_EQ( + (x_size == y_size || x_size == 1 || y_size == 1), + true, + phi::errors::PreconditionNotMet( + "The size of tensor x (%d) must match the size of tensor y " + "(%d) at non-singleton dimension %d.", + x_size, + y_size, + i)); + + batchPortion[i] = x_size != 1 ? x_size : y_size; + } + return batchPortion; +} + +static inline std::vector convert_to_int_vec(std::vector a) { + std::vector ret; + for (size_t i = 0; i < a.size(); i++) { + ret.emplace_back(int(a[i])); + } + + return ret; +} + +// broadcast the batch dimensions of tensor x and tensor y. 
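Editorial note: a worked example of the batch-broadcast rule implemented by the helpers around this point (get_broadcast_batch_portion above, get_broadcast_dims just below), with illustrative shapes; only the leading "batch" dims are broadcast while the trailing matrix dims are kept:

// x.dims() = [2, 1, 3, 3]  ->  batch part {2, 1}, matrix part 3 x 3
// y.dims() = [5, 3, 1]     ->  batch part {5},    matrix part 3 x 1
// get_broadcast_batch_portion({2, 1}, {5}) == {2, 5}
// get_broadcast_dims(x, y)  ->  x_expand_size = {2, 5, 3, 3},
//                               y_expand_size = {2, 5, 3, 1}

These expanded operands are what MatrixSolveFunctor consumes in linalg_solve below; the backward pass in solve_grad_kernel_impl.h above then forms dY = solve(X^T, dOut) and dX = -dY * out^T before reducing back over the broadcast dimensions.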
+static inline std::tuple, std::vector> +get_broadcast_dims(const Tensor& x, const Tensor& y) { + std::vector x_dims_vec = phi::vectorize(x.dims()); + std::vector y_dims_vec = phi::vectorize(y.dims()); + std::vector::const_iterator f1 = x_dims_vec.begin(); + std::vector::const_iterator l1 = x_dims_vec.end() - 2; + std::vector x_dims_vec_cut(f1, l1); + + std::vector::const_iterator f2 = y_dims_vec.begin(); + std::vector::const_iterator l2 = y_dims_vec.end() - 2; + std::vector y_dims_vec_cut(f2, l2); + + std::vector expand_batch_portion = + get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); + std::vector x_expand_size({expand_batch_portion}); + x_expand_size.insert(x_expand_size.end(), + {x_dims_vec[static_cast(x_dims_vec.size()) - 2], + x_dims_vec[static_cast(x_dims_vec.size()) - 1]}); + std::vector y_expand_size({expand_batch_portion}); + y_expand_size.insert(y_expand_size.end(), + {y_dims_vec[static_cast(y_dims_vec.size()) - 2], + y_dims_vec[static_cast(y_dims_vec.size()) - 1]}); + + return std::make_tuple(x_expand_size, y_expand_size); +} + +template +static void linalg_solve(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + dev_ctx.template Alloc(out); + phi::funcs::MatrixSolveFunctor mat_solve; + + // input y can be vector or matrix + // but need to be unsqueezed if y is a vector + bool is_vector = false; + is_vector = is_vector_rhs(x, y); + + Tensor tmp_y; + if (is_vector) { + dev_ctx.Alloc(&tmp_y, y.dtype()); + + phi::Unsqueeze(dev_ctx, y, {-1}, &tmp_y, nullptr); + } else { + tmp_y.Resize(y.dims()); + dev_ctx.Alloc(&tmp_y, y.dtype()); + + phi::Copy(dev_ctx, y, dev_ctx.GetPlace(), false, &tmp_y); + } + + Tensor tmp_x; + tmp_x.Resize(x.dims()); + dev_ctx.Alloc(&tmp_x, x.dtype()); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &tmp_x); + + std::vector x_broadcast_dims; + std::vector y_broadcast_dims; + std::tie(x_broadcast_dims, y_broadcast_dims) = + get_broadcast_dims(tmp_x, tmp_y); + + Tensor tmp_x_bc; + + phi::ExpandAsKernel( + dev_ctx, tmp_x, nullptr, convert_to_int_vec(x_broadcast_dims), &tmp_x_bc); + + Tensor tmp_y_bc; + phi::ExpandAsKernel( + dev_ctx, tmp_y, nullptr, convert_to_int_vec(y_broadcast_dims), &tmp_y_bc); + + auto x_dim = x.dims(); + auto y_dim = y.dims(); + auto x_dim_size = x_dim.size(); + auto y_dim_size = y_dim.size(); + + if (is_vector) { // vector case + out->Resize(tmp_y_bc.dims()); // out.unsqueeze(-1) + mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); + + Tensor out_tmp; + out_tmp.Resize(out->dims()); + out_tmp = *out; + + phi::SqueezeKernel(dev_ctx, out_tmp, {-1}, out, nullptr); + } else { + PADDLE_ENFORCE_EQ( + x_dim[x_dim_size - 1], + y_dim[y_dim_size - 2], + phi::errors::InvalidArgument( + "Matrix X1 with dimension greater than 2 and any matrix Y1," + "the matrix X1's width must be equal with matrix Y1's " + "height. 
But received X's shape = [%s], X1's shape = [%s], X1's " + "width = %s; Y's shape = [%s], Y1's shape = [%s], Y1's height = " + "%s.", + x_dim, + x_dim, + x_dim[x_dim_size - 1], + y_dim, + y_dim, + y_dim[y_dim_size - 2])); + mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); + } +} + +template +void SolveKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + linalg_solve(dev_ctx, x, y, out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/merged_momentum_kernel.h b/paddle/phi/kernels/merged_momentum_kernel.h new file mode 100644 index 0000000000000..9f21b988b4bed --- /dev/null +++ b/paddle/phi/kernels/merged_momentum_kernel.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MergedMomentumKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, + float mu, + bool use_nesterov, + const std::vector& regularization_method, + const std::vector& regularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out); + +} // namespace phi diff --git a/paddle/phi/kernels/onednn/log_softmax_kernel.cc b/paddle/phi/kernels/onednn/log_softmax_kernel.cc new file mode 100644 index 0000000000000..254e975dd45ec --- /dev/null +++ b/paddle/phi/kernels/onednn/log_softmax_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
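Editorial note: the oneDNN kernel added below delegates to the library's logsoftmax primitive; along the resolved axis it computes, in the usual formulation,

\[
\operatorname{log\_softmax}(x)_i = x_i - \log \sum_j \exp(x_j),
\]

which is numerically safer than computing log(softmax(x)) in two steps. Negative axis values are normalized to axis + rank first, as the kernel does.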
+ +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +class LogSoftmaxMKLDNNHandler + : public paddle::platform:: + MKLDNNHandlerNoCachingT { + public: + LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, + Place cpu_place, + const DenseTensor& x, + const int axis) + : paddle::platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_inference, x.mem_desc(), axis); + } +}; + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + axis = axis >= 0 ? axis : x.dims().size() + axis; + + LogSoftmaxMKLDNNHandler handler( + mkldnn_engine, dev_ctx.GetPlace(), x, axis); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + auto dst_memory_p = handler.AcquireDstMemory(out); + + auto logsoftmax_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + logsoftmax_p->execute( + astream, {{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(log_softmax, + OneDNN, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h old mode 100644 new mode 100755 index f2d187f89b252..68eb11bd6d0b9 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -21,7 +21,17 @@ namespace phi { namespace kps { namespace details { -int RoundUpDiv(int n, int k) { return (n + k - 1) / k; } +static inline int RoundUpDiv(int n, int k) { return (n + k - 1) / k; } + +static inline int GetXpuReadLens(int numel, int block_num, int grid_num) { + const int buf_size = 256; + int nthreads = block_num * grid_num; + if (numel / nthreads == 1) { + return numel / nthreads * 4; + } + int read_lens = std::min(buf_size, RoundUpDiv(numel, 32 * nthreads) * 32); + return read_lens; +} enum class OptType { // Optimize type of calc after input shape compressed CanNotOptimize = -1, // can not optimize, broadcast first @@ -98,8 +108,10 @@ struct BroadcastConfig { strides_out_tmp[i] = strides_out_tmp[i - 1] * out_dims[i - 1]; } + int numel_out = 1; for (int i = 0; i < dim_size; i++) { dim_tmp[i] = in_dims[i]; + numel_out = out_dims[i] * numel_out; } kDims = dim_size; memcpy(strides_in, strides_in_tmp.data(), kDims * sizeof(int)); @@ -108,13 +120,25 @@ struct BroadcastConfig { cmp_res = get_mnk_for_broadcast_ops(in_dims, y_in_dims); get_opt_type(); - buf_len = get_buf_len(); + buf_len = get_buf_len(numel_out); + int numel_x = 1; + int numel_y = 1; + for (int i = 0; i < dim_size; i++) { + numel_x = in_dims[i] * numel_x; + numel_y = y_in_dims[i] * numel_y; + } + if (numel_out == numel_x && numel_out == numel_y) { + buf_len = GetXpuReadLens(numel_out, 8, 64); + } } - int get_buf_len() { + int get_buf_len(int numel) { if (cmp_type == OptType::CanNotOptimize) { return 256; } + if (cmp_type == OptType::N_1) { + return kps::details::GetXpuReadLens(numel, 8, 64); + } int max_buf_len = 512; int buf_len = m / 16 * 16; if (buf_len == 0) 
{ diff --git a/paddle/phi/kernels/solve_grad_kernel.h b/paddle/phi/kernels/solve_grad_kernel.h new file mode 100644 index 0000000000000..31bdb9932becc --- /dev/null +++ b/paddle/phi/kernels/solve_grad_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SolveGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + const DenseTensor& out, + DenseTensor* dx, + DenseTensor* dy); + +} // namespace phi diff --git a/paddle/phi/kernels/sparse/coalesced_kernel.h b/paddle/phi/kernels/solve_kernel.h similarity index 72% rename from paddle/phi/kernels/sparse/coalesced_kernel.h rename to paddle/phi/kernels/solve_kernel.h index 0755579a57ade..28dddb0f641bd 100644 --- a/paddle/phi/kernels/sparse/coalesced_kernel.h +++ b/paddle/phi/kernels/solve_kernel.h @@ -15,16 +15,13 @@ limitations under the License. */ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" namespace phi { -namespace sparse { template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out); +void SolveKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); -} // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/coalesce_kernel.h b/paddle/phi/kernels/sparse/coalesce_kernel.h new file mode 100644 index 0000000000000..cb8b98fd87404 --- /dev/null +++ b/paddle/phi/kernels/sparse/coalesce_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out); + +template +SparseCooTensor Coalesce(const Context& dev_ctx, const SparseCooTensor& x) { + SparseCooTensor coo; + CoalesceKernel(dev_ctx, x, &coo); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/conv_grad_kernel.h similarity index 53% rename from paddle/phi/kernels/sparse/convolution_grad_kernel.h rename to paddle/phi/kernels/sparse/conv_grad_kernel.h index eebfcddfc7a9e..205823e620375 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/conv_grad_kernel.h @@ -17,27 +17,26 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad); +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad); template -std::tuple Conv3dGrad( +std::tuple Conv3dCooGrad( const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, @@ -52,18 +51,18 @@ std::tuple Conv3dGrad( DenseTensor kernel_grad; // TODO(zhangkaihuo): call InferMeta func here - Conv3dGradKernel(dev_ctx, - x, - kernel, - rulebook, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - &x_grad, - &kernel_grad); + Conv3dCooGradKernel(dev_ctx, + x, + kernel, + rulebook, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + &x_grad, + &kernel_grad); return std::make_tuple(x_grad, kernel_grad); } diff --git a/paddle/phi/kernels/sparse/conv_kernel.h b/paddle/phi/kernels/sparse/conv_kernel.h new file mode 100644 index 0000000000000..fbff46d4390ba --- /dev/null +++ b/paddle/phi/kernels/sparse/conv_kernel.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook); + +template +SparseCooTensor Conv3dCoo(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + DenseTensor* rulebook) { + SparseCooTensor coo; + Conv3dCooKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + &coo, + rulebook); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h deleted file mode 100644 index 62a72a9dd4115..0000000000000 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" - -namespace phi { -namespace sparse { - -template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook); - -template -SparseCooTensor Conv3d(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - DenseTensor* rulebook) { - SparseCooTensor coo; - Conv3dKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - &coo, - rulebook); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc similarity index 87% rename from paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc rename to paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc index 9d1f71afceb5e..95d8abd6bcf5c 100644 --- a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
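For reference, a hypothetical call site for the renamed Conv3dCoo convenience wrapper declared above. This is a sketch only: it assumes a float element type, std::vector<int> layout parameters, and that dev_ctx, x (NDHWC COO tensor) and kernel (dense filter tensor) have been prepared by the caller.

// Hypothetical usage sketch, not part of the patch.
phi::DenseTensor rulebook;
std::vector<int> paddings{0, 0, 0}, dilations{1, 1, 1}, strides{1, 1, 1};
phi::sparse::SparseCooTensor out = phi::sparse::Conv3dCoo<float>(
    dev_ctx, x, kernel, paddings, dilations, strides,
    /*groups=*/1, /*subm=*/false, &rulebook);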
*/ -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" @@ -22,9 +22,9 @@ namespace phi { namespace sparse { template -void CoalescedCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -95,22 +95,22 @@ void CoalescedCPUKernel(const CPUContext& dev_ctx, } template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "CoalescedCPUKernel", ([&] { - CoalescedCPUKernel(dev_ctx, x, out); + x.non_zero_indices().dtype(), "CoalesceCPUKernel", ([&] { + CoalesceCPUKernel(dev_ctx, x, out); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sort, +PD_REGISTER_KERNEL(coalesce, CPU, ALL_LAYOUT, - phi::sparse::CoalescedKernel, + phi::sparse::CoalesceKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc similarity index 80% rename from paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc rename to paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc index a675853ac47c1..a8f4441eae897 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
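The CPU kernel above is now registered as "coalesce" (it was previously registered under the name "sort"). Semantically, coalescing merges duplicate COO indices and accumulates their values, leaving the surviving entries in sorted index order; a minimal standalone sketch of that behaviour for a 1-D tensor, independent of the phi kernels:

#include <cstdio>
#include <map>
#include <vector>

int main() {
  std::vector<long> indices = {4, 1, 4, 0, 1};
  std::vector<float> values = {1.f, 2.f, 3.f, 4.f, 5.f};

  std::map<long, float> merged;  // ordered map -> sorted, unique indices
  for (size_t i = 0; i < indices.size(); ++i) merged[indices[i]] += values[i];

  for (const auto& kv : merged)
    std::printf("(%ld, %.1f) ", kv.first, kv.second);
  std::printf("\n");  // prints: (0, 4.0) (1, 7.0) (4, 4.0)
}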
*/ -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -31,18 +31,18 @@ namespace sparse { // x_grad = out_grad * transpose(kenrel) // kernel_grad = transpose(x) * out_grad template -void Conv3dGradCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; @@ -178,42 +178,42 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, } template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGradCPUKernel", ([&] { - Conv3dGradCPUKernel(dev_ctx, - x, - kernel, - rulebook, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - x_grad, - kernel_grad); + x.non_zero_indices().dtype(), "Conv3dCooGradCPUKernel", ([&] { + Conv3dCooGradCPUKernel(dev_ctx, + x, + kernel, + rulebook, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + x_grad, + kernel_grad); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d_grad, +PD_REGISTER_KERNEL(conv3d_coo_grad, CPU, ALL_LAYOUT, - phi::sparse::Conv3dGradKernel, + phi::sparse::Conv3dCooGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc similarity index 83% rename from paddle/phi/kernels/sparse/cpu/convolution_kernel.cc rename to paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 1b95de890deeb..7147a29a9c832 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -27,16 +27,16 @@ namespace sparse { * out: (N, D, H, W, OC) **/ template -void Conv3dCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { 
+void Conv3dCooCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -151,28 +151,28 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, } template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { - Conv3dCPUKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - out, - rulebook); + x.non_zero_indices().dtype(), "Conv3dCooCPUKernel", ([&] { + Conv3dCooCPUKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + out, + rulebook); })); } @@ -180,6 +180,6 @@ void Conv3dKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - sparse_conv3d, CPU, ALL_LAYOUT, phi::sparse::Conv3dKernel, float, double) { + conv3d_coo, CPU, ALL_LAYOUT, phi::sparse::Conv3dCooKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index b2544619774c2..373087ade272b 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc index d9ebbd10267f5..972b4537b9554 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc new file mode 100644 index 0000000000000..416b715a9a6a2 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/fused_attention_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace sparse { + +template +void FusedAttentionCsrGradKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& softmax, + const DenseTensor& dout, + DenseTensor* dquery, + DenseTensor* dkey, + DenseTensor* dvalue) { + PD_THROW( + "Not support CPU kernel of 'sparse.nn.functional.fused_attention' now"); +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc b/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc new file mode 100644 index 0000000000000..11c9e2d5c2007 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/fused_attention_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace sparse { + +template +void FusedAttentionCsrKernel( + const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& sparse_mask, + const paddle::optional& key_padding_mask, + const paddle::optional& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax) { + PD_THROW( + "Not support CPU kernel of 'sparse.nn.functional.fused_attention' now"); +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc similarity index 99% rename from paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc rename to paddle/phi/kernels/sparse/cpu/mask_kernel.cc index cf2acd8557333..92c015101264c 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" +#include "paddle/phi/kernels/sparse/mask_kernel.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc index cd1665b66431b..2586976b7636c 100644 --- a/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc @@ -22,7 +22,7 @@ namespace sparse { // TODO(zhouwei25): implement CPU backward kernel of " CSR @ DENSE -> DENSE" template -void CsrDenseMatmulGradKernel(const Context& dev_ctx, +void MatmulCsrDenseGradKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, const DenseTensor& dout, @@ -34,7 +34,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx, // TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR" template -void CsrMaskedMatmulGradKernel(const Context& dev_ctx, +void MaskedMatmulCsrGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& dout, @@ -47,18 +47,18 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(csr_dense_matmul_grad, +PD_REGISTER_KERNEL(matmul_csr_dense_grad, CPU, ALL_LAYOUT, - phi::sparse::CsrDenseMatmulGradKernel, + phi::sparse::MatmulCsrDenseGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(csr_masked_matmul_grad, +PD_REGISTER_KERNEL(masked_matmul_csr_grad, CPU, ALL_LAYOUT, - phi::sparse::CsrMaskedMatmulGradKernel, + phi::sparse::MaskedMatmulCsrGradKernel, float, double) {} diff --git a/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc b/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc index 0818b8e900a05..8db0ccfd575e5 100644 --- a/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc @@ -22,7 +22,7 @@ namespace sparse { // TODO(zhouwei25): implement CPU kernel of " CSR @ DENSE -> DENSE" template -void CsrDenseMatmulKernel(const Context& dev_ctx, +void MatmulCsrDenseKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, DenseTensor* out) { @@ -32,7 +32,7 @@ void CsrDenseMatmulKernel(const Context& dev_ctx, // TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR" template -void CsrMaskedMatmulKernel(const Context& dev_ctx, +void MaskedMatmulCsrKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& mask, @@ -44,18 +44,18 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(csr_dense_matmul, +PD_REGISTER_KERNEL(matmul_csr_dense, CPU, ALL_LAYOUT, - phi::sparse::CsrDenseMatmulKernel, + phi::sparse::MatmulCsrDenseKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(csr_masked_matmul, +PD_REGISTER_KERNEL(masked_matmul_csr, CPU, ALL_LAYOUT, - phi::sparse::CsrMaskedMatmulKernel, + phi::sparse::MaskedMatmulCsrKernel, float, double) {} diff --git a/paddle/phi/kernels/sparse/cpu/unary_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/unary_grad_kernel.cc new file mode 100644 index 0000000000000..f8520db2cad6f --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/unary_grad_kernel.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
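The CPU bodies above remain TODO stubs; per the comments, the intended contract of masked_matmul_csr is "DENSE @ DENSE * CSR_MASK -> CSR", i.e. x @ y is evaluated only at the non-zero positions of the CSR mask. A small standalone sketch of that contract (plain C++, dense inputs, mask given as crows/cols arrays):

#include <cstdio>
#include <vector>

// out_values[k] = dot(row i of x, column j of y) for every (i, j) kept by the
// mask; positions outside the mask are never computed.
int main() {
  const int M = 2, K = 3, N = 2;
  std::vector<float> x = {1, 2, 3, 4, 5, 6};   // M x K, row-major
  std::vector<float> y = {1, 0, 0, 1, 1, 1};   // K x N, row-major
  std::vector<int> mask_crows = {0, 1, 2};     // one kept entry per row
  std::vector<int> mask_cols = {0, 1};         // kept positions: (0,0), (1,1)

  std::vector<float> out_values(mask_cols.size(), 0.f);
  for (int i = 0; i < M; ++i) {
    for (int k = mask_crows[i]; k < mask_crows[i + 1]; ++k) {
      const int j = mask_cols[k];
      for (int d = 0; d < K; ++d)
        out_values[k] += x[i * K + d] * y[d * N + j];
    }
  }
  std::printf("%.1f %.1f\n", out_values[0], out_values[1]);  // 4.0 11.0
}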
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" + +#define PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(name, prefix) \ + PD_REGISTER_KERNEL(name##_coo_grad, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CooGradKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(name##_csr_grad, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CsrGradKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(sin, Sin) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(tan, Tan) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(asin, Asin) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(atan, Atan) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(sinh, Sinh) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(tanh, Tanh) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(asinh, Asinh) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(atanh, Atanh) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(sqrt, Sqrt) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(square, Square) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(log1p, Log1p) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(relu, Relu) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(abs, Abs) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(pow, Pow) + +PD_REGISTER_KERNEL(cast_coo_grad, + CPU, + ALL_LAYOUT, + phi::sparse::CastCooGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(cast_csr_grad, + CPU, + ALL_LAYOUT, + phi::sparse::CastCsrGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/cpu/unary_kernel.cc b/paddle/phi/kernels/sparse/cpu/unary_kernel.cc new file mode 100644 index 0000000000000..1c1ece27d97d0 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/unary_kernel.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
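For readability, each PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(name, prefix) invocation above produces one COO and one CSR registration; for example, the sin entry expands roughly to:

// Approximate expansion of PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(sin, Sin):
PD_REGISTER_KERNEL(sin_coo_grad, CPU, ALL_LAYOUT,
                   phi::sparse::SinCooGradKernel, float, double) {
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}

PD_REGISTER_KERNEL(sin_csr_grad, CPU, ALL_LAYOUT,
                   phi::sparse::SinCsrGradKernel, float, double) {
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}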
+ +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" + +namespace phi { +namespace sparse { + +template +void DivCooScalarKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scalar, + SparseCooTensor* out) { + EmptyLikeCooKernel(dev_ctx, x, out); + + auto eigen_out = + phi::EigenVector::Flatten(*(out->mutable_non_zero_elements())); + auto eigen_x = phi::EigenVector::Flatten(x.non_zero_elements()); + auto& dev = *dev_ctx.eigen_device(); + + phi::funcs::EigenDiv, T>::Eval( + dev, eigen_out, eigen_x, static_cast(scalar)); +} + +template +void DivCsrScalarKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scalar, + SparseCsrTensor* out) { + EmptyLikeCsrKernel(dev_ctx, x, out); + + auto eigen_out = + phi::EigenVector::Flatten(*(out->mutable_non_zero_elements())); + auto eigen_x = phi::EigenVector::Flatten(x.non_zero_elements()); + auto& dev = *dev_ctx.eigen_device(); + + phi::funcs::EigenDiv, T>::Eval( + dev, eigen_out, eigen_x, static_cast(scalar)); +} + +} // namespace sparse +} // namespace phi + +#define PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(name, prefix) \ + PD_REGISTER_KERNEL(name##_coo, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CooKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(name##_csr, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CsrKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(sin, Sin) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(tan, Tan) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(asin, Asin) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(atan, Atan) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(sinh, Sinh) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(tanh, Tanh) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(asinh, Asinh) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(atanh, Atanh) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(sqrt, Sqrt) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(square, Square) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(log1p, Log1p) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(relu, Relu) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(abs, Abs) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(pow, Pow) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(scale, Scale) + +PD_REGISTER_KERNEL(divide_coo_scalar, + CPU, + ALL_LAYOUT, + phi::sparse::DivCooScalarKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(divide_csr_scalar, + CPU, + ALL_LAYOUT, + phi::sparse::DivCsrScalarKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +PD_REGISTER_KERNEL(cast_coo, + CPU, + ALL_LAYOUT, + phi::sparse::CastCooKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(cast_csr, + CPU, + ALL_LAYOUT, + phi::sparse::CastCsrKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc index fe7fb72b4caa6..115611a272d94 100644 --- a/paddle/phi/kernels/sparse/empty_kernel.cc +++ b/paddle/phi/kernels/sparse/empty_kernel.cc @@ -26,37 +26,27 @@ template void 
EmptyLikeCooKernel(const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out) { - const DenseTensor& x_indices = x.non_zero_indices(); + out->set_dims(x.dims()); + *(out->mutable_non_zero_indices()) = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); - DenseTensor* out_indices = out->mutable_non_zero_indices(); DenseTensor* out_values = out->mutable_non_zero_elements(); - - phi::Copy(dev_ctx, x_indices, dev_ctx.GetPlace(), false, out_indices); - out_values->Resize(x_values.dims()); dev_ctx.template Alloc(out_values); - - out->set_dims(x.dims()); } template void EmptyLikeCsrKernel(const Context& dev_ctx, const SparseCsrTensor& x, SparseCsrTensor* out) { - const DenseTensor& x_crows = x.non_zero_crows(); - const DenseTensor& x_cols = x.non_zero_cols(); + out->set_dims(x.dims()); + *(out->mutable_non_zero_crows()) = x.non_zero_crows(); + *(out->mutable_non_zero_cols()) = x.non_zero_cols(); + const DenseTensor& x_values = x.non_zero_elements(); - DenseTensor* out_crows = out->mutable_non_zero_crows(); - DenseTensor* out_cols = out->mutable_non_zero_cols(); DenseTensor* out_values = out->mutable_non_zero_elements(); - - phi::Copy(dev_ctx, x_crows, dev_ctx.GetPlace(), false, out_crows); - phi::Copy(dev_ctx, x_cols, dev_ctx.GetPlace(), false, out_cols); - out_values->Resize(x_values.dims()); dev_ctx.template Alloc(out_values); - - out->set_dims(x.dims()); } } // namespace sparse @@ -97,6 +87,7 @@ PD_REGISTER_KERNEL(empty_like_coo, GPU, ALL_LAYOUT, phi::sparse::EmptyLikeCooKernel, + phi::dtype::float16, float, double, int8_t, @@ -112,6 +103,7 @@ PD_REGISTER_KERNEL(empty_like_csr, GPU, ALL_LAYOUT, phi::sparse::EmptyLikeCsrKernel, + phi::dtype::float16, float, double, int8_t, diff --git a/paddle/phi/kernels/sparse/fused_attention_grad_kernel.h b/paddle/phi/kernels/sparse/fused_attention_grad_kernel.h new file mode 100644 index 0000000000000..0a025d21f94f3 --- /dev/null +++ b/paddle/phi/kernels/sparse/fused_attention_grad_kernel.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sparse { + +template +void FusedAttentionCsrGradKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& softmax, + const DenseTensor& dout, + DenseTensor* dquery, + DenseTensor* dkey, + DenseTensor* dvalue); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/fused_attention_kernel.h b/paddle/phi/kernels/sparse/fused_attention_kernel.h new file mode 100644 index 0000000000000..340fdce0196c3 --- /dev/null +++ b/paddle/phi/kernels/sparse/fused_attention_kernel.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sparse { + +template +void FusedAttentionCsrKernel( + const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& sparse_mask, + const paddle::optional& key_padding_mask, + const paddle::optional& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu similarity index 87% rename from paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu rename to paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index 405384009df89..f6aedb8b68fc3 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" @@ -27,9 +27,9 @@ namespace phi { namespace sparse { template -void CoalescedGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -55,11 +55,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); // 1. 
flatten indices @@ -117,11 +113,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(&out_nnz, out_indices.data(), sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + gpuMemcpyDeviceToHost, dev_ctx.stream()); dev_ctx.Wait(); @@ -161,22 +153,21 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, } template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "CoalescedGPUKernel", ([&] { - CoalescedGPUKernel(dev_ctx, x, out); + x.non_zero_indices().dtype(), "CoalesceGPUKernel", ([&] { + CoalesceGPUKernel(dev_ctx, x, out); })); } - } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sort, +PD_REGISTER_KERNEL(coalesce, GPU, ALL_LAYOUT, - phi::sparse::CoalescedKernel, + phi::sparse::CoalesceKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu similarity index 84% rename from paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu rename to paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index 1f82f2ff93e96..0ce3558e1d73f 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -39,18 +39,18 @@ namespace sparse { // x_grad = out_grad * transpose(kenrel) // kernel_grad = transpose(x) * out_grad template -void Conv3dGradGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; @@ -220,42 +220,42 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& 
dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGradGPUKernel", ([&] { - Conv3dGradGPUKernel(dev_ctx, - x, - kernel, - rulebook, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - x_grad, - kernel_grad); + x.non_zero_indices().dtype(), "Conv3dCooGradGPUKernel", ([&] { + Conv3dCooGradGPUKernel(dev_ctx, + x, + kernel, + rulebook, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + x_grad, + kernel_grad); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d_grad, +PD_REGISTER_KERNEL(conv3d_coo_grad, GPU, ALL_LAYOUT, - phi::sparse::Conv3dGradKernel, + phi::sparse::Conv3dCooGradKernel, float, double, phi::dtype::float16) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu similarity index 86% rename from paddle/phi/kernels/sparse/gpu/convolution_kernel.cu rename to paddle/phi/kernels/sparse/gpu/conv_kernel.cu index fe66fb5cff9de..6820b677147f3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" @@ -27,16 +27,16 @@ namespace phi { namespace sparse { template -void Conv3dGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { +void Conv3dCooGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -190,38 +190,38 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, * out: (N, D, H, W, OC) **/ template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { - Conv3dGPUKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - out, - rulebook); + x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] { + Conv3dCooGPUKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + out, + rulebook); })); } } // namespace sparse } // namespace phi 
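The Conv3dCooKernel wrapper above follows the dispatch pattern used throughout these files: PD_VISIT_INTEGRAL_TYPES switches on the runtime dtype of the COO indices and instantiates the typed kernel with the matching IntT. A standalone sketch of that pattern with hypothetical names:

#include <cstdint>
#include <cstdio>
#include <stdexcept>

enum class DataType { INT32, INT64 };

// Typed worker, analogous to Conv3dCooGPUKernel<T, IntT>.
template <typename IntT>
void TypedKernel(const void* indices, int64_t nnz) {
  const IntT* ptr = static_cast<const IntT*>(indices);
  std::printf("first index = %lld (sizeof(IntT) = %zu)\n",
              static_cast<long long>(ptr[0]), sizeof(IntT));
  (void)nnz;
}

// Untyped entry point, analogous to what PD_VISIT_INTEGRAL_TYPES generates.
void Kernel(DataType idx_dtype, const void* indices, int64_t nnz) {
  switch (idx_dtype) {
    case DataType::INT32: TypedKernel<int32_t>(indices, nnz); break;
    case DataType::INT64: TypedKernel<int64_t>(indices, nnz); break;
    default: throw std::runtime_error("unsupported index dtype");
  }
}

int main() {
  int64_t idx[] = {7, 11};
  Kernel(DataType::INT64, idx, 2);
}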
-PD_REGISTER_KERNEL(sparse_conv3d, +PD_REGISTER_KERNEL(conv3d_coo, GPU, ALL_LAYOUT, - phi::sparse::Conv3dKernel, + phi::sparse::Conv3dCooKernel, float, double, phi::dtype::float16) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index d56575cddbfe2..2591d24bfe443 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu new file mode 100644 index 0000000000000..4d31ad96cdd3b --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/fused_attention_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" +#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/matmul_grad_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void AttnSoftmaxGpuGradKernel(const int64_t* out_crows, + const T* out_values, + const T* dout_values, + T* dx_values, + int M, + int total_row_num, + float scale, + int batch_nnz) { + // dx = (dout - sum(dout * out)) * out + int row = blockIdx.x * blockDim.y + threadIdx.y; + if (row >= total_row_num) return; + + int cur_batch = row / M; + int crow_idx = cur_batch * (M + 1) + (row % M); + int row_first = cur_batch * batch_nnz + static_cast(out_crows[crow_idx]); + int row_nnz = static_cast(out_crows[crow_idx + 1] - out_crows[crow_idx]); + if (row_nnz == 0) return; + + int kIteration = (row_nnz + WARP_SIZE - 1) / WARP_SIZE; + T mul_result = 0; + for (int i = 0; i < kIteration; ++i) { + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + mul_result += out_values[row_first + idx] * dout_values[row_first + idx]; + } + T sum = phi::funcs::warpReduceSum(mul_result, 0xFFFFFFFF); + + for (int i = 0; i < kIteration; ++i) { + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + dx_values[row_first + idx] = (dout_values[row_first + idx] - sum) * + out_values[row_first + idx] / scale; + } +} + +template +void FusedAttentionCsrGradKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& softmax, + const DenseTensor& dout, + DenseTensor* 
dquery, + DenseTensor* dkey, + DenseTensor* dvalue) { +#if CUDA_VERSION >= 11070 + /* Step1: Forward: softmax{CSR} * value{Dense} -> out{Dense}, reuse */ + SparseCsrTensor dsoftmax; + CsrDenseMatmulGradKernel( + dev_ctx, softmax, value, dout, &dsoftmax, dvalue); + + /* Step2: Calculate grad of sdd_result, manualy not reuse */ + SparseCsrTensor d_sdd_result; + EmptyLikeCsrKernel(dev_ctx, dsoftmax, &d_sdd_result); + auto q_dim = query.dims(); + auto q_rank = q_dim.size(); + + int total_row_num = 1; + int batch_num = 1; + for (int i = 0; i < q_rank - 1; ++i) { + total_row_num *= q_dim[i]; + if (i < q_rank - 2) { + batch_num *= q_dim[i]; + } + } + int M = q_dim[q_rank - 2]; + int N = q_dim[q_rank - 1]; + int batch_nnz = softmax.nnz() / batch_num; + + dim3 grid((total_row_num + 3) / 4); + dim3 block(WARP_SIZE, 4); + + AttnSoftmaxGpuGradKernel<<>>( + softmax.non_zero_crows().data(), + softmax.non_zero_elements().data(), + dsoftmax.mutable_non_zero_elements()->data(), + d_sdd_result.mutable_non_zero_elements()->data(), + M, + total_row_num, + std::sqrt(N), + batch_nnz); + + /* Step3: Forward: query{Dense} * key'{Dense} -> sdd_result{SparseCsr} */ + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + // dquery{Dense} = d_sdd_result{SparseCsr} * key{Dense} // + dquery->Resize(query.dims()); + dev_ctx.template Alloc(dquery); + sparse_blas.SPMM(false, + false, + static_cast(1.f), + d_sdd_result, + key, + static_cast(0.f), + dquery); + + // dkey{Dense} = d_sdd_result'{SparseCsr} * query{Dense} // + dkey->Resize(key.dims()); + dev_ctx.template Alloc(dkey); + sparse_blas.SPMM(true, + false, + static_cast(1.f), + d_sdd_result, + query, + static_cast(0.f), + dkey); +#else + PADDLE_THROW( + phi::errors::Unimplemented("backward of 'sparse.nn.functional.attention' " + "use 'cusparseCsrSetStridedBatch', which is " + "completed supported from CUDA 11.7")); +#endif +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(fused_attention_csr_grad, + GPU, + ALL_LAYOUT, + phi::sparse::FusedAttentionCsrGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} diff --git a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu new file mode 100644 index 0000000000000..46412d57f16c7 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu @@ -0,0 +1,285 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
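As the comment in AttnSoftmaxGpuGradKernel above states, the per-row backward rule is dx = (dout - sum(dout * out)) * out, applied with the extra 1/scale factor from the forward pass. A standalone sketch of that computation for a single CSR row (plain C++, no warp reduction):

#include <cstdio>
#include <vector>

// dx[j] = (dout[j] - sum_k(dout[k] * out[k])) * out[j] / scale
std::vector<float> SoftmaxRowGrad(const std::vector<float>& out,
                                  const std::vector<float>& dout,
                                  float scale) {
  float dot = 0.f;
  for (size_t k = 0; k < out.size(); ++k) dot += dout[k] * out[k];
  std::vector<float> dx(out.size());
  for (size_t j = 0; j < out.size(); ++j)
    dx[j] = (dout[j] - dot) * out[j] / scale;
  return dx;
}

int main() {
  // out is one softmax row (sums to 1); scale is sqrt(head_dim) in the kernel.
  auto dx = SoftmaxRowGrad({0.2f, 0.3f, 0.5f}, {1.f, 0.f, -1.f}, 1.f);
  for (float v : dx) std::printf("%.3f ", v);  // 0.260 0.090 -0.350
  std::printf("\n");
}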
*/ + +#include "paddle/phi/kernels/sparse/fused_attention_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" +#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/matmul_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" + +namespace phi { +namespace sparse { + +#define PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, size, HINT, ...) \ + case size: { \ + constexpr int HINT = size; \ + __VA_ARGS__(); \ + break; \ + } + +#define VISIT_ATTN_SFOTMAX(SIZE, NAME, ...) \ + [&] { \ + const auto& __size__ = SIZE; \ + switch (__size__) { \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 1, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 2, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 3, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 4, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 8, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 12, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 16, KBufferSize, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for columns>512 "); \ + } \ + }() + +template +__global__ void AttnSoftmaxGpuKernel(const int64_t* x_crows, + const int64_t* x_cols, + const T* x_values, + const T* kp_mask, + const T* attn_mask, + T* out_values, + int M, + int total_row_num, + float scale, + int num_heads, + int batch_nnz) { + // out = exp(x-x_max) / sum(exp(x-x_max)) + int row = blockIdx.x * blockDim.y + threadIdx.y; + if (row >= total_row_num) return; + + int cur_batch = row / M; + int cur_row = row % M; + int crow_idx = cur_batch * (M + 1) + cur_row; + int row_first = cur_batch * batch_nnz + static_cast(x_crows[crow_idx]); + int row_nnz = static_cast(x_crows[crow_idx + 1] - x_crows[crow_idx]); + if (row_nnz == 0) return; + + T buffer[BufferSize] = {0}; + int kIteration = (row_nnz + WARP_SIZE - 1) / WARP_SIZE; + + T max_val = -std::numeric_limits::infinity(); + for (int i = 0; i < kIteration; ++i) { + bool mask = false; + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + int col_idx = static_cast(x_cols[row_first + idx]); + + if (kp_mask != nullptr && + kp_mask[(cur_batch / num_heads) * M + col_idx] == 0) { + mask = true; + } + if (attn_mask != nullptr && attn_mask[cur_row * M + col_idx] == 0) { + mask = true; + } + + if (!mask) { + buffer[i] = x_values[row_first + idx] / scale; + if (buffer[i] > max_val) { + max_val = buffer[i]; + } + } + } + T row_max_val = phi::funcs::warpReduceMax(max_val, 0xFFFFFFFF); + + auto functor = phi::funcs::CudaExpFunctor(); + T exp_sum = 0; + for (int i = 0; i < kIteration; ++i) { + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + if (buffer[i]) { + T exp = functor(buffer[i] - row_max_val); + exp_sum += exp; + buffer[i] = exp; + } + } + T row_exp_sum = phi::funcs::warpReduceSum(exp_sum, 0xFFFFFFFF); + + for (int i = 0; i < kIteration; ++i) { + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + if (buffer[i]) { + out_values[row_first + idx] = buffer[i] / row_exp_sum; + } else { + out_values[row_first + idx] = static_cast(0); + } + } +} + +template +void FusedAttentionCsrKernel( + const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& 
value, + const SparseCsrTensor& sparse_mask, + const paddle::optional& key_padding_mask, + const paddle::optional& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax) { +#if CUDA_VERSION >= 11070 + /* Check Shape */ + auto q_dim = query.dims(); + auto q_rank = q_dim.size(); + + int total_row_num = 1; + int batch_num = 1; + for (int i = 0; i < q_rank - 1; ++i) { + total_row_num *= q_dim[i]; + if (i < q_rank - 2) { + batch_num *= q_dim[i]; + } + } + int M = q_dim[q_rank - 2]; + int N = q_dim[q_rank - 1]; + + PADDLE_ENFORCE_EQ(query.dims().size(), + 4, + phi::errors::InvalidArgument(" 'query' must be 4D Tensor")); + PADDLE_ENFORCE_EQ(key.dims().size(), + 4, + phi::errors::InvalidArgument(" 'key' must be 4D Tensor")); + PADDLE_ENFORCE_EQ(value.dims().size(), + 4, + phi::errors::InvalidArgument(" 'value' must be 4D Tensor")); + + PADDLE_ENFORCE_EQ( + sparse_mask.dims().size(), + 3, + phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " + "[batch_size*num_heads, seq_len, seq_len]")); + PADDLE_ENFORCE_EQ( + sparse_mask.dims()[0], + q_dim[0] * q_dim[1], + phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " + "[batch_size*num_heads, seq_len, seq_len]")); + PADDLE_ENFORCE_EQ( + sparse_mask.dims()[1], + M, + phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " + "[batch_size*num_heads, seq_len, seq_len]")); + PADDLE_ENFORCE_EQ( + sparse_mask.dims()[2], + M, + phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " + "[batch_size*num_heads, seq_len, seq_len]")); + + const auto kp_mask_ptr = key_padding_mask.get_ptr(); + if (kp_mask_ptr) { + PADDLE_ENFORCE_EQ( + kp_mask_ptr->dims().size(), + 2, + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + PADDLE_ENFORCE_EQ( + kp_mask_ptr->dims()[0], + q_dim[0], + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + PADDLE_ENFORCE_EQ( + kp_mask_ptr->dims()[1], + M, + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + } + + const auto attn_mask_ptr = attn_mask.get_ptr(); + if (attn_mask_ptr) { + PADDLE_ENFORCE_EQ(attn_mask_ptr->dims().size(), + 2, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + PADDLE_ENFORCE_EQ(attn_mask_ptr->dims()[0], + M, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + PADDLE_ENFORCE_EQ(attn_mask_ptr->dims()[1], + M, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + } + + /* Step1: SDD Matmul, reuse */ + SparseCsrTensor sdd_result; + EmptyLikeCsrKernel(dev_ctx, sparse_mask, &sdd_result); + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + sparse_blas.SDDMM(false, + true, + static_cast(1), + query, + key, + static_cast(0), + &sdd_result); + + /* Step2: Softmax with kp_mask/attn_mask, manualy not reuse */ + EmptyLikeCsrKernel(dev_ctx, sdd_result, softmax); + + int buffer_size; + if (M < 128) { + buffer_size = (M + 32 - 1) / 32; + } else { + buffer_size = ((M + 128 - 1) / 128) * 4; + } + + dim3 grid((total_row_num + 3) / 4); + dim3 block(WARP_SIZE, 4); + + int batch_nnz = sdd_result.nnz() / batch_num; + + VISIT_ATTN_SFOTMAX(buffer_size, "AttnSoftmaxGpuKernel", [&] { + AttnSoftmaxGpuKernel<<>>( + sdd_result.non_zero_crows().data(), + sdd_result.non_zero_cols().data(), + sdd_result.non_zero_elements().data(), + kp_mask_ptr ? kp_mask_ptr->data() : nullptr, + attn_mask_ptr ? 
attn_mask_ptr->data() : nullptr, + softmax->mutable_non_zero_elements()->data(), + M, + total_row_num, + std::sqrt(N), + q_dim[1], + batch_nnz); + }); + + /* Step3: DSD Matmul, reuse */ + softmax->set_dims(phi::make_ddim({q_dim[0], q_dim[1], q_dim[2], q_dim[2]})); + CsrDenseMatmulKernel(dev_ctx, *softmax, value, out); +#else + PADDLE_THROW( + phi::errors::Unimplemented("forward of 'sparse.nn.functional.attention' " + "use 'cusparseCsrSetStridedBatch', which is " + "completed supported from CUDA 11.7")); +#endif +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(fused_attention_csr, + GPU, + ALL_LAYOUT, + phi::sparse::FusedAttentionCsrKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu similarity index 72% rename from paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu rename to paddle/phi/kernels/sparse/gpu/mask_kernel.cu index 21d6850bdc4aa..39fa89c0379b7 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu @@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" - -#include +#include "paddle/phi/kernels/sparse/mask_kernel.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" @@ -24,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" @@ -72,11 +71,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), &h_sparse_offsets[0], sizeof(int64_t) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); @@ -93,14 +88,15 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); - MaskKernel<<>>( - x_ptr, - indices_ptr, - sparse_offsets.data(), - non_zero_num, - cols, - sparse_dim, - out_values_ptr); + MaskKernel + <<>>( + x_ptr, + indices_ptr, + sparse_offsets.data(), + non_zero_num, + cols, + sparse_dim, + out_values_ptr); out->SetMember(out_indices, out_values, dims, true); } @@ -121,19 +117,31 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -template -__global__ void SparseMaskCopyKernel(const IntT* x_indexs, - const IntT* mask_indexs, - const IntT* bound_out, - const T* x_values, - const int64_t n, - const int64_t stride, - T* out_values) { +template +__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int index = x_indexs[i]; + table[index] = i == 0 ? 
-1 : i; + } +} + +template +__global__ void MaskCopy(const IntT* mask_indexs, + const int* table, + const int n, + const int stride, + const T* x_values, + T* out_values) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - const IntT j = bound_out[i]; - if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { - for (int k = 0; k < stride; k++) { - out_values[i * stride + k] = x_values[j * stride + k]; + int j = table[mask_indexs[i]]; + if (j != 0) { + if (j == -1) j = 0; + for (int k = 0; k < stride; k += VecSize) { + LoadT vec_x; + phi::Load(x_values + j * stride + k, &vec_x); + phi::Store(vec_x, out_values + i * stride + k); } } } @@ -179,11 +187,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); // 3. flatten x indices and mask indices @@ -210,37 +214,54 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, mask_indexs.numel(), sparse_dim, mask_indexs_ptr); -// 4. call thrust::lower_bound -#ifdef PADDLE_WITH_HIP - thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), -#endif - x_indexs_ptr, - x_indexs_ptr + x_indexs.numel(), - mask_indexs_ptr, - mask_indexs_ptr + mask_indexs.numel(), - bound_out_ptr); - // 5. copy value to out + int table_size = 1; + auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size() - 1; i++) { + table_size *= x_dims[i]; + } + DenseTensor table = phi::Empty(dev_ctx, {table_size}); + phi::backends::gpu::GpuMemsetAsync( + table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); T* out_ptr = out->data(); - - const int64_t stride = - x.dims().size() == sparse_dim ? 
1 : x.non_zero_elements().dims()[1]; - - SparseMaskCopyKernel<<>>(x_indexs_ptr, - mask_indexs_ptr, - bound_out_ptr, - x.non_zero_elements().data(), - mask_indexs.numel(), - stride, - out_ptr); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + MaskTable<<>>( + x_indexs_ptr, x_indexs.numel(), table.data()); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + const int VecBytes = 16; + const int VecSize = VecBytes / sizeof(T); + if (stride % VecSize == 0) { + MaskCopy + <<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); + } else { + MaskCopy<<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); + } } template @@ -257,7 +278,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_mask, +PD_REGISTER_KERNEL(mask, GPU, ALL_LAYOUT, phi::sparse::SparseMaskKernel, @@ -272,7 +293,7 @@ PD_REGISTER_KERNEL(sparse_mask, kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } -PD_REGISTER_KERNEL(sparse_mask_helper, +PD_REGISTER_KERNEL(mask_helper, GPU, ALL_LAYOUT, phi::sparse::SparseMaskHelperKernel, diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index d5c128fea6f29..c4bb66827e35a 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -22,13 +22,52 @@ limitations under the License. */ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { namespace sparse { template -void CsrDenseMatmulGradKernel(const Context& dev_ctx, +void MatmulCooDenseGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + SparseCooTensor* dx, + DenseTensor* dy) { +#if CUDA_VERSION >= 11030 + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + + // dx{SparseCoo} = dout{Dense} * y'{Dense} + if (dx) { + // 'cusparseSDDMM' only support CSR now, so use COO->CSR->COO, + // which will increase some expenses. 
+ EmptyLikeCooKernel(dev_ctx, x, dx); + SparseCsrTensor dx_csr = SparseCooToCsr(dev_ctx, *dx); + sparse_blas.SDDMM( + false, true, static_cast(1), dout, y, static_cast(0), &dx_csr); + SparseCsrToCooKernel(dev_ctx, dx_csr, dx); + } + + // dy{Dense} = x'{SparseCoo} * dout{Dense} + if (dy) { + MetaTensor meta_dy(dy); + meta_dy.set_dims(y.dims()); + meta_dy.set_dtype(y.dtype()); + dev_ctx.template Alloc(dy); + + sparse_blas.SPMM( + true, false, static_cast(1), x, dout, static_cast(0), dy); + } +#else + PADDLE_THROW(phi::errors::Unimplemented( + "backward of 'sparse.matmul' use cusparseSDDMM, which is supported from " + "CUDA 11.3")); +#endif +} + +template +void MatmulCsrDenseGradKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, const DenseTensor& dout, @@ -66,7 +105,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx, } template -void CsrMaskedMatmulGradKernel(const Context& dev_ctx, +void MaskedMatmulCsrGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& dout, @@ -119,18 +158,27 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(csr_dense_matmul_grad, +PD_REGISTER_KERNEL(matmul_coo_dense_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCooDenseGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(matmul_csr_dense_grad, GPU, ALL_LAYOUT, - phi::sparse::CsrDenseMatmulGradKernel, + phi::sparse::MatmulCsrDenseGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(csr_masked_matmul_grad, +PD_REGISTER_KERNEL(masked_matmul_csr_grad, GPU, ALL_LAYOUT, - phi::sparse::CsrMaskedMatmulGradKernel, + phi::sparse::MaskedMatmulCsrGradKernel, float, double) {} diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 9357bbd2ad083..3adbce0dd17df 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -31,11 +31,11 @@ limitations under the License. 
*/ namespace phi { namespace sparse { -template -void CsrDenseMatmulKernel(const Context& dev_ctx, - const SparseCsrTensor& x, - const DenseTensor& y, - DenseTensor* out) { +template +void MatmulKernelImpl(const Context& dev_ctx, + const TensorType& x, + const DenseTensor& y, + DenseTensor* out) { #if CUDA_VERSION >= 11000 std::vector xdim_vec = phi::vectorize(x.dims()); std::vector ydim_vec = phi::vectorize(y.dims()); @@ -76,7 +76,7 @@ void CsrDenseMatmulKernel(const Context& dev_ctx, out_dim_vec[y_ndims - 1] = ydim_vec[y_ndims - 1]; MetaTensor meta_out(out); meta_out.set_dims(phi::make_ddim(out_dim_vec)); - meta_out.set_dtype(x.non_zero_elements().dtype()); + meta_out.set_dtype(y.dtype()); dev_ctx.template Alloc(out); @@ -91,7 +91,23 @@ void CsrDenseMatmulKernel(const Context& dev_ctx, } template -void CsrMaskedMatmulKernel(const Context& dev_ctx, +void MatmulCooDenseKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& y, + DenseTensor* out) { + MatmulKernelImpl(dev_ctx, x, y, out); +} + +template +void MatmulCsrDenseKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const DenseTensor& y, + DenseTensor* out) { + MatmulKernelImpl(dev_ctx, x, y, out); +} + +template +void MaskedMatmulCsrKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& mask, @@ -176,18 +192,27 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(csr_dense_matmul, +PD_REGISTER_KERNEL(matmul_csr_dense, GPU, ALL_LAYOUT, - phi::sparse::CsrDenseMatmulKernel, + phi::sparse::MatmulCsrDenseKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(csr_masked_matmul, +PD_REGISTER_KERNEL(matmul_coo_dense, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCooDenseKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(masked_matmul_csr, GPU, ALL_LAYOUT, - phi::sparse::CsrMaskedMatmulKernel, + phi::sparse::MaskedMatmulCsrKernel, float, double) {} diff --git a/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu index 9c9f5cfbca545..ee0671b333f81 100644 --- a/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu @@ -52,8 +52,9 @@ __global__ void SoftmaxGpuKernel(const IntT* x_crows, int idx = non_zero_idx + i * warpSize; if (idx >= row_nnz) break; - if (max_val < x_values[row_first + idx]) { - max_val = x_values[row_first + idx]; + T val = x_values[row_first + idx]; + if (val > max_val) { + max_val = val; } } T row_max_val = phi::funcs::warpReduceMax(max_val, 0xFFFFFFFF); diff --git a/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu new file mode 100644 index 0000000000000..be0f13fb0e538 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
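Note on the matmul hunks above: the former CSR-only CsrDenseMatmulKernel becomes a MatmulKernelImpl shared by the COO and CSR entry points, and the output meta now takes its dtype from the dense operand y rather than from x's values. The shape rule it applies is the usual batched one. A small framework-free sketch of that rule, for reference only (the function name here is illustrative, not part of the patch):

#include <cassert>
#include <cstdint>
#include <vector>

// out keeps x's leading (batch and row) dims and takes y's trailing dim,
// mirroring the out_dim_vec computation in MatmulKernelImpl.
std::vector<int64_t> MatmulOutDims(const std::vector<int64_t>& x_dims,
                                   const std::vector<int64_t>& y_dims) {
  assert(x_dims.size() >= 2 && x_dims.size() == y_dims.size());
  assert(x_dims.back() == y_dims[y_dims.size() - 2]);  // x[..., m, k] @ y[..., k, n]
  std::vector<int64_t> out_dims(x_dims);
  out_dims.back() = y_dims.back();                     // -> out[..., m, n]
  return out_dims;
}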
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" + +#define PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(name, prefix) \ + PD_REGISTER_KERNEL(name##_coo_grad, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CooGradKernel, \ + phi::dtype::float16, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(name##_csr_grad, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CsrGradKernel, \ + phi::dtype::float16, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(sin, Sin) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(tan, Tan) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(asin, Asin) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(atan, Atan) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(sinh, Sinh) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(tanh, Tanh) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(asinh, Asinh) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(atanh, Atanh) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(sqrt, Sqrt) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(square, Square) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(log1p, Log1p) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(relu, Relu) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(abs, Abs) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(pow, Pow) + +PD_REGISTER_KERNEL(cast_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::CastCooGradKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(cast_csr_grad, + GPU, + ALL_LAYOUT, + phi::sparse::CastCsrGradKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu new file mode 100644 index 0000000000000..6358b7b983576 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
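Note on the registration macro above: PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL stamps out one COO and one CSR registration per op. For example, the invocation for sin expands roughly to the following, written out here only to illustrate what the macro generates:

PD_REGISTER_KERNEL(sin_coo_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::SinCooGradKernel,
                   phi::dtype::float16,
                   float,
                   double) {
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}

PD_REGISTER_KERNEL(sin_csr_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::SinCsrGradKernel,
                   phi::dtype::float16,
                   float,
                   double) {
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}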
+ +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" + +namespace phi { +namespace sparse { + +template +struct DivScalarFunctor { + T value_; + + explicit DivScalarFunctor(T value) : value_(value) {} + + __device__ __forceinline__ T operator()(const T x) const { + return x / value_; + } +}; + +template +void DivCooScalarKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scalar, + SparseCooTensor* out) { + EmptyLikeCooKernel(dev_ctx, x, out); + + std::vector ins = {&(x.non_zero_elements())}; + std::vector outs = {out->mutable_non_zero_elements()}; + DivScalarFunctor func(static_cast(scalar)); + funcs::ElementwiseKernel>(dev_ctx, ins, &outs, func); +} + +template +void DivCsrScalarKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scalar, + SparseCsrTensor* out) { + EmptyLikeCsrKernel(dev_ctx, x, out); + + std::vector ins = {&(x.non_zero_elements())}; + std::vector outs = {out->mutable_non_zero_elements()}; + DivScalarFunctor func(static_cast(scalar)); + funcs::ElementwiseKernel>(dev_ctx, ins, &outs, func); +} + +} // namespace sparse +} // namespace phi + +#define PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(name, prefix) \ + PD_REGISTER_KERNEL(name##_coo, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CooKernel, \ + phi::dtype::float16, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(name##_csr, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CsrKernel, \ + phi::dtype::float16, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(sin, Sin) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(tan, Tan) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(asin, Asin) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(atan, Atan) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(sinh, Sinh) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(tanh, Tanh) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(asinh, Asinh) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(atanh, Atanh) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(sqrt, Sqrt) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(square, Square) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(log1p, Log1p) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(relu, Relu) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(abs, Abs) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(pow, Pow) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(scale, Scale) + +PD_REGISTER_KERNEL(divide_coo_scalar, + GPU, + ALL_LAYOUT, + phi::sparse::DivCooScalarKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(divide_csr_scalar, + GPU, + ALL_LAYOUT, + phi::sparse::DivCsrScalarKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +PD_REGISTER_KERNEL(cast_coo, + GPU, + ALL_LAYOUT, + phi::sparse::CastCooKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(cast_csr, + GPU, + ALL_LAYOUT, + phi::sparse::CastCsrKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h new file mode 100644 index 0000000000000..ffc5f6bbacae3 --- /dev/null +++ 
b/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/abs_grad_kernel.h" +#include "paddle/phi/kernels/activation_grad_kernel.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" + +namespace phi { +namespace sparse { + +#define DEFINE_SPARSE_UNARY_GRAD_KERNEL(prefix) \ + template \ + void prefix##CooGradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& dout, \ + SparseCooTensor* dx) { \ + EmptyLikeCooKernel(dev_ctx, x_or_out, dx); \ + phi::prefix##GradKernel(dev_ctx, \ + x_or_out.non_zero_elements(), \ + dout.non_zero_elements(), \ + dx->mutable_non_zero_elements()); \ + } \ + \ + template \ + void prefix##CsrGradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& dout, \ + SparseCsrTensor* dx) { \ + EmptyLikeCsrKernel(dev_ctx, x_or_out, dx); \ + phi::prefix##GradKernel(dev_ctx, \ + x_or_out.non_zero_elements(), \ + dout.non_zero_elements(), \ + dx->mutable_non_zero_elements()); \ + } + +#define DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(prefix, attr) \ + template \ + void prefix##CooGradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& dout, \ + float attr, \ + SparseCooTensor* dx) { \ + EmptyLikeCooKernel(dev_ctx, x_or_out, dx); \ + phi::prefix##GradKernel(dev_ctx, \ + x_or_out.non_zero_elements(), \ + dout.non_zero_elements(), \ + attr, \ + dx->mutable_non_zero_elements()); \ + } \ + \ + template \ + void prefix##CsrGradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& dout, \ + float attr, \ + SparseCsrTensor* dx) { \ + EmptyLikeCsrKernel(dev_ctx, x_or_out, dx); \ + phi::prefix##GradKernel(dev_ctx, \ + x_or_out.non_zero_elements(), \ + dout.non_zero_elements(), \ + attr, \ + dx->mutable_non_zero_elements()); \ + } + +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Sin) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Tan) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Asin) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Atan) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Sinh) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Tanh) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Asinh) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Atanh) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Sqrt) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Square) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Log1p) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Relu) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Abs) +DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor) + +template +void CastCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& dout, + DataType value_dtype, + SparseCooTensor* dx) { + EmptyLikeCooKernel(dev_ctx, x, 
dx); + if (value_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, + dout.non_zero_elements(), + dev_ctx.GetPlace(), + false, + dx->mutable_non_zero_elements()); + } else { + phi::CastKernel(dev_ctx, + dout.non_zero_elements(), + x.non_zero_elements().dtype(), + dx->mutable_non_zero_elements()); + } +} + +template +void CastCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& dout, + DataType value_dtype, + SparseCsrTensor* dx) { + EmptyLikeCsrKernel(dev_ctx, x, dx); + if (value_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, + dout.non_zero_elements(), + dev_ctx.GetPlace(), + false, + dx->mutable_non_zero_elements()); + } else { + phi::CastKernel(dev_ctx, + dout.non_zero_elements(), + x.non_zero_elements().dtype(), + dx->mutable_non_zero_elements()); + } +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h new file mode 100644 index 0000000000000..2639753266db6 --- /dev/null +++ b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h @@ -0,0 +1,207 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/abs_kernel.h" +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/scale_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/trunc_kernel.h" + +namespace phi { +namespace sparse { + +#define DEFINE_SPARSE_UNARY_KERNEL(prefix) \ + template \ + void prefix##CooKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + SparseCooTensor* out) { \ + EmptyLikeCooKernel(dev_ctx, x, out); \ + phi::prefix##Kernel( \ + dev_ctx, x.non_zero_elements(), out->mutable_non_zero_elements()); \ + } \ + \ + template \ + void prefix##CsrKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + SparseCsrTensor* out) { \ + EmptyLikeCsrKernel(dev_ctx, x, out); \ + phi::prefix##Kernel( \ + dev_ctx, x.non_zero_elements(), out->mutable_non_zero_elements()); \ + } + +#define DEFINE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(prefix, attr) \ + template \ + void prefix##CooKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + float attr, \ + SparseCooTensor* out) { \ + EmptyLikeCooKernel(dev_ctx, x, out); \ + phi::prefix##Kernel(dev_ctx, \ + x.non_zero_elements(), \ + attr, \ + out->mutable_non_zero_elements()); \ + } \ + \ + template \ + void prefix##CsrKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + float attr, \ + SparseCsrTensor* out) { \ + EmptyLikeCsrKernel(dev_ctx, x, out); \ + phi::prefix##Kernel(dev_ctx, \ + x.non_zero_elements(), \ + attr, \ + out->mutable_non_zero_elements()); \ + 
} + +DEFINE_SPARSE_UNARY_KERNEL(Sin) +DEFINE_SPARSE_UNARY_KERNEL(Tan) +DEFINE_SPARSE_UNARY_KERNEL(Asin) +DEFINE_SPARSE_UNARY_KERNEL(Atan) +DEFINE_SPARSE_UNARY_KERNEL(Sinh) +DEFINE_SPARSE_UNARY_KERNEL(Tanh) +DEFINE_SPARSE_UNARY_KERNEL(Asinh) +DEFINE_SPARSE_UNARY_KERNEL(Atanh) +DEFINE_SPARSE_UNARY_KERNEL(Sqrt) +DEFINE_SPARSE_UNARY_KERNEL(Square) +DEFINE_SPARSE_UNARY_KERNEL(Log1p) +DEFINE_SPARSE_UNARY_KERNEL(Relu) +DEFINE_SPARSE_UNARY_KERNEL(Abs) +DEFINE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(Pow, factor) + +template +void ScaleCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scale, + float bias, + bool bias_after_scale, + SparseCooTensor* out) { + EmptyLikeCooKernel(dev_ctx, x, out); + phi::ScaleKernel(dev_ctx, + x.non_zero_elements(), + scale, + bias, + bias_after_scale, + out->mutable_non_zero_elements()); +} + +template +void ScaleCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scale, + float bias, + bool bias_after_scale, + SparseCsrTensor* out) { + EmptyLikeCsrKernel(dev_ctx, x, out); + phi::ScaleKernel(dev_ctx, + x.non_zero_elements(), + scale, + bias, + bias_after_scale, + out->mutable_non_zero_elements()); +} + +template +void CastCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + DataType index_dtype, + DataType value_dtype, + SparseCooTensor* out) { + out->set_dims(x.dims()); + + const DenseTensor& x_indices = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor* out_indices = out->mutable_non_zero_indices(); + DenseTensor* out_values = out->mutable_non_zero_elements(); + + if (index_dtype == DataType::UNDEFINED) { + *out_indices = x_indices; + } else { + phi::MetaTensor meta(out_indices); + meta.set_dims(x_indices.dims()); + meta.set_dtype(index_dtype); + + PD_VISIT_INTEGRAL_TYPES(x_indices.dtype(), "CastCooKernel", [&] { + phi::CastKernel( + dev_ctx, x_indices, index_dtype, out_indices); + }); + } + + if (value_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, x_values, dev_ctx.GetPlace(), false, out_values); + } else { + phi::MetaTensor meta(out_values); + meta.set_dims(x_values.dims()); + meta.set_dtype(value_dtype); + phi::CastKernel(dev_ctx, x_values, value_dtype, out_values); + } +} + +template +void CastCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + DataType index_dtype, + DataType value_dtype, + SparseCsrTensor* out) { + out->set_dims(x.dims()); + + const DenseTensor& x_crows = x.non_zero_crows(); + const DenseTensor& x_cols = x.non_zero_cols(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor* out_crows = out->mutable_non_zero_crows(); + DenseTensor* out_cols = out->mutable_non_zero_cols(); + DenseTensor* out_values = out->mutable_non_zero_elements(); + + if (index_dtype == DataType::UNDEFINED) { + *out_crows = x_crows; + *out_cols = x_cols; + } else { + phi::MetaTensor crows_meta(out_crows); + crows_meta.set_dims(x_crows.dims()); + crows_meta.set_dtype(index_dtype); + + PD_VISIT_INTEGRAL_TYPES(x_crows.dtype(), "CastCsrKernel", [&] { + phi::CastKernel( + dev_ctx, x_crows, index_dtype, out_crows); + }); + + phi::MetaTensor cols_meta(out_cols); + cols_meta.set_dims(x_cols.dims()); + cols_meta.set_dtype(index_dtype); + + PD_VISIT_INTEGRAL_TYPES(x_cols.dtype(), "CastCsrKernel", [&] { + phi::CastKernel(dev_ctx, x_cols, index_dtype, out_cols); + }); + } + + if (value_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, x_values, dev_ctx.GetPlace(), false, out_values); + } else { + phi::MetaTensor meta(out_values); + meta.set_dims(x_values.dims()); + 
meta.set_dtype(value_dtype); + phi::CastKernel(dev_ctx, x_values, value_dtype, out_values); + } +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_mask_kernel.h b/paddle/phi/kernels/sparse/mask_kernel.h similarity index 100% rename from paddle/phi/kernels/sparse/sparse_mask_kernel.h rename to paddle/phi/kernels/sparse/mask_kernel.h diff --git a/paddle/phi/kernels/sparse/matmul_grad_kernel.h b/paddle/phi/kernels/sparse/matmul_grad_kernel.h index 787691f3515d6..4acb7bb7e1eb5 100644 --- a/paddle/phi/kernels/sparse/matmul_grad_kernel.h +++ b/paddle/phi/kernels/sparse/matmul_grad_kernel.h @@ -23,16 +23,16 @@ namespace sparse { // TODO(zhouwei25): implement Backward of " COO @ COO -> COO" template -void CooCooMatmulGradKernel(const Context& dev_ctx, +void MatmulCooCooGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const SparseCooTensor& y, const SparseCooTensor& dout, SparseCooTensor* dx, SparseCooTensor* dy); -// TODO(zhouwei25): implement Backward of " COO @ DENSE -> DENSE" +// Backward of " COO @ DENSE -> DENSE" template -void CooDenseMatmulGradKernel(const Context& dev_ctx, +void MatmulCooDenseGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& y, const DenseTensor& dout, @@ -41,7 +41,7 @@ void CooDenseMatmulGradKernel(const Context& dev_ctx, // TODO(zhouwei25): implement Backward of " CSR @ CSR -> CSR" template -void CsrCsrMatmulGradKernel(const Context& dev_ctx, +void MatmulCsrCsrGradKernel(const Context& dev_ctx, const SparseCsrTensor& x, const SparseCsrTensor& y, const SparseCsrTensor& dout, @@ -50,7 +50,7 @@ void CsrCsrMatmulGradKernel(const Context& dev_ctx, /* Backward of "CSR @ DENSE -> DENSE" */ template -void CsrDenseMatmulGradKernel(const Context& dev_ctx, +void MatmulCsrDenseGradKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, const DenseTensor& dout, @@ -59,7 +59,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx, /* Backward of "DENSE @ DENSE * CSR_MASK -> CSR" */ template -void CsrMaskedMatmulGradKernel(const Context& dev_ctx, +void MaskedMatmulCsrGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& dout, diff --git a/paddle/phi/kernels/sparse/matmul_kernel.h b/paddle/phi/kernels/sparse/matmul_kernel.h index d9093a020c207..a261bbf3cd3f7 100644 --- a/paddle/phi/kernels/sparse/matmul_kernel.h +++ b/paddle/phi/kernels/sparse/matmul_kernel.h @@ -23,35 +23,35 @@ namespace sparse { // TODO(zhouwei25): implement " COO @ COO -> COO" template -void CooCooMatmulKernel(const Context& dev_ctx, +void MatmulCooCooKernel(const Context& dev_ctx, const SparseCooTensor& x, const SparseCooTensor& y, SparseCooTensor* out); -// TODO(zhouwei25): implement " COO @ DENSE -> DENSE" +/* COO @ DENSE -> DENSE */ template -void CooDenseMatmulKernel(const Context& dev_ctx, +void MatmulCooDenseKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& y, DenseTensor* out); // TODO(zhouwei25): implement " CSR @ CSR -> CSR" template -void CsrCsrMatmulKernel(const Context& dev_ctx, +void MatmulCsrCsrKernel(const Context& dev_ctx, const SparseCsrTensor& x, const SparseCsrTensor& y, SparseCsrTensor* out); /* CSR @ DENSE -> DENSE */ template -void CsrDenseMatmulKernel(const Context& dev_ctx, +void MatmulCsrDenseKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, DenseTensor* out); /* DENSE @ DENSE * CSR_MASK -> CSR */ template -void CsrMaskedMatmulKernel(const Context& dev_ctx, +void 
MaskedMatmulCsrKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& mask, diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 69677be34b231..9425c14b79b36 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h index a00b9c275c292..7cf97c3f48ece 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" +#include "paddle/phi/kernels/sparse/mask_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 93abf70b24412..12d55596a935d 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" namespace phi { namespace sparse { @@ -154,9 +153,8 @@ void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { - SparseCooTensor before_coalesced( - indices, values, phi::make_ddim(dense_shape.GetData())); - CoalescedKernel(dev_ctx, before_coalesced, out); + *out = + SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); } } // namespace sparse diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.cc b/paddle/phi/kernels/sparse/unary_grad_kernel.cc deleted file mode 100644 index cd844532e938f..0000000000000 --- a/paddle/phi/kernels/sparse/unary_grad_kernel.cc +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
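Note on the sparse_utils_kernel.h hunk above: SparseCooTensorKernel now wraps the given indices and values directly instead of routing them through CoalescedKernel, so duplicate indices are no longer merged at construction time. For readers unfamiliar with the term, here is a standalone sketch of what a COO coalesce step does; values at repeated indices are accumulated, which is the usual convention, though the exact semantics of Paddle's CoalescedKernel are not shown in this patch:

#include <cstdint>
#include <map>
#include <vector>

// Merge duplicate indices of a 1-D COO tensor, summing their values.
void CoalesceDemo(std::vector<int64_t>* indices, std::vector<float>* values) {
  std::map<int64_t, float> merged;
  for (size_t i = 0; i < indices->size(); ++i) {
    merged[(*indices)[i]] += (*values)[i];
  }
  indices->clear();
  values->clear();
  for (const auto& kv : merged) {
    indices->push_back(kv.first);
    values->push_back(kv.second);
  }
}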
- -#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/core/sparse_csr_tensor.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/activation_grad_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#define DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ - namespace phi { \ - namespace sparse { \ - \ - template \ - void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ - const SparseCooTensor& x_or_out, \ - const SparseCooTensor& out_grad, \ - SparseCooTensor* x_grad) { \ - DenseTensor non_zero_indices = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_indices()); \ - DenseTensor non_zero_elements = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ - phi::Copy(dev_ctx, \ - x_or_out.non_zero_indices(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_indices); \ - phi::DenseKernelFunc(dev_ctx, \ - x_or_out.non_zero_elements(), \ - out_grad.non_zero_elements(), \ - &non_zero_elements); \ - x_grad->SetMember( \ - non_zero_indices, non_zero_elements, x_or_out.dims(), true); \ - } \ - \ - template \ - void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ - const SparseCsrTensor& x_or_out, \ - const SparseCsrTensor& out_grad, \ - SparseCsrTensor* out) { \ - DenseTensor non_zero_crows = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_crows()); \ - DenseTensor non_zero_cols = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_cols()); \ - DenseTensor non_zero_elements = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ - phi::Copy(dev_ctx, \ - x_or_out.non_zero_crows(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_crows); \ - phi::Copy(dev_ctx, \ - x_or_out.non_zero_cols(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_cols); \ - phi::DenseKernelFunc(dev_ctx, \ - x_or_out.non_zero_elements(), \ - out_grad.non_zero_elements(), \ - &non_zero_elements); \ - out->SetMember( \ - non_zero_crows, non_zero_cols, non_zero_elements, x_or_out.dims()); \ - } \ - } \ - } - -#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ - CPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCoo##DenseKernelFunc, \ - float, \ - double) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ - } \ - PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ - CPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCsr##DenseKernelFunc, \ - float, \ - double) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ - } - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ - GPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCoo##DenseKernelFunc, \ - float, \ - double, \ - phi::dtype::float16) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ - } \ - \ - PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ - GPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCsr##DenseKernelFunc, \ - float, \ - double, \ - phi::dtype::float16) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ - } -#else -// This macro definition is empty when GPU is disabled -#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) -#endif - -#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - 
REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) - -#define DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(kernel_name, \ - DenseKernelFunc) \ - DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ - REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) - -// NOTE: the following code is to bypass the restriction of Paddle -// kernel registration mechanism. Do NOT refactor them unless you -// know what you are doing. -// If you want to implement any new kernel, please follow `sin_grad`, -// `tanh_grad` etc, do NOT follow the following `relu_grad`. -DEFINE_SPARSE_UNARY_GRAD_KERNEL(ReluGradKernel) - -PD_REGISTER_KERNEL(sparse_coo_relu_grad, - CPU, - ALL_LAYOUT, - phi::sparse::SparseCooReluGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -PD_REGISTER_KERNEL(sparse_csr_relu_grad, - CPU, - ALL_LAYOUT, - phi::sparse::SparseCsrReluGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_coo_relu_grad, - GPU, - ALL_LAYOUT, - phi::sparse::SparseCooReluGradKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -PD_REGISTER_KERNEL(sparse_csr_relu_grad, - GPU, - ALL_LAYOUT, - phi::sparse::SparseCsrReluGradKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} -#endif - -DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sin_grad, SinGradKernel) -DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) -DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(tanh_grad, TanhGradKernel) diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.h b/paddle/phi/kernels/sparse/unary_grad_kernel.h index 24ea4fee1a4fd..eb2cf9ed697e9 100644 --- a/paddle/phi/kernels/sparse/unary_grad_kernel.h +++ b/paddle/phi/kernels/sparse/unary_grad_kernel.h @@ -17,25 +17,65 @@ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" -#define DECLARE_SPARSE_UNARY_GRAD_KERNEL(name) \ - template \ - void SparseCoo##name##GradKernel(const Context& dev_ctx, \ - const SparseCooTensor& x, \ - const SparseCooTensor& out_grad, \ - SparseCooTensor* x_grad); \ - \ - template \ - void SparseCsr##name##GradKernel(const Context& dev_ctx, \ - const SparseCsrTensor& x, \ - const SparseCsrTensor& out_grad, \ - SparseCsrTensor* x_grad); - namespace phi { namespace sparse { +#define DECLARE_SPARSE_UNARY_GRAD_KERNEL(prefix) \ + template \ + void prefix##CooGradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& dout, \ + SparseCooTensor* dx); \ + \ + template \ + void prefix##CsrGradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& dout, \ + SparseCsrTensor* dx); + +#define DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(prefix, attr) \ + template \ + void prefix##CooGradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& dout, \ + float attr, \ + SparseCooTensor* dx); \ + \ + template \ + void prefix##CsrGradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& dout, \ + float attr, \ + SparseCsrTensor* dx); + +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sin) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Tan) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Asin) 
+DECLARE_SPARSE_UNARY_GRAD_KERNEL(Atan) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sinh) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Asinh) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Atanh) DECLARE_SPARSE_UNARY_GRAD_KERNEL(Relu) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Tanh) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Square) DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sqrt) -DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sin) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Log1p) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Abs) +DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor) + +template +void CastCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& dout, + DataType value_dtype, + SparseCooTensor* dx); + +template +void CastCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& dout, + DataType value_dtype, + SparseCsrTensor* dx); } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_kernel.cc b/paddle/phi/kernels/sparse/unary_kernel.cc deleted file mode 100644 index 2999536b34ee9..0000000000000 --- a/paddle/phi/kernels/sparse/unary_kernel.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/sparse/unary_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/core/sparse_csr_tensor.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/activation_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#define DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ - namespace phi { \ - namespace sparse { \ - \ - template \ - void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ - const SparseCooTensor& x, \ - SparseCooTensor* out) { \ - DenseTensor non_zero_indices = \ - phi::EmptyLike(dev_ctx, x.non_zero_indices()); \ - DenseTensor non_zero_elements = \ - phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ - phi::Copy(dev_ctx, \ - x.non_zero_indices(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_indices); \ - phi::DenseKernelFunc( \ - dev_ctx, x.non_zero_elements(), &non_zero_elements); \ - out->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); \ - } \ - \ - template \ - void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ - const SparseCsrTensor& x, \ - SparseCsrTensor* out) { \ - DenseTensor non_zero_crows = \ - phi::EmptyLike(dev_ctx, x.non_zero_crows()); \ - DenseTensor non_zero_cols = \ - phi::EmptyLike(dev_ctx, x.non_zero_cols()); \ - DenseTensor non_zero_elements = \ - phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ - phi::Copy(dev_ctx, \ - x.non_zero_crows(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_crows); \ - phi::Copy(dev_ctx, \ - x.non_zero_cols(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_cols); \ - phi::DenseKernelFunc( \ - dev_ctx, x.non_zero_elements(), &non_zero_elements); \ - 
out->SetMember( \ - non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); \ - } \ - } \ - } - -#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ - CPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCoo##DenseKernelFunc, \ - float, \ - double) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ - } \ - PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ - CPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCsr##DenseKernelFunc, \ - float, \ - double) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ - } - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ - GPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCoo##DenseKernelFunc, \ - float, \ - double, \ - phi::dtype::float16) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ - } \ - \ - PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ - GPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCsr##DenseKernelFunc, \ - float, \ - double, \ - phi::dtype::float16) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ - } -#else -// This macro definition is empty when GPU is disabled -#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) -#endif - -#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) - -#define DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ - REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) - -// NOTE: the following code is to bypass the restriction of Paddle -// kernel registration mechanism. Do NOT refactor them unless you -// know what you are doing. -// If you want to implement any new kernel, please follow `sin`, -// `tanh` etc, do NOT follow `sqrt`. 
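Note on the deletions here: the ad-hoc SparseCoo##DenseKernelFunc machinery in unary_kernel.cc and unary_grad_kernel.cc is replaced by the header-only DEFINE_SPARSE_UNARY_KERNEL macros added earlier in this patch. To make the new scheme concrete, DEFINE_SPARSE_UNARY_KERNEL(Sin) generates roughly the following COO kernel; the template parameter spelling is reconstructed, so treat this as a sketch rather than the literal expansion:

template <typename T, typename Context>
void SinCooKernel(const Context& dev_ctx,
                  const SparseCooTensor& x,
                  SparseCooTensor* out) {
  // reuse x's indices and allocate matching values for the result
  EmptyLikeCooKernel<T, Context>(dev_ctx, x, out);
  // apply the dense Sin kernel to the non-zero values only
  phi::SinKernel<T, Context>(dev_ctx,
                             x.non_zero_elements(),
                             out->mutable_non_zero_elements());
}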
-DEFINE_SPARSE_UNARY_KERNEL(SqrtKernel) - -PD_REGISTER_KERNEL(sparse_coo_sqrt, - CPU, - ALL_LAYOUT, - phi::sparse::SparseCooSqrtKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -PD_REGISTER_KERNEL(sparse_csr_sqrt, - CPU, - ALL_LAYOUT, - phi::sparse::SparseCsrSqrtKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_coo_sqrt, - GPU, - ALL_LAYOUT, - phi::sparse::SparseCooSqrtKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -PD_REGISTER_KERNEL(sparse_csr_sqrt, - GPU, - ALL_LAYOUT, - phi::sparse::SparseCsrSqrtKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} - -#endif - -DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(sin, SinKernel) -DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(tanh, TanhKernel) -DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(relu, ReluKernel) diff --git a/paddle/phi/kernels/sparse/unary_kernel.h b/paddle/phi/kernels/sparse/unary_kernel.h index 4470173c143db..fdb6b21a44427 100644 --- a/paddle/phi/kernels/sparse/unary_kernel.h +++ b/paddle/phi/kernels/sparse/unary_kernel.h @@ -14,35 +14,104 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" -#include "paddle/phi/kernels/activation_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" -#define DECLARE_SPARSE_UNARY_KERNEL(name) \ +namespace phi { +namespace sparse { + +#define DECLARE_SPARSE_UNARY_KERNEL(prefix) \ template \ - void SparseCoo##name##Kernel( \ + void prefix##CooKernel( \ const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out); \ \ template \ - void SparseCsr##name##Kernel( \ + void prefix##CsrKernel( \ const Context& dev_ctx, const SparseCsrTensor& x, SparseCsrTensor* out); -namespace phi { -namespace sparse { +#define DECLARE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(prefix, attr) \ + template \ + void prefix##CooKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + float attr, \ + SparseCooTensor* out); \ + \ + template \ + void prefix##CsrKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + float attr, \ + SparseCsrTensor* out); +DECLARE_SPARSE_UNARY_KERNEL(Sin) +DECLARE_SPARSE_UNARY_KERNEL(Tan) +DECLARE_SPARSE_UNARY_KERNEL(Asin) +DECLARE_SPARSE_UNARY_KERNEL(Atan) +DECLARE_SPARSE_UNARY_KERNEL(Sinh) +DECLARE_SPARSE_UNARY_KERNEL(Asinh) +DECLARE_SPARSE_UNARY_KERNEL(Atanh) DECLARE_SPARSE_UNARY_KERNEL(Relu) +DECLARE_SPARSE_UNARY_KERNEL(Tanh) +DECLARE_SPARSE_UNARY_KERNEL(Square) DECLARE_SPARSE_UNARY_KERNEL(Sqrt) -DECLARE_SPARSE_UNARY_KERNEL(Sin) +DECLARE_SPARSE_UNARY_KERNEL(Log1p) +DECLARE_SPARSE_UNARY_KERNEL(Abs) +DECLARE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(Pow, factor) + +template +void ScaleCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scale, + float bias, + bool bias_after_scale, + SparseCooTensor* out); + +template +void ScaleCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scale, + float bias, + bool bias_after_scale, + SparseCsrTensor* out); template -SparseCooTensor SparseRelu(const Context& dev_ctx, const SparseCooTensor& x) { - DenseTensor indices, values; - SparseCooTensor coo(indices, values, x.dims()); - SparseCooReluKernel(dev_ctx, x, &coo); +void DivCooScalarKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scalar, 
+ SparseCooTensor* out); + +template +void DivCsrScalarKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scalar, + SparseCsrTensor* out); + +template +void CastCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + DataType index_dtype, + DataType value_dtype, + SparseCooTensor* out); + +template +void CastCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + DataType index_dtype, + DataType value_dtype, + SparseCsrTensor* out); + +template +SparseCooTensor ReluCoo(const Context& dev_ctx, const SparseCooTensor& x) { + SparseCooTensor coo; + ReluCooKernel(dev_ctx, x, &coo); return coo; } +template +SparseCooTensor ReluCsr(const Context& dev_ctx, const SparseCooTensor& x) { + SparseCooTensor csr; + ReluCsrKernel(dev_ctx, x, &csr); + return csr; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/unique_consecutive_kernel.h b/paddle/phi/kernels/unique_consecutive_kernel.h new file mode 100644 index 0000000000000..ade35d4d49730 --- /dev/null +++ b/paddle/phi/kernels/unique_consecutive_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void UniqueConsecutiveKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + DenseTensor* out, + DenseTensor* index, + DenseTensor* counts); + +} // namespace phi diff --git a/paddle/phi/kernels/unsqueeze_kernel.h b/paddle/phi/kernels/unsqueeze_kernel.h index 4622a9b0a859c..62ba878c056cb 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.h +++ b/paddle/phi/kernels/unsqueeze_kernel.h @@ -17,6 +17,7 @@ #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { @@ -26,4 +27,16 @@ void UnsqueezeKernel(const Context& dev_ctx, const IntArray& axes, DenseTensor* out, DenseTensor* xshape); + +template +void Unsqueeze(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + DenseTensor* out, + DenseTensor* xshape) { + MetaTensor meta_out(out); + UnsqueezeInferMeta(x, axes, &meta_out, nullptr, MetaConfig()); + UnsqueezeKernel(dev_ctx, x, axes, out, nullptr); +} + } // namespace phi diff --git a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc new file mode 100644 index 0000000000000..34d39b0a83da2 --- /dev/null +++ b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
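Note on unary_kernel.h above: the old SparseRelu helper is replaced by ReluCoo (and a ReluCsr counterpart) that allocate the result and invoke the kernel in one call. A hedged usage sketch follows; dev_ctx and x are assumed to be an already-initialised device context and a populated SparseCooTensor coming from surrounding framework code:

// Apply sparse ReLU through the new convenience helper; the output reuses
// x's sparsity pattern and only the non-zero values pass through ReLU.
phi::SparseCooTensor y = phi::sparse::ReluCoo<float>(dev_ctx, x);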
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +template +void GradAddXPUKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto x_shape = phi::vectorize(x.dims()); + auto y_shape = phi::vectorize(y.dims()); + int r = xpu::broadcast_add(dev_ctx.x_context(), + x.data(), + y.data(), + out->data(), + x_shape, + y_shape); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); +} + +} // namespace phi + +PD_REGISTER_KERNEL(grad_add, XPU, ALL_LAYOUT, phi::GradAddXPUKernel, float) {} diff --git a/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc new file mode 100644 index 0000000000000..c9165f3ef7d7e --- /dev/null +++ b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = out.dims().size(); + axis = funcs::CanonicalAxis(axis, rank); + + if (out.numel() != 0) { + auto out_shape = phi::vectorize(out.dims()); + dev_ctx.template Alloc(x_grad); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + T* tmp_ptr = RAII_GUARD.alloc_l3_or_gm(out_grad.numel()); + T* tmp2_ptr = RAII_GUARD.alloc_l3_or_gm(out_grad.numel()); + PADDLE_ENFORCE_NE( + tmp_ptr, nullptr, phi::errors::External("no enough memory in xpu")); + PADDLE_ENFORCE_NE( + tmp2_ptr, nullptr, phi::errors::External("no enough memory in xpu")); + + int r = + xpu::exp(dev_ctx.x_context(), out.data(), tmp_ptr, out_grad.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp"); + r = xpu::reciprocal( + dev_ctx.x_context(), tmp_ptr, tmp2_ptr, out_grad.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal"); + r = xpu::mul(dev_ctx.x_context(), + tmp2_ptr, + out_grad.data(), + tmp2_ptr, + out_grad.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul"); + r = xpu::softmax_grad(dev_ctx.x_context(), + tmp_ptr, + tmp2_ptr, + x_grad->data(), + out_shape, + axis); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad"); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + log_softmax_grad, XPU, ALL_LAYOUT, phi::LogSoftmaxGradKernel, float) {} diff --git a/paddle/phi/kernels/xpu/log_softmax_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_kernel.cc new file mode 100644 index 0000000000000..1f084d0e6cbf7 --- /dev/null +++ b/paddle/phi/kernels/xpu/log_softmax_kernel.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
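Note on the XPU log_softmax backward above: the sequence exp, reciprocal, mul, softmax_grad is the chain rule written with existing XDNN primitives. With y = log_softmax(x) and s = softmax(x) = exp(y):

  dL/ds_i = dL/dy_i * (1 / s_i)
  dL/dx   = softmax_grad(s, dL/ds)

so the kernel recovers s = exp(out), forms dL/ds as out_grad / s via reciprocal and mul, and hands both to xpu::softmax_grad, which is equivalent to the closed form dL/dx_i = dL/dy_i - s_i * sum_j dL/dy_j.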
+ +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + axis = funcs::CanonicalAxis(axis, rank); + + if (x.numel() != 0) { + auto x_shape = phi::vectorize(x.dims()); + dev_ctx.template Alloc(out); + if (axis < 0) axis += rank; + int r = xpu::softmax( + dev_ctx.x_context(), x.data(), out->data(), x_shape, axis); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax"); + r = xpu::log( + dev_ctx.x_context(), out->data(), out->data(), out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "log"); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(log_softmax, XPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float) { +} diff --git a/paddle/phi/ops/compat/complex_sig.cc b/paddle/phi/ops/compat/complex_sig.cc index 88156677d34df..da47e2c7bc750 100644 --- a/paddle/phi/ops/compat/complex_sig.cc +++ b/paddle/phi/ops/compat/complex_sig.cc @@ -24,7 +24,14 @@ KernelSignature ImagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("imag_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); } +KernelSignature ComplexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "complex_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +} + } // namespace phi PD_REGISTER_ARG_MAPPING_FN(real_grad, phi::RealGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(imag_grad, phi::ImagGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(complex_grad, phi::ComplexGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc index 49f31288d00f6..68bd54609cb03 100644 --- a/paddle/phi/ops/compat/conv3d_sig.cc +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -32,7 +32,7 @@ KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("conv2d_grad", + return KernelSignature("conv3d_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", "paddings", diff --git a/paddle/phi/ops/compat/eigvals_sig.cc b/paddle/phi/ops/compat/eigvals_sig.cc new file mode 100644 index 0000000000000..cb29126abc39f --- /dev/null +++ b/paddle/phi/ops/compat/eigvals_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature EigvalsOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("eigvals", {"X"}, {}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(eigvals, phi::EigvalsOpArgumentMapping); diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/identity_loss_sig.cc similarity index 57% rename from paddle/phi/ops/compat/diag_sig.cc rename to paddle/phi/ops/compat/identity_loss_sig.cc index b232c714c9710..aa9516bd1ec4f 100644 --- a/paddle/phi/ops/compat/diag_sig.cc +++ b/paddle/phi/ops/compat/identity_loss_sig.cc @@ -16,19 +16,19 @@ namespace phi { -KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); +KernelSignature IdentityLossOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("identity_loss", {"X"}, {"reduction"}, {"Out"}); } -KernelSignature DiagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { +KernelSignature IdentityLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { return KernelSignature( - "diag_grad", {"X", "Out@GRAD"}, {"offset"}, {"X@GRAD"}); + "identity_loss_grad", {"X", "Out@GRAD"}, {"reduction"}, {"X@GRAD"}); } } // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); -PD_REGISTER_BASE_KERNEL_NAME(diag_v2_grad, diag_grad); - -PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(diag_v2_grad, phi::DiagGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(identity_loss, phi::IdentityLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(identity_loss_grad, + phi::IdentityLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/merged_adam_sig.cc b/paddle/phi/ops/compat/merged_adam_sig.cc new file mode 100644 index 0000000000000..38f56bad08d85 --- /dev/null +++ b/paddle/phi/ops/compat/merged_adam_sig.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
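The compat signature files above (eigvals_sig.cc, identity_loss_sig.cc) and the ones that follow all share one pattern: an argument-mapping function builds a KernelSignature that lists the legacy operator's named Inputs, Attrs and Outputs in the order the phi kernel expects, and PD_REGISTER_ARG_MAPPING_FN registers it under the operator type. A minimal sketch of that pattern with an invented operator name ("my_scale" does not exist in Paddle and is used only to show the shape of such a file):

// Hypothetical example: "my_scale" is an invented op name; the include and
// macros mirror the *_sig.cc files added in this change.
#include "paddle/phi/core/compat/op_utils.h"

namespace phi {

KernelSignature MyScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
  //                     kernel name   inputs   attributes  outputs
  return KernelSignature("my_scale", {"X"}, {"scale"}, {"Out"});
}

}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(my_scale, phi::MyScaleOpArgumentMapping);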
+#include + +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +KernelSignature MergedAdamOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector in_names = {"Param", + "Grad", + "LearningRate", + "Moment1", + "Moment2", + "Beta1Pow", + "Beta2Pow", + "MasterParam"}; + paddle::small_vector out_names = {"ParamOut", + "Moment1Out", + "Moment2Out", + "Beta1PowOut", + "Beta2PowOut", + "MasterParamOut"}; + paddle::small_vector attr_names = { + "beta1", "beta2", "epsilon", "multi_precision", "use_global_beta_pow"}; + + return KernelSignature("merged_adam", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(merged_adam, phi::MergedAdamOpArgumentMapping); diff --git a/paddle/phi/ops/compat/merged_momentum_sig.cc b/paddle/phi/ops/compat/merged_momentum_sig.cc new file mode 100644 index 0000000000000..3444d5e2d3097 --- /dev/null +++ b/paddle/phi/ops/compat/merged_momentum_sig.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MergedMomentumOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "merged_momentum", + {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, + {"mu", + "use_nesterov", + "regularization_method", + "regularization_coeff", + "multi_precision", + "rescale_grad"}, + { + "ParamOut", + "VelocityOut", + "MasterParamOut", + }); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(merged_momentum, + phi::MergedMomentumOpArgumentMapping); diff --git a/paddle/phi/ops/compat/dist_sig.cc b/paddle/phi/ops/compat/solve_sig.cc similarity index 75% rename from paddle/phi/ops/compat/dist_sig.cc rename to paddle/phi/ops/compat/solve_sig.cc index cc702fefbc940..9771adee8e983 100644 --- a/paddle/phi/ops/compat/dist_sig.cc +++ b/paddle/phi/ops/compat/solve_sig.cc @@ -16,11 +16,11 @@ limitations under the License. */ namespace phi { -KernelSignature DistGradOpArgumentMapping(const ArgumentMappingContext& ctx) { +KernelSignature SolveGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature( - "dist_grad", {"X", "Y", "Out", "Out@GRAD"}, {"p"}, {"X@GRAD", "Y@GRAD"}); + "solve_grad", {"X", "Y", "Out@GRAD", "Out"}, {}, {"X@GRAD", "Y@GRAD"}); } } // namespace phi -PD_REGISTER_ARG_MAPPING_FN(dist_grad, phi::DistGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(solve_grad, phi::SolveGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/unique_consecutive_sig.cc b/paddle/phi/ops/compat/unique_consecutive_sig.cc new file mode 100644 index 0000000000000..f085858d8cb0d --- /dev/null +++ b/paddle/phi/ops/compat/unique_consecutive_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
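A recurring change in the test hunks further below is dropping the explicit dev_ctx.Init() / dev_ctx_cpu.Init() calls (and deleting the standalone test_device_context.cc that exercised Init()): the tests now only attach an allocator before allocating. The fragment below restates the setup those hunks keep; headers and tensor metadata are omitted, and only the calls that appear verbatim in the hunks are shown, so treat it as a sketch rather than a standalone file.

// Setup pattern kept by the updated tests (sketch, not a complete program):
phi::CPUContext dev_ctx;
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                         .GetAllocator(paddle::platform::CPUPlace())
                         .get());
// No dev_ctx.Init() call any more; allocation is done directly afterwards,
// e.g. auto* data = dev_ctx.Alloc<float>(&dense_x);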
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UniqueConsecutiveOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("unique_consecutive", + {"X"}, + {"return_inverse", "return_counts", "axis", "dtype"}, + {"Out", "Index", "Counts"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(unique_consecutive, + phi::UniqueConsecutiveOpArgumentMapping); diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index bbdb2f70d7fd3..95f4afe4d1540 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/sparse_coo_tensor.h" -PD_DECLARE_KERNEL(sparse_conv3d, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv3d_coo, CPU, ALL_LAYOUT); template void TestConv3dBase(const std::vector& indices, diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc index e02017555111c..d5891baaf10a2 100644 --- a/paddle/phi/tests/api/test_sparse_utils_api.cc +++ b/paddle/phi/tests/api/test_sparse_utils_api.cc @@ -48,7 +48,6 @@ TEST(API, to_sparse_coo) { std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); // 1. 
test dense_to_sparse_coo paddle::experimental::Tensor x(dense_x); diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index c1550e31fae88..415c1f21465ed 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -39,8 +39,8 @@ TEST(Backend, OStream) { oss << phi::Backend::NPU; EXPECT_EQ(oss.str(), "NPU"); oss.str(""); - oss << phi::Backend::MKLDNN; - EXPECT_EQ(oss.str(), "MKLDNN"); + oss << phi::Backend::ONEDNN; + EXPECT_EQ(oss.str(), "ONEDNN"); oss.str(""); oss << phi::Backend::GPUDNN; EXPECT_EQ(oss.str(), "GPUDNN"); @@ -63,7 +63,7 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::GPU, pexp::StringToBackend("GPU")); EXPECT_EQ(phi::Backend::XPU, pexp::StringToBackend("XPU")); EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); - EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); + EXPECT_EQ(phi::Backend::ONEDNN, pexp::StringToBackend("OneDNN")); EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EXPECT_EQ(phi::Backend::GPU, pexp::StringToBackend("KPS")); diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu index 50b9e198da08b..e985f1c417de3 100644 --- a/paddle/phi/tests/common/test_scalar.cu +++ b/paddle/phi/tests/common/test_scalar.cu @@ -47,7 +47,6 @@ TEST(Scalar, ConstructFromDenseTensor1) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -67,7 +66,6 @@ TEST(Scalar, ConstructFromDenseTensor2) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -87,7 +85,6 @@ TEST(Scalar, ConstructFromDenseTensor3) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -107,7 +104,6 @@ TEST(Scalar, ConstructFromDenseTensor4) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = true; @@ -127,7 +123,6 @@ TEST(Scalar, ConstructFromDenseTensor5) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -148,7 +143,6 @@ TEST(Scalar, ConstructFromDenseTensor6) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index c299559da5914..3d549aa5f160c 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -24,10 +24,6 @@ cc_test( test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos) -cc_test( - test_phi_device_context - SRCS test_device_context.cc - DEPS phi_context cpu_context) cc_test( test_meta_fn_utils SRCS test_meta_fn_utils.cc diff --git a/paddle/phi/tests/core/test_device_context.cc b/paddle/phi/tests/core/test_device_context.cc deleted file mode 100644 
index 844330ee097ef..0000000000000 --- a/paddle/phi/tests/core/test_device_context.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "gtest/gtest.h" - -// TODO(wilber): will remove after the cpu, gpu context megre. -#include "paddle/phi/backends/cpu/cpu_context.h" -// #include "paddle/phi/backends/all_context.h" - -// NOTE: The paddle framework should add WITH_EIGEN option to support compile -// without eigen. -#include "unsupported/Eigen/CXX11/Tensor" - -namespace phi { -namespace tests { - -class InferenceCPUContext : public CPUContext { - public: - void SetEigenDevice(Eigen::DefaultDevice* eigen_device) { - CPUContext::SetEigenDevice(eigen_device); - } -}; - -TEST(DeviceContext, cpu_context) { - std::cout << "test training scenarios" << std::endl; - { - phi::CPUContext ctx; - ctx.Init(); - EXPECT_TRUE(ctx.eigen_device() != nullptr); - } - - std::cout << "test inference scenarios" << std::endl; - Eigen::DefaultDevice* device = new Eigen::DefaultDevice(); - { - InferenceCPUContext ctx; - ctx.SetEigenDevice(device); - EXPECT_TRUE(ctx.eigen_device() != nullptr); - } - delete device; -} - -} // namespace tests -} // namespace phi diff --git a/paddle/phi/tests/kernels/test_cast_dev_api.cc b/paddle/phi/tests/kernels/test_cast_dev_api.cc index 179e44f0f0f12..d43cd075ed590 100644 --- a/paddle/phi/tests/kernels/test_cast_dev_api.cc +++ b/paddle/phi/tests/kernels/test_cast_dev_api.cc @@ -52,7 +52,6 @@ TEST(DEV_API, cast) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); phi::DataType out_dtype = phi::DataType::FLOAT64; // 2. test API diff --git a/paddle/phi/tests/kernels/test_concat_dev_api.cc b/paddle/phi/tests/kernels/test_concat_dev_api.cc index 0dd58b1bba938..9283fcd0b65f4 100644 --- a/paddle/phi/tests/kernels/test_concat_dev_api.cc +++ b/paddle/phi/tests/kernels/test_concat_dev_api.cc @@ -60,7 +60,6 @@ TEST(DEV_API, concat) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Concat(dev_ctx, inputs, 0); // 3. check result diff --git a/paddle/phi/tests/kernels/test_conj_dev_api.cc b/paddle/phi/tests/kernels/test_conj_dev_api.cc index 5ac676ffcbcae..2f7ab8383733f 100644 --- a/paddle/phi/tests/kernels/test_conj_dev_api.cc +++ b/paddle/phi/tests/kernels/test_conj_dev_api.cc @@ -48,7 +48,6 @@ TEST(DEV_API, conj) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. 
test API auto out = phi::Conj(dev_ctx, dense_x); diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index 1c9b17ed613e4..c2df0a8acdccf 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -65,7 +65,6 @@ TEST(DEV_API, copy) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); phi::Copy( dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); diff --git a/paddle/phi/tests/kernels/test_creation_dev_api.cc b/paddle/phi/tests/kernels/test_creation_dev_api.cc index 2dcd8739991f8..5685c3a2a0b0d 100644 --- a/paddle/phi/tests/kernels/test_creation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_creation_dev_api.cc @@ -36,7 +36,6 @@ TEST(DEV_API, empty) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::Empty(dev_ctx, {3, 2}); @@ -66,7 +65,6 @@ TEST(DEV_API, empty_like) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::EmptyLike(dev_ctx, dense_x); // 3. check result @@ -86,7 +84,6 @@ TEST(DEV_API, full) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Full(dev_ctx, {3, 2}, val); // 3. check result @@ -119,7 +116,6 @@ TEST(DEV_API, full_like) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::FullLike(dev_ctx, dense_x, val); diff --git a/paddle/phi/tests/kernels/test_dot_dev_api.cc b/paddle/phi/tests/kernels/test_dot_dev_api.cc index de20907cadf44..a2af0471df0d0 100644 --- a/paddle/phi/tests/kernels/test_dot_dev_api.cc +++ b/paddle/phi/tests/kernels/test_dot_dev_api.cc @@ -61,7 +61,6 @@ TEST(DEV_API, dot) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Dot(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 63f8b86a534ed..4100889d3ac41 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -66,7 +66,6 @@ TEST(DEV_API, add) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Add(dev_ctx, dense_x, dense_y); // 3. check result @@ -118,7 +117,6 @@ TEST(DEV_API, subtract) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Subtract(dev_ctx, dense_x, dense_y); // 3. check result @@ -170,7 +168,6 @@ TEST(DEV_API, divide) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Divide(dev_ctx, dense_x, dense_y); // 3. 
check result @@ -222,7 +219,6 @@ TEST(DEV_API, multiply) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Multiply(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index fb1cdee7e5fba..860af4c4a4dce 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -52,7 +52,6 @@ TEST(DEV_API, flatten) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::Flatten(dev_ctx, dense_x, start_axis, stop_axis); diff --git a/paddle/phi/tests/kernels/test_math_function.cc b/paddle/phi/tests/kernels/test_math_function.cc index 29f33c555d1aa..b21cf0203febe 100644 --- a/paddle/phi/tests/kernels/test_math_function.cc +++ b/paddle/phi/tests/kernels/test_math_function.cc @@ -20,9 +20,9 @@ namespace phi { namespace tests { template -inline phi::funcs::BlasT GetBlas( - const paddle::platform::CPUDeviceContext& context) { - return phi::funcs::GetBlas(context); +inline phi::funcs::BlasT GetBlas( + const phi::CPUContext& context) { + return phi::funcs::GetBlas(context); } TEST(math_function, gemm_notrans_cblas) { @@ -44,7 +44,7 @@ TEST(math_function, gemm_notrans_cblas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - paddle::platform::CPUDeviceContext context(*cpu_place); + phi::CPUContext context(*cpu_place); GetBlas(context).GEMM(false, false, m, @@ -165,7 +165,7 @@ TEST(math_function, gemm_trans_cblas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - paddle::platform::CPUDeviceContext context(*cpu_place); + phi::CPUContext context(*cpu_place); GetBlas(context).GEMM(false, true, m, @@ -196,8 +196,8 @@ TEST(math_function, zero) { paddle::framework::Tensor tensor; auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); - paddle::platform::CPUDeviceContext context(*cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext context(*cpu_place); + phi::funcs::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); EXPECT_EQ(t[1], 0); @@ -231,7 +231,7 @@ void GemvTest(int m, int n, bool trans) { data_b[i] = static_cast(i); } - paddle::platform::CPUDeviceContext context(*cpu_place); + phi::CPUContext context(*cpu_place); GetBlas(context).GEMV(trans, static_cast(m), static_cast(n), @@ -272,8 +272,7 @@ TEST(math_funciton, set_constant) { paddle::framework::Tensor t; t.Resize({10, 10}); t.mutable_data(paddle::platform::CPUPlace()); - auto* ctx = new paddle::platform::CPUDeviceContext(); - ctx->Init(); + auto* ctx = new phi::CPUContext(); phi::funcs::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { PADDLE_ENFORCE_EQ(10, @@ -312,7 +311,7 @@ void GemmWarpTest(int m, int n, int k, T alpha, T beta) { } // this would call gemm_warp - paddle::platform::CPUDeviceContext context(*cpu_place); + phi::CPUContext context(*cpu_place); GetBlas(context).GEMM( CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, beta, CREF); diff --git a/paddle/phi/tests/kernels/test_matmul_dev_api.cc b/paddle/phi/tests/kernels/test_matmul_dev_api.cc index f25acaf9bcc3f..374a05fc5e475 100644 --- a/paddle/phi/tests/kernels/test_matmul_dev_api.cc +++ 
b/paddle/phi/tests/kernels/test_matmul_dev_api.cc @@ -58,7 +58,6 @@ TEST(DEV_API, dot) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = Matmul(dev_ctx, dense_x, dense_y, false, false); // 3. check result diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc index 6f3f91a7dbe56..1c79150391379 100644 --- a/paddle/phi/tests/kernels/test_mean_dev_api.cc +++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc @@ -51,7 +51,6 @@ TEST(DEV_API, mean) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Mean(dev_ctx, dense_x, dims, false); // 3. check result diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc index f0f521d57dbd8..708b31cb9a9ce 100644 --- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc @@ -54,7 +54,6 @@ TEST(DEV_API, reshape) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Reshape(dev_ctx, dense_x, shape); // 3. check result std::vector expect_shape = {12, 3}; diff --git a/paddle/phi/tests/kernels/test_scale_dev_api.cc b/paddle/phi/tests/kernels/test_scale_dev_api.cc index eff18bdeecaab..57e186ab393ec 100644 --- a/paddle/phi/tests/kernels/test_scale_dev_api.cc +++ b/paddle/phi/tests/kernels/test_scale_dev_api.cc @@ -51,7 +51,6 @@ TEST(DEV_API, scale) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); @@ -93,7 +92,6 @@ TEST(DEV_API, scale_host) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); diff --git a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc index d1c464e4b1c9d..9c6776fb2ac35 100644 --- a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc @@ -42,7 +42,6 @@ TEST(DEV_API, sparse_relu) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); DenseTensor dense_x = phi::Empty(dev_ctx_cpu, @@ -50,7 +49,7 @@ TEST(DEV_API, sparse_relu) { memcpy(dense_x.data(), data.data(), data.size() * sizeof(float)); auto sparse_coo = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, 2); - auto sparse_out = sparse::SparseRelu(dev_ctx_cpu, sparse_coo); + auto sparse_out = sparse::ReluCoo(dev_ctx_cpu, sparse_coo); DenseTensor dense_out = phi::EmptyLike(dev_ctx_cpu, sparse_out.non_zero_elements()); ReluKernel(dev_ctx_cpu, sparse_coo.non_zero_elements(), &dense_out); @@ -70,7 +69,7 @@ TEST(DEV_API, sparse_relu) { SparseCooTensor sparse_out_grad( sparse_coo.non_zero_indices(), dense_out, {3, 4}); - sparse::SparseCooReluGradKernel( + sparse::ReluCooGradKernel( dev_ctx_cpu, sparse_coo, sparse_out_grad, &sparse_grad_x); cmp = memcmp(dense_grad_x.data(), diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc 
b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index bb84690cd07ee..4a39f2bd8f1c4 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -22,8 +22,9 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace tests { @@ -75,7 +76,6 @@ void TestConv3dBase(const std::vector& indices, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; @@ -114,15 +114,15 @@ void TestConv3dBase(const std::vector& indices, if (!std::is_same::value) { DenseTensor rulebook = phi::Empty( dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); - SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, - x_tensor, - kernel_tensor, - paddings, - dilations, - strides, - 1, - subm, - &rulebook); + SparseCooTensor out = sparse::Conv3dCoo(dev_ctx_cpu, + x_tensor, + kernel_tensor, + paddings, + dilations, + strides, + 1, + subm, + &rulebook); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -139,16 +139,16 @@ void TestConv3dBase(const std::vector& indices, if (backward) { std::tuple grads = - sparse::Conv3dGrad(dev_ctx_cpu, - x_tensor, - kernel_tensor, - rulebook, - out, - paddings, - dilations, - strides, - 1, - subm); + sparse::Conv3dCooGrad(dev_ctx_cpu, + x_tensor, + kernel_tensor, + rulebook, + out, + paddings, + dilations, + strides, + 1, + subm); f_verify(std::get<0>(grads).non_zero_elements().data(), features_grad); f_verify(std::get<1>(grads).data(), kernel_grad); } @@ -198,15 +198,17 @@ void TestConv3dBase(const std::vector& indices, DenseTensor d_rulebook = phi::Empty( dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); - SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, - d_x_tensor, - d_kernel_tensor, - paddings, - dilations, - strides, - 1, - subm, - &d_rulebook); + SparseCooTensor d_out = sparse::Conv3dCoo(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + paddings, + dilations, + strides, + 1, + subm, + &d_rulebook); + + SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -218,7 +220,7 @@ void TestConv3dBase(const std::vector& indices, dev_ctx_cpu, DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW)); phi::Copy(dev_ctx_gpu, - d_out.non_zero_indices(), + tmp_d_out.non_zero_indices(), phi::CPUPlace(), true, &h_indices_tensor); @@ -232,7 +234,7 @@ void TestConv3dBase(const std::vector& indices, phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, - d_out.non_zero_elements(), + tmp_d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); @@ -240,16 +242,16 @@ void TestConv3dBase(const std::vector& indices, if (backward) { std::tuple grads = - sparse::Conv3dGrad(dev_ctx_gpu, - d_x_tensor, - d_kernel_tensor, - d_rulebook, - d_out, - paddings, - dilations, - strides, - 1, - subm); 
+ sparse::Conv3dCooGrad(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + d_rulebook, + d_out, + paddings, + dilations, + strides, + 1, + subm); DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements(); DenseTensor d_kernel_grad = std::get<1>(grads); DenseTensor h_features_grad = diff --git a/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc index 50848ae5f1ce7..cbac854d48ea4 100644 --- a/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc @@ -113,7 +113,6 @@ TEST(DEV_API, sparse_elementwise_coo_kernel_double) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto coo_x = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); auto coo_y = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_y, sparse_dim); @@ -159,7 +158,6 @@ TEST(DEV_API, sparse_elementwise_csr_kernel_float) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_x); auto csr_y = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_y); @@ -357,7 +355,6 @@ TEST(DEV_API, sparse_elementwise_csr_grad_kernel_float) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_x); auto csr_y = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_y); @@ -404,7 +401,6 @@ TEST(DEV_API, sparse_elementwise_coo_grad_kernel_double) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); auto csr_y = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_y, sparse_dim); diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 7d7cd1ceaf57e..eeba9cdc131d8 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" @@ -60,7 +61,6 @@ void TestMaxPoolBase(const std::vector& indices, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx_cpu.Init(); const int in_channels = x_dims[4]; const int out_channels = in_channels; @@ -158,6 +158,7 @@ void TestMaxPoolBase(const std::vector& indices, dilations, strides, &d_rulebook); + SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -169,7 +170,7 @@ void TestMaxPoolBase(const std::vector& indices, dev_ctx_cpu, DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW)); phi::Copy(dev_ctx_gpu, - d_out.non_zero_indices(), + tmp_d_out.non_zero_indices(), phi::CPUPlace(), true, &h_indices_tensor); @@ -183,7 +184,7 @@ void TestMaxPoolBase(const std::vector& indices, phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, - d_out.non_zero_elements(), + tmp_d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc index d4f1d6efb5d93..70c9f4cfc611d 100644 --- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc @@ -88,7 +88,6 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, paddle::platform::CPUPlace()); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -307,7 +306,6 @@ void TestSparseCsrToCoo(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -489,7 +487,6 @@ void TestCooToCsr(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -588,7 +585,6 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x, const auto alloc = std::make_shared( paddle::platform::CPUPlace()); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -701,7 +697,6 @@ void TestSparseCooToDense(const DDim& dense_dims, const int64_t non_zero_num, const int64_t sparse_dim) { phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -879,7 +874,6 @@ void TestSparseCsrToDense(const DDim& dense_dims, // 1. 
test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) diff --git a/paddle/phi/tests/kernels/test_split_dev_api.cc b/paddle/phi/tests/kernels/test_split_dev_api.cc index a358fcdf28db0..0389ab7afba1a 100644 --- a/paddle/phi/tests/kernels/test_split_dev_api.cc +++ b/paddle/phi/tests/kernels/test_split_dev_api.cc @@ -40,7 +40,6 @@ TEST(DEV_API, split) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); for (size_t i = 0; i < 4; ++i) { diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc index 2cd677373f4ef..20e934eb69297 100644 --- a/paddle/phi/tests/kernels/test_sum_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc @@ -49,7 +49,6 @@ TEST(DEV_API, sum) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index d87915d172bb7..9680ec234b3b4 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -685,7 +685,8 @@ set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\inst %THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^ %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;^ %THIRD_PARTY_PATH:/=\%\install\onnxruntime\lib;%THIRD_PARTY_PATH:/=\%\install\paddle2onnx\lib;^ -%work_dir%\%BUILD_DIR%\paddle\fluid\inference;%PATH% +%work_dir%\%BUILD_DIR%\paddle\fluid\inference;%work_dir%\%BUILD_DIR%\paddle\fluid\inference\capi_exp;^ +%PATH% REM TODO: make ut find .dll in install\onnxruntime\lib xcopy %THIRD_PARTY_PATH:/=\%\install\onnxruntime\lib\onnxruntime.dll %work_dir%\%BUILD_DIR%\paddle\fluid\inference\tests\api\ /Y diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8577b8eb37efa..cbfd401d30b9d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3306,9 +3306,10 @@ function check_coverage_build() { rm -f build_size curl -O https://paddle-docker-tar.bj.bcebos.com/paddle_ci_index/build_size - curl -O https://xly-devops.bj.bcebos.com/PR/build_whl/${AGILE_PULL_ID}/${AGILE_REVISION}/coverage_build_size + #curl -O https://xly-devops.bj.bcebos.com/PR/build_whl/${AGILE_PULL_ID}/${AGILE_REVISION}/coverage_build_size + #pr_coverage_build_size=`cat coverage_build_size|sed 's#G##g'` dev_coverage_build_size=`cat build_size|sed 's#G##g'` - pr_coverage_build_size=`cat coverage_build_size|sed 's#G##g'` + pr_coverage_build_size=`echo $buildSize|sed 's#G##g'` diff_coverage_build_size=`echo $(($pr_coverage_build_size - $dev_coverage_build_size))` @@ -3454,6 +3455,7 @@ function main() { check_diff_file_for_coverage cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} enable_unused_var_check + check_coverage_build ;; gpu_cicheck_coverage) parallel_test diff --git a/paddle/utils/blank.h b/paddle/utils/blank.h new file mode 100644 index 0000000000000..dd863c92897af --- /dev/null +++ b/paddle/utils/blank.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// This file copy from boost/blank.hpp, boost version: 1.41.0 +// Modified the following points: +// 1. modify namespace from boost to paddle +// 2. remove the depending boost header files +// 3. remove the type traits specializations +// 4. remove streaming support + +//----------------------------------------------------------------------------- +// boost blank.hpp header file +// See http://www.boost.org for updates, documentation, and revision history. +//----------------------------------------------------------------------------- +// +// Copyright (c) 2003 +// Eric Friedman +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +namespace paddle { + +struct blank {}; + +inline bool operator==(const blank&, const blank&) { return true; } + +inline bool operator<=(const blank&, const blank&) { return true; } + +inline bool operator>=(const blank&, const blank&) { return true; } + +inline bool operator!=(const blank&, const blank&) { return false; } + +inline bool operator<(const blank&, const blank&) { return false; } + +inline bool operator>(const blank&, const blank&) { return false; } + +} // namespace paddle diff --git a/paddle/utils/tribool.h b/paddle/utils/tribool.h new file mode 100644 index 0000000000000..9ede76f3ec15e --- /dev/null +++ b/paddle/utils/tribool.h @@ -0,0 +1,440 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file copy from boost/logic/tribool.hpp, boost version: 1.41.0 +// Modified the following points: +// 1. modify namespace from boost to paddle +// 2. remove the depending boost header files +// 3. remove the dummy_ in indeterminate_t, which is specially implemented for +// Borland C++ Builder +// 4. remove unnecessary macro BOOST_TRIBOOL_THIRD_STATE + +// Three-state boolean logic library + +// Copyright Douglas Gregor 2002-2004. Use, modification and +// distribution is subject to the Boost Software License, Version +// 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// For more information, see http://www.boost.org + +#pragma once + +namespace paddle { +namespace logic { + +/// INTERNAL ONLY +namespace detail { +/** + * INTERNAL ONLY + * + * \brief A type used only to uniquely identify the 'indeterminate' + * function/keyword. + */ +struct indeterminate_t {}; + +} // end namespace detail + +class tribool; + +/** + * INTERNAL ONLY + * The type of the 'indeterminate' keyword. 
This has the same type as the + * function 'indeterminate' so that we can recognize when the keyword is + * used. + */ +typedef bool (*indeterminate_keyword_t)(tribool, detail::indeterminate_t); + +/** + * \brief Keyword and test function for the indeterminate tribool value + * + * The \c indeterminate function has a dual role. It's first role is + * as a unary function that tells whether the tribool value is in the + * "indeterminate" state. It's second role is as a keyword + * representing the indeterminate (just like "true" and "false" + * represent the true and false states). If you do not like the name + * "indeterminate", and would prefer to use a different name, see the + * macro \c BOOST_TRIBOOL_THIRD_STATE. + * + * \returns x.value == tribool::indeterminate_value + * \throws nothrow + */ +inline bool indeterminate( + tribool x, detail::indeterminate_t dummy = detail::indeterminate_t()); + +/** + * \brief A 3-state boolean type. + * + * 3-state boolean values are either true, false, or + * indeterminate. + */ +class tribool { + private: + /// INTERNAL ONLY + struct dummy { + void nonnull() {} + }; + + typedef void (dummy::*safe_bool)(); + + public: + /** + * Construct a new 3-state boolean value with the value 'false'. + * + * \throws nothrow + */ + tribool() : value(false_value) {} + + /** + * Construct a new 3-state boolean value with the given boolean + * value, which may be \c true or \c false. + * + * \throws nothrow + */ + tribool(bool value) : value(value ? true_value : false_value) {} // NOLINT + + /** + * Construct a new 3-state boolean value with an indeterminate value. + * + * \throws nothrow + */ + tribool(indeterminate_keyword_t) : value(indeterminate_value) {} // NOLINT + + /** + * Use a 3-state boolean in a boolean context. Will evaluate true in a + * boolean context only when the 3-state boolean is definitely true. + * + * \returns true if the 3-state boolean is true, false otherwise + * \throws nothrow + */ + operator safe_bool() const { + return value == true_value ? &dummy::nonnull : 0; + } + + /** + * The actual stored value in this 3-state boolean, which may be false, true, + * or indeterminate. + */ + enum value_t { false_value, true_value, indeterminate_value } value; +}; + +// Check if the given tribool has an indeterminate value. Also doubles as a +// keyword for the 'indeterminate' value +inline bool indeterminate(tribool x, detail::indeterminate_t) { + return x.value == tribool::indeterminate_value; +} + +/** @defgroup logical Logical operations + */ +//@{ +/** + * \brief Computes the logical negation of a tribool + * + * \returns the logical negation of the tribool, according to the + * table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *   !             |
+ *   --------------+---------------
+ *   false         | true
+ *   true          | false
+ *   indeterminate | indeterminate
+ * \throws nothrow + */ +inline tribool operator!(tribool x) { + return x.value == tribool::false_value ? tribool(true) + : x.value == tribool::true_value ? tribool(false) + : tribool(indeterminate); +} + +/** + * \brief Computes the logical conjuction of two tribools + * + * \returns the result of logically ANDing the two tribool values, + * according to the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *   &&            | false | true          | indeterminate
+ *   --------------+-------+---------------+---------------
+ *   false         | false | false         | false
+ *   true          | false | true          | indeterminate
+ *   indeterminate | false | indeterminate | indeterminate
+ * \throws nothrow + */ +inline tribool operator&&(tribool x, tribool y) { + if (static_cast(!x) || static_cast(!y)) + return false; + else if (static_cast(x) && static_cast(y)) + return true; + else + return indeterminate; +} + +/** + * \overload + */ +inline tribool operator&&(tribool x, bool y) { return y ? x : tribool(false); } + +/** + * \overload + */ +inline tribool operator&&(bool x, tribool y) { return x ? y : tribool(false); } + +/** + * \overload + */ +inline tribool operator&&(indeterminate_keyword_t, tribool x) { + return !x ? tribool(false) : tribool(indeterminate); +} + +/** + * \overload + */ +inline tribool operator&&(tribool x, indeterminate_keyword_t) { + return !x ? tribool(false) : tribool(indeterminate); +} + +/** + * \brief Computes the logical disjunction of two tribools + * + * \returns the result of logically ORing the two tribool values, + * according to the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *   ||            | false         | true | indeterminate
+ *   --------------+---------------+------+---------------
+ *   false         | false         | true | indeterminate
+ *   true          | true          | true | true
+ *   indeterminate | indeterminate | true | indeterminate
+ * \throws nothrow + */ +inline tribool operator||(tribool x, tribool y) { + if (static_cast(!x) && static_cast(!y)) + return false; + else if (static_cast(x) || static_cast(y)) + return true; + else + return indeterminate; +} + +/** + * \overload + */ +inline tribool operator||(tribool x, bool y) { return y ? tribool(true) : x; } + +/** + * \overload + */ +inline tribool operator||(bool x, tribool y) { return x ? tribool(true) : y; } + +/** + * \overload + */ +inline tribool operator||(indeterminate_keyword_t, tribool x) { + return x ? tribool(true) : tribool(indeterminate); +} + +/** + * \overload + */ +inline tribool operator||(tribool x, indeterminate_keyword_t) { + return x ? tribool(true) : tribool(indeterminate); +} +//@} + +/** + * \brief Compare tribools for equality + * + * \returns the result of comparing two tribool values, according to + * the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *   ==            | false         | true          | indeterminate
+ *   --------------+---------------+---------------+---------------
+ *   false         | true          | false         | indeterminate
+ *   true          | false         | true          | indeterminate
+ *   indeterminate | indeterminate | indeterminate | indeterminate
+ * \throws nothrow + */ +inline tribool operator==(tribool x, tribool y) { + if (indeterminate(x) || indeterminate(y)) + return indeterminate; + else + return (x && y) || (!x && !y); +} + +/** + * \overload + */ +inline tribool operator==(tribool x, bool y) { return x == tribool(y); } + +/** + * \overload + */ +inline tribool operator==(bool x, tribool y) { return tribool(x) == y; } + +/** + * \overload + */ +inline tribool operator==(indeterminate_keyword_t, tribool x) { + return tribool(indeterminate) == x; +} + +/** + * \overload + */ +inline tribool operator==(tribool x, indeterminate_keyword_t) { + return tribool(indeterminate) == x; +} + +/** + * \brief Compare tribools for inequality + * + * \returns the result of comparing two tribool values for inequality, + * according to the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *   !=            | false         | true          | indeterminate
+ *   --------------+---------------+---------------+---------------
+ *   false         | false         | true          | indeterminate
+ *   true          | true          | false         | indeterminate
+ *   indeterminate | indeterminate | indeterminate | indeterminate
+ * \throws nothrow + */ +inline tribool operator!=(tribool x, tribool y) { + if (indeterminate(x) || indeterminate(y)) + return indeterminate; + else + return !((x && y) || (!x && !y)); +} + +/** + * \overload + */ +inline tribool operator!=(tribool x, bool y) { return x != tribool(y); } + +/** + * \overload + */ +inline tribool operator!=(bool x, tribool y) { return tribool(x) != y; } + +/** + * \overload + */ +inline tribool operator!=(indeterminate_keyword_t, tribool x) { + return tribool(indeterminate) != x; +} + +/** + * \overload + */ +inline tribool operator!=(tribool x, indeterminate_keyword_t) { + return x != tribool(indeterminate); +} + +} // namespace logic +} // namespace paddle + +// Pull tribool and indeterminate into namespace "boost" +namespace paddle { +using logic::indeterminate; +using logic::tribool; +} // namespace paddle diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 6669e4f4c70aa..8bc7b11368680 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -26,8 +26,6 @@ from .py_layer import LegacyPyLayerContext as PyLayerContext # noqa: F401 from ..framework import set_grad_enabled, is_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 -from .functional import vjp, jvp, Jacobian, Hessian # noqa: F401 -from .functional import jacobian, hessian, batch_jacobian, batch_hessian, vhp # noqa: F401 __all__ = [ # noqa 'backward', diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py deleted file mode 100644 index aa3e99978b72a..0000000000000 --- a/python/paddle/autograd/functional.py +++ /dev/null @@ -1,1362 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -import typing - -import paddle -from paddle.fluid import framework -from paddle.autograd.utils import as_tensors - - -def vjp(func, xs, v=None): - r"""Computes the Vector-Jacobian product, a functional form of - reverse mode automatic differentiation. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func(Callable): A function that takes ``xs`` as inputs parameter and - returns a sequence of Tensors or a Tensor. - xs(Tensor|Sequence[Tensor]): Used as positional arguments to evaluate - ``func``. ``xs`` is accepted as one Tensor or a sequence of Tensors. - v(Tensor|Sequence[Tensor]|None, optional): The cotangent vector invovled - in the VJP computation. ``v`` matches the size and shape of - ``func`` 's output. Defaults to None, which is equivalent to all - ones the same size of ``func`` 's output. - - Returns: - output(tuple): - - - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . - - vjp(Tensor|tuple[Tensor]): The vjp result. - - Examples: - - .. 
code-block:: python - - import paddle - - def func(x): - return paddle.matmul(x, x) - - x = paddle.ones(shape=[2, 2], dtype='float32') - _, vjp_result = paddle.incubate.autograd.vjp(func, x) - print(vjp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[4., 4.], - # [4., 4.]]) - - v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) - _, vjp_result = paddle.incubate.autograd.vjp(func, x, v) - print(vjp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 1.], - # [1., 0.]]) - """ - _check_inputs(func, xs, v) - - # ``_seprate`` breaks the dependencies between ``xs`` and other - # variables. See more ``_seprate`` . - xs, v = _separate(xs), _separate(v) - ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) - _check_v_shape(v, ys) - - return ys, _grad(ys, xs, v) - - -def jvp(func, xs, v=None): - r""" - Computes the Jacobian-Vector product for a function at the given - inputs and a vector in the tangent space induced by the inputs. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func(Callable): The ``func`` takes as input a Tensor or a Sequence - of Tensors and returns a Tensor or a Sequence of Tensors. - xs(Tensor|Sequence[Tensor]): Used as positional arguments to - evaluate ``func``. The ``xs`` is accepted as one Tensor or a - Sequence of Tensors. - v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled - in the JVP computation. The ``v`` matches the size and shape of - ``xs`` . Default value is None and in this case is equivalent to - all ones the same size of ``xs`` . - - Returns: - output(tuple): - - - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . - - jvp(Tensor|tuple[Tensor]): The jvp result. - - Examples: - - .. code-block:: python - - import paddle - - - def func(x): - return paddle.matmul(x, x) - - - x = paddle.ones(shape=[2, 2], dtype='float32') - _, jvp_result = paddle.incubate.autograd.jvp(func, x) - print(jvp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[4., 4.], - # [4., 4.]]) - v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) - _, jvp_result = paddle.incubate.autograd.jvp(func, x, v) - print(jvp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 1.], - # [1., 0.]]) - - """ - _check_inputs(func, xs, v) - # ``_seprate`` breaks the dependencies between ``xs`` and other - # variables. See more ``_seprate`` . - xs, v = _separate(xs), _separate(v) - ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) - _check_v_shape(v, xs) - return ys, _double_backward_trick(ys, xs, v) - - -def _double_backward_trick(ys, xs, v): - """Double backward trick for computing ``jvp`` by ``vjp`` - see details: https://j-towns.github.io/2017/06/12/A-new-trick.html - """ - # The value of ys_grad is not important, it can be any random value in - # theory, but it's required to set stop_gradient=False. - ys_grad = _zeros_like_with_grad(ys) - xs_grad = _grad(ys, xs, ys_grad) - return _grad(xs_grad, ys_grad, v) - - -def _zeros_like_with_grad(xs): - """Create a zero or zeros sequence Tensor like ``xs`` with a flag - ``stop_graident=False`` . 
- """ - if not isinstance(xs, typing.Sequence): - ys = paddle.zeros_like(xs) - ys.stop_gradient = False - else: - ys = [] - for x in xs: - y = paddle.zeros_like(x) - y.stop_gradient = False - ys.append(y) - return ys - - -class Jacobian(object): - r""" - Computes the Jacobian matrix of a given function. - - If the function has multiple inputs and multiple outputs, during internal - implementation, all input tensors are concatenated after being flatten, - the batch dimension is retained, and the output is subject to the same - processing rules. - - Once the Jacobian ``J`` is constructed, you can use a multidimensional index - to retrieve the submatrix of ``J``, as same as slicing a Tensor. The - submatrix is lazily evaluated along row axis, and will be cached once - evaluated. - - For examples, supposing ``is_batched=True``, you can retrieve the submatrix - by following methods: - - * J[:], retrieving the full matrix. - * J[:, :, j], retrieving the partial derivatives w.r.t. the j'th input - variable. - * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output - variable. - * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output - variable and the j'th input variable. - - Notes: - - Eclipsis index is not supported currently. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - - func (Callable): A python function that takes a Tensor or a sequence of - Tensors as inputs(the first dimension is batch size) and - returns a Tensor a sequence of Tensors. - xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . - is_batched (bool): If true, the first axis is batch axis. Defaults to - False. - - Returns: - - Jacobian (Object): A python object retains the Jacobian matrix. - - Examples: - - .. code-block:: python - - import paddle - - - def func(x, y): - return paddle.matmul(x, y) - - - x = paddle.to_tensor([[1., 2.], [3., 4.]]) - J = paddle.incubate.autograd.Jacobian(func, [x, x]) - print(J[:, :]) - # Tensor(shape=[4, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[1., 3., 0., 0., 1., 0., 2., 0.], - # [2., 4., 0., 0., 0., 1., 0., 2.], - # [0., 0., 1., 3., 3., 0., 4., 0.], - # [0., 0., 2., 4., 0., 3., 0., 4.]]) - - print(J[0, :]) - # Tensor(shape=[8], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [1., 3., 0., 0., 1., 0., 2., 0.]) - print(J[:, 0]) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [1., 2., 0., 0.]) - - """ - - def __init__(self, func, xs, is_batched=False): - if not is_batched: - self._jacobian = _JacobianNoBatch(func, xs) - else: - self._jacobian = _JacobianBatchFirst(func, xs) - - def __getitem__(self, indexes): - return self._jacobian[indexes] - - @property - def shape(self): - """The shape of flattened Jacobian matrix. - """ - return self._jacobian.shape - - -class Hessian(object): - """ - Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . - - If the function has multiple inputs, during internal implementation, - all input tensors are concatenated after being flatten, the batch dimension - is retained. - - The Hessian submatrix is lazily evaluated, and can be retrieved with a - multidimensional indexes. See details ``Jacobian`` . - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func (Callable): A python function that takes a Tensor or a Tensor - sequence as inputs and returns a Tensor with shape - ``[batch_size, 1]`` with batch or ``[1]`` without batch. 
- xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of - the function ``func``. - is_batched (bool): If true, the first axis is batch axis. Defaults to - False. - - Returns: - - Hessian (Object): A python object retains the Hessian matrix. - - - Examples: - - .. code-block:: python - - import paddle - - - def reducer(x): - return paddle.sum(x * x) - - - x = paddle.rand([2, 2]) - h = paddle.incubate.autograd.Hessian(reducer, x) - print(h[:]) - # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 0., 0., 0.], - # [0., 2., 0., 0.], - # [0., 0., 2., 0.], - # [0., 0., 0., 2.]]) - """ - - def __init__(self, func, xs, is_batched=False): - - def _jac_func(*xs): - jac = Jacobian(func, xs, is_batched=is_batched) - if (is_batched and jac.shape[1] != 1) or (not is_batched - and jac.shape[0] != 1): - raise RuntimeError( - "The function given to Hessian shoud return as single element Tensor or batched single element Tensor." - ) - return jac[:, 0, :] if is_batched else jac[0, :] - - self.symbolic = Jacobian(_jac_func, xs, is_batched=is_batched) - - def __getitem__(self, indexes): - return self.symbolic[indexes] - - @property - def shape(self): - """The shape of flattened Hessian matrix. - """ - return self.symbolic.shape - - -class _Jacobian(object): - """The base class for computing Jacobian matrix. - - ``_Jacobian`` implementes the core logic of multidimensional index and lazy - evaluation for Jacobian matrix, subclass only need to overwrite following - methods: - - * ``_lazy_axis()``, return the axis along which will be lazy - evaluating. - * ``_flatten(xs)``, flattens the inputs ``xs``. - * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . - - Notes: - - Because currently PaddlePaddle only support reverse differentiation by - ``paddle.grad``, so lazy evaluation is only supported along the row of - Jacobian matrix, which means that slicing along row will get better - performance. - - """ - - def __init__(self, func, xs): - # Skip separating in prim mode temporarily, as detach and clone are not - # primitive operators. - if not paddle.fluid._non_static_mode( - ) and paddle.incubate.autograd.prim_enabled(): - self._xs = xs - else: - self._xs = _separate(xs) - self._ys = func(*as_tensors(self._xs)) - self._flatten_xs = self._flatten(as_tensors(self._xs)) - self._flatten_ys = self._flatten(as_tensors(self._ys)) - self._cache = {} - - @property - def shape(self): - raise NotImplementedError - - @property - def _lazy_axis(self): - """"The axis of lazily evaluated.""" - raise NotImplementedError - - def _lazy_indexes(self, indexes): - idx = indexes[self._lazy_axis] - return (idx, ) if isinstance(idx, int) else tuple( - range(idx.start, idx.stop, idx.step)) - - def _flatten(self, xs): - raise NotImplementedError - - def _shifted_indexes(self, indexes, lazy_axis_size=0): - idx = indexes[self._lazy_axis] - shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice( - 0, lazy_axis_size, 1) - return indexes[:self._lazy_axis] + ( - shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:] - - def __getitem__(self, indexes): - indexes = _multi_index(indexes, self.shape) - - if isinstance(indexes[self._lazy_axis], int): - other_indexes = indexes[:self._lazy_axis] + \ - indexes[self._lazy_axis+1:] - return self._cached_evaluate( - indexes[self._lazy_axis])[other_indexes] - lazy_indexes = self._lazy_indexes(indexes) - # Using concat and reshape to replace stack operator temporarily, as - # it is not a primitive operator. 
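# Note on the slice path below: every requested index along the lazy axis is
# evaluated (and cached) individually, the resulting rows are concatenated
# along the lazy axis and reshaped to the sliced Jacobian shape, and the
# remaining indexes are then applied with the lazy-axis index shifted into
# the local range of the freshly built sub-matrix. For example, in the
# non-batched case with a (4, 8) Jacobian, ``J[1:3, 0]`` evaluates only rows
# 1 and 2, concatenates them into a (2, 8) block and selects column 0 from it.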
- shape = list(self.shape) - shape[self._lazy_axis] = len(lazy_indexes) - part_jac = paddle.concat( - [self._cached_evaluate(i) for i in lazy_indexes], - axis=self._lazy_axis).reshape(shape) - return part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] - - def _cached_evaluate(self, k): - v = self._cache.get(k) - if v is None: - v = self._evaluate(k) - self._cache[k] = v - return v - - def _evaluate(self, index): - """Evaluate one slice at along lazy axis.""" - raise NotImplementedError - - -class _JacobianNoBatch(_Jacobian): - """Compute Jacobian matrix without batch dimension. - Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is - ``(N, M)`` . - """ - - def __init__(self, func, xs): - super(_JacobianNoBatch, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_ys.shape[0], self._flatten_xs.shape[0]) - - @property - def _lazy_axis(self): - return 0 - - def _flatten(self, xs): - return paddle.concat(tuple(x.reshape((-1, )) for x in xs)) - - def _evaluate(self, row_index): - return self._flatten(_grad( - self._flatten_ys[row_index], - self._xs, - )) - - -class _JacobianBatchLast(_Jacobian): - """Compute Jacobian matrix with batch at last axis. - Suppose the mapping is :math:`f: R^{M,B} \to R^{N,B}`, the output shape is - ``(N, M, B)`` . - """ - - def __init__(self, func, xs): - super(_JacobianBatchLast, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_ys.shape[0], self._flatten_xs.shape[0], - self._flatten_xs.shape[1]) - - @property - def _lazy_axis(self): - return 0 - - def _flatten(self, xs): - return paddle.concat( - tuple(x.reshape((-1, x.shape[-1])) for x in as_tensors(xs)), 0) - - def _evaluate(self, row): - return self._flatten(_grad(self._flatten_ys[row, :], self._xs)) - - -class _JacobianBatchFirst(_Jacobian): - """Compute Jacobian matrix with batch at first axis. - Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is - ``(B, N, M)`` . - """ - - def __init__(self, func, xs): - super(_JacobianBatchFirst, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_xs.shape[0], self._flatten_ys.shape[1], - self._flatten_xs.shape[1]) - - @property - def _lazy_axis(self): - return 1 - - def _flatten(self, xs): - return paddle.concat( - tuple(x.reshape((x.shape[0], -1)) for x in as_tensors(xs)), 1) - - def _evaluate(self, row_index): - return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs)) - - -def _multi_index(indexes, shape): - """A tool for parsing N-dimensional index into a standard format. - - Currently supporting following input format: - * ([positive|negative|slice], ...), the right-most elements can be - omited. - - The standard format after converted is slice tuple which contains N elements: - * ([positive|slice], ..., [positive|slice]) - - Notes: - Ellipsis indexes such as ``(..., i), (i, ...)`` is not supported. - - Args: - indexes (tuple): The input indexes. - shape (tuple): The input shape. - - Returns: - tuple: The standard format index as the above description. - """ - indexes = indexes if isinstance(indexes, typing.Sequence) else (indexes, ) - if any(isinstance(i, type(Ellipsis)) for i in indexes): - raise IndexError('Ellipsis index currently is not supported.') - # Fill the right-most elements. - indexes = indexes + (slice(0, None, None), ) * (len(shape) - len(indexes)) - # Convert to positive index. 
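# Worked example of the normalization performed below (hypothetical values):
#   _multi_index((-1,), shape=(4, 8))
#   after right-padding above: (-1, slice(0, None, None))
#   after this loop:           (3, slice(0, 8, 1))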
- positive_indexes = [] - for i, index in enumerate(indexes): - if isinstance(index, slice): - index = slice(index.start or 0, index.stop or shape[i], index.step - or 1) - positive_indexes.append( - slice( - index.start + shape[i] if index.start < 0 else index.start, - index.stop + shape[i] if index.stop < 0 else index.stop, - # Negative step means index backward, no need to convert to - # positive interger. - index.step)) - elif isinstance(index, int): - positive_indexes.append(index + shape[i] if index < 0 else index) - else: - raise TypeError(f'Not supported index type {index}.') - return tuple(positive_indexes) - - -def _stack_tensor_or_return_none(origin_list): - assert len(origin_list) > 0, "Can't not stack an empty list" - return paddle.stack(origin_list, axis=0) if isinstance( - origin_list[0], paddle.fluid.framework.Variable) else None - - -def _replace_none_with_zero_tensor(xs, refs): - if xs is None: - xs = paddle.zeros_like(refs) - xs.stop_gradient = refs.stop_gradient - return xs - elif isinstance(xs, typing.Sequence): - return tuple( - _replace_none_with_zero_tensor(x, refs[i]) - for i, x in enumerate(xs)) - else: - return xs - - -def _grad(ys, xs, v=None): - """A gradient function that can be used in dynamic graph and static graph. - - The ``grad`` combines ``paddle.grad`` used in dynamic graph and - ``paddle.static.gradients`` used in static graph, and do following changes: - - * The ``allow_unused`` flag is removed and set defaults to true internally, - none in outputs will be replaced by zero tensor. - * The ``create_graph`` flag is removed and set defaults to true internally, - only makes sense in dynamic graph. - * When xs is a single Tensor, ``paddle.grad`` returns a list which only - contains one Tensor. It may confuse users, thus in this case we improve - to return a single Tensor in _grad interface. - - Args: - ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of - the graph to compute gradients. - xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of the graph to - compute gradients. The returned values of this API are the - gradients of inputs . - v (Tensor|Sequence[Tensor]|None,optional): The initial gradient values - of outputs . If grad_outputs is None, the initial gradient values of - outputs would be Tensors filled with 1; if grad_outputs is not None, - it must have the same length as outputs , and in this case, the - initial gradient value of the i-th outputs would be: (1) a Tensor - filled with 1 when the i-th element of grad_outputs is None; - (2) the i-th element of grad_outputs when the i-th element of - grad_outputs is a Tensor. Default None. - - Returns: - Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the - same as the Tensor number inside inputs, and the i-th returned - Tensor is the sum of gradients of outputs with respect to the i-th - inputs. - """ - if paddle.fluid._non_static_mode(): - xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) - else: - xs_grad = paddle.static.gradients(ys, xs, v) - - if isinstance(xs, paddle.fluid.framework.Variable): - xs_grad = xs_grad[0] - - return _replace_none_with_zero_tensor(xs_grad, xs) - - -def _separate(xs): - """ - ``_separate`` separates ``xs`` from the computation graph through ``clone`` - or ``deteach`` . - - Interally, ``paddle.grad(xs, ys)`` is stateful API implemented based on - computional graph, which will reduce gradients along all path from ys to xs. 
- - However, funcional autograd API such as ``vjp``, ``jvp`` is stateless, and - only compute gradients with a given ``func`` . - - For example, given a ``func`` :math:`y0=f(x0)`, supposing forward path is: - ``x0 -> y0``, ``x0 -> x1 -> y0`` . - ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and - ``y0->x1->x0``, and ``vjp`` only need reduce along ``y0->x0``. - - So, it's needed to clone or detach xs for breaking the dependencies with - other variables. - - Examples: - - .. code-block:: python - - import paddle - from paddle.autograd.functional import _separate - - - def func(x, y): - return x * y - - - x = paddle.ones((1,)) - x.stop_gradient = False - - y = func(x, x) - print(paddle.grad(y, x)) - # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [2.])] - - x1, x2 = _separate((x, x)) - y = func(x1, x2) - print(paddle.grad(y, x1)) - # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [1.])] - - """ - if isinstance(xs, typing.Sequence): - return tuple(_single_separate(x) for x in xs) - else: - return _single_separate(xs) - - -def _single_separate(x): - if x is None: # x maybe none because grad input's v defaults to none. - return x - if not x.stop_gradient: - return paddle.clone(x) - else: # use detach to share memory when no need gradients. - x = x.detach() - x.stop_gradient = False - return x - return x - - -def _check_inputs(func, xs, v=None): - if not callable(func): - raise TypeError(f"Expected 'fun' is Callable, but got {type(func)}.") - - if not isinstance(xs, (framework.Variable, typing.Sequence)): - raise TypeError(f"Expected 'xs' is a Tensor|Sequence[Tensor]," - f"but got {type(xs)}.") - if isinstance(xs, typing.Sequence) and not all( - isinstance(x, framework.Variable) for x in xs): - raise TypeError("All elements of 'xs' shoule be Tensor.") - - if not isinstance(v, (framework.Variable, typing.Sequence, type(None))): - raise TypeError( - f"Expected 'v' is Tensor|Sequence[Tensor]|None, but got {type(v)}.") - - if isinstance(v, typing.Sequence) and not all( - isinstance(e, framework.Variable) for e in v): - raise TypeError("All elements of 'xs' shoule be Tensor.") - - -def _check_v_shape(v, refs): - if v is None: - return - - v, refs = as_tensors(v), as_tensors(refs) - if len(refs) != len(v): - raise RuntimeError(f"The argument v is a tuple of invalid length:" - f"should be {len(refs)} but got {len(v)}.") - - for index, (element_v, element_ref) in enumerate(zip(v, refs)): - if element_v.shape != element_ref.shape: - raise RuntimeError( - f"The v[{index}] has invalid shape: should " - f"be {element_ref.shape} but got {element_v.shape}.") - - -@framework.dygraph_only -def jacobian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the Jacobian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor or a Tensor tuple. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. 
- allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Jacobian (Tensor or nested tuple of Tensors): if function ``func`` - takes a Tensor as inputs and returns a Tensor as outputs, Jacobian - will be a single Tensor containing the Jacobian matrix for the - linearized inputs and outputs. If one of the inputs and outputs is - a Tensor, and another is a Tensor list/tuple, then the Jacobian will - be a tuple of Tensors. If both of inputs and outputs are Tensor - list/tuple, then the Jacobian will be a tuple of tuple of Tensors - where ``Jacobian[i][j]`` will contain the Jacobian matrix of the - linearized ``i``th output and ``j``th input and will have same - dtype and device as the corresponding input. ``Jacobian[i][j]`` will - have as size ``m * n``, where ``m`` and ``n`` denote the numbers of - elements of ``i``th output and ``j``th input respectively. - - - Examples 1: - .. code-block:: python - - import paddle - - def func(x): - return paddle.matmul(x, x) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, x) - print(jacobian) - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 2., 0., 1.], - # [1., 0., 2., 1.], - # [0., 1., 1., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.matmul(x, y) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') * 2 - x.stop_gradient = False - y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [x, y], create_graph=True) - print(jacobian) - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[2., 2., 0., 0.], - # [2., 2., 0., 0.], - # [0., 0., 2., 2.], - # [0., 0., 2., 2.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.]])) - - Examples 3: - .. 
code-block:: python - - import paddle - - def func(x, y): - return paddle.matmul(x, y), x * x - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') * 2 - x.stop_gradient = False - y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [x, y], allow_unused=True) - print(jacobian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 2., 0., 0.], - # [2., 2., 0., 0.], - # [0., 0., 2., 2.], - # [0., 0., 2., 2.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.]])), - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 0., 0.], - # [0., 2., 0., 0.], - # [0., 0., 2., 0.], - # [0., 0., 0., 2.]]), None)) - - ''' - inputs = as_tensors(inputs) - outputs = as_tensors(func(*inputs)) - fin_size = len(inputs) - fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape(output, shape=[-1]) for output in outputs) - jacobian = tuple() - for i, flat_output in enumerate(flat_outputs): - jac_i = list([] for _ in range(fin_size)) - for k in range(len(flat_output)): - row_k = paddle.grad(flat_output[k], - inputs, - create_graph=create_graph, - retain_graph=True, - allow_unused=allow_unused) - for j in range(fin_size): - jac_i[j].append( - paddle.reshape(row_k[j], shape=[-1]) if isinstance( - row_k[j], paddle.Tensor) else None) - jacobian += (tuple( - _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) - if fin_size == 1 and fout_size == 1: - return jacobian[0][0] - elif fin_size == 1 and fout_size != 1: - return tuple(jacobian[i][0] for i in range(fout_size)) - elif fin_size != 1 and fout_size == 1: - return jacobian[0] - else: - return jacobian - - -@framework.dygraph_only -def batch_jacobian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the batch Jacobian matrix of `func` with respect to `inputs`. - Noted that the first dimension of inputs is batch size. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs(the first dimension is batch size) and - returns a Tensor or a Tensor tuple. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``, Noted that - the first dimension of inputs is batch size. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Jacobian (Tensor or nested tuple of Tensors): if function ``func`` - takes a Tensor as inputs and returns a Tensor as outputs, Jacobian - will be a single Tensor containing the Jacobian matrix for the - linearized inputs and outputs. If one of the inputs and outputs is - a Tensor, and another is a Tensor list/tuple, then the Jacobian will - be a tuple of Tensors. If both of inputs and outputs are Tensor - list/tuple, then the Jacobian will be a tuple of tuple of Tensors. 
- Noted that the first dimension of inputs is batch size. - - For example, - the inputs shape and outputs shape of function ``func` is [batch_size, num] - and [batch_size, num] respectively, then the Jacobian will be a Tensor with - a shape of [num, batch_size * num], where ``Jacobian[i][j]`` will contain - the Jacobian matrix of the ``i``th column output and the ``j``th input and - will have same dtype and device as the corresponding input. - Other situations can be deduced by analogy. - - Examples 1: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(paddle.matmul(x, weight), y) - - x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, x) - print(batch_jacobian) - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4., 4., 4., 4., 4., 4., 4.], - # [4., 4., 4., 4., 4., 4., 4., 4.]]) - - Examples 2: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(paddle.matmul(x, weight), y), x * x - - x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, x) - print(batch_jacobian) - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4., 4., 4., 4., 4., 4., 4.], - # [4., 4., 4., 4., 4., 4., 4., 4.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]])) - - Examples 3: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return x * y - - x.stop_gradient = False - y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [x, y]) - print(batch_jacobian) - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0., 1., 0., 1., 0.], - # [0., 1., 0., 1., 0., 1., 0., 1.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0., 1., 0., 1., 0.], - # [0., 1., 0., 1., 0., 1., 0., 1.]])) - - ''' - - inputs = as_tensors(inputs) - outputs = as_tensors(func(*inputs)) - - batch_size = inputs[0].shape[0] - for input in inputs: - assert input.shape[ - 0] == batch_size, "The first dimension of input should equals to the same batch size!" - for output in outputs: - assert output.shape[ - 0] == batch_size, "The first dimension of output should equals to the same batch size!" 
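# Shape walk-through for the first docstring example above (x: [4, 2],
# output: [4, 2], batch_size 4): flat_outputs holds one [4, 2] tensor, the
# loop below calls paddle.grad once per output column (2 calls), each
# gradient keeps the input shape [4, 2] and is flattened to [8], and
# stacking the 2 rows gives the final [2, 8] batch Jacobian.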
- fin_size = len(inputs) - fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape(output, shape=[batch_size, -1]) for output in outputs) - jacobian = tuple() - for i, flat_output in enumerate(flat_outputs): - jac_i = list([] for _ in range(fin_size)) - for k in range(flat_output.shape[1]): - - row_k = paddle.grad(flat_output[:, k], - inputs, - create_graph=create_graph, - retain_graph=True, - allow_unused=allow_unused) - - for j in range(fin_size): - jac_i[j].append( - paddle.reshape(row_k[j], shape=[-1]) if isinstance( - row_k[j], paddle.Tensor) else None) - jacobian += (tuple( - _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) - if fin_size == 1 and fout_size == 1: - return jacobian[0][0] - elif fin_size == 1 and fout_size != 1: - return tuple(jacobian[i][0] for i in range(fout_size)) - elif fin_size != 1 and fout_size == 1: - return jacobian[0] - else: - return jacobian - - -@framework.dygraph_only -def batch_hessian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the batch Hessian matrix of `func` with respect to `inputs`. - Noted that the first dimension of inputs is batch size. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs(the first dimension is batch size) and - returns a Tensor with shape [batch_size, 1]. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - Noted that the first dimension of inputs is batch size. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` - takes a Tensor as ``inputs``, Hessian will be a single Tensor containing - the Hessian matrix for the linearized ``inputs`` Tensor. If function - ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will - be a tuple of tuple of Tensors. Noted that the first dimension of inputs - is batch size and the execution step is to obtain the result of the - first order differentiation, and then differentiate the batch input. - - For example, - the inputs shape and outputs shape of function ``func` is [batch_size, num] - and [batch_size, 1] respectively, then the batched Hessian will be a Tensor with - a shape of [num, batch_size * num]. - - Why the final shape in this case is that? - because batch_hessian will create a inner func(the wrapper of paddle.grad() func) - to computes the sum of gradients of `outputs` with respect to each `inputs`, - this inner func will get the first order differentiation and shape is [batch_size, num], - then call batch_jacobian to compute jacobian between the first order differentiation - and the origin inputs. The final result ``Hessian[i][j]`` will contain the Jacobian - matrix of the ``i``th column output(Noted that this output means the first order - differentiation) and the ``j``th input and will have same dtype and device as the - corresponding input. 
Other situations can be deduced by analogy. - - - Examples 1: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(x * x, weight)[:, 0:1] - - - x.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, x) - print(batch_hessian) - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return paddle.matmul(x * x * y * y, weight)[:, 0:1] - - x.stop_gradient = False - y.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, [x, y]) - print(batch_hessian) - # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]), - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 0., 4., 0., 4., 0., 4., 0.], - # [0., 4., 0., 4., 0., 4., 0., 4.]])), - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 0., 4., 0., 4., 0., 4., 0.], - # [0., 4., 0., 4., 0., 4., 0., 4.]]), - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]))) - - - Examples 3: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return paddle.matmul(x * x, weight)[:, 0:1] - - x.stop_gradient = False - y.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, [x, y], allow_unused=True) - print(batch_hessian) - # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]), None), (None, None)) - - ''' - inputs = as_tensors(inputs) - outputs = func(*inputs) - batch_size = inputs[0].shape[0] - for input in inputs: - assert input.shape[ - 0] == batch_size, "The first dimension of input should equals to the same batch size!" - assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ - batch_size, 1 - ], "The function to compute batched Hessian matrix should return a Tensor of shape [batch_size, 1]" - - def jac_func(*ins): - grad_inputs = paddle.grad(outputs, - ins, - create_graph=True, - retain_graph=True, - allow_unused=allow_unused) - return tuple( - _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) - for i in range(len(inputs))) - - return batch_jacobian(jac_func, - inputs, - create_graph=create_graph, - allow_unused=allow_unused) - - -@framework.dygraph_only -def hessian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the Hessian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor with a single element. 
- inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` - takes a Tensor as ``inputs``, Hessian will be a single Tensor containing - the Hessian matrix for the linearized ``inputs`` Tensor. If function - ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will - be a tuple of tuple of Tensors where ``Hessian[i][j]`` will contain the - Hessian matrix of the ``i``th input and ``j``th input with size ``m * n``. - Here ``m`` and ``n`` denote the number of elements of the ``i`` th input - and the ``j`` th input respectively. - - Examples 1: - .. code-block:: python - - import paddle - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - hessian = paddle.autograd.hessian(func, x) - print(hessian) - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 0., 2., 1.], - # [1., 2., 0., 1.], - # [0., 1., 1., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [x, y]) - print(hessian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 1., 0., 0.], - # [0., 0., 1., 1.], - # [1., 1., 0., 0.], - # [0., 0., 1., 1.]])), - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [0., 1., 0., 1.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]))) - - Examples 3: - .. 
code-block:: python - - import paddle - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [x, y], allow_unused=True) - print(hessian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 0., 2., 1.], - # [1., 2., 0., 1.], - # [0., 1., 1., 2.]]), None), (None, None)) - - ''' - inputs = as_tensors(inputs) - outputs = func(*inputs) - assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ - 1 - ], "The function to compute Hessian matrix should return a Tensor with a single element" - - def jac_func(*ins): - grad_inputs = paddle.grad(outputs, - ins, - create_graph=True, - retain_graph=True, - allow_unused=allow_unused) - return tuple( - _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) - for i in range(len(inputs))) - - return jacobian(jac_func, - inputs, - create_graph=create_graph, - allow_unused=allow_unused) - - -def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the product between a vector ``v`` and the - Hessian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor with a single element. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - v (Tensor|list(Tensor)|tuple(Tensor)|None, optional): the vector used - to compute vector hessian product. ``v`` should have same shape - and dtype with ``inputs``. If ``v`` is None, it will be set as - Tensor|list(Tensor) with all elements 1. Defaults to "None". - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - output (tuple): tuple with: - func_output (Tensor): output of ``func(inputs)`` - vhp (list(Tensor)): result of the vector hessian product - with the same shape and dtype as the inputs. - Examples 1: - .. code-block:: python - import paddle - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 - vhp_rslt = paddle.autograd.vhp(func, x, v=vx) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[8., 8.], - # [8., 8.]])) - - Examples 2: - .. 
code-block:: python - import paddle - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - vhp_rslt = paddle.autograd.vhp(func, x) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4.], - # [4., 4.]])) - - Examples 3: - .. code-block:: python - import paddle - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y = paddle.ones(shape=[2, 2], dtype='float32') - y.stop_gradient = False - vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 - vy = paddle.ones(shape=[2, 2], dtype='float32') * 3 - vhp_rslt = paddle.autograd.vhp(func, [x, y], v=[vx, vy], allow_unused=True) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[8., 8.], - # [8., 8.]]), None]) - ''' - xs = as_tensors(inputs) - if v is not None: - v = as_tensors(v) - xs, v = _separate(xs), _separate(v) - outputs = func(*xs) - ys = as_tensors(outputs) - assert len(ys) == 1 and isinstance( - ys[0], framework.Variable - ) and ys[0].shape == [ - 1 - ], "The function to compute vhp should return a Tensor with a single element" - jac = _grad(ys, xs) - vhp = _grad(jac, xs, v) - return outputs, vhp diff --git a/python/paddle/common_ops_import.py b/python/paddle/common_ops_import.py index de8056f280a39..d8fdac59df48f 100644 --- a/python/paddle/common_ops_import.py +++ b/python/paddle/common_ops_import.py @@ -14,7 +14,7 @@ from six.moves import reduce from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode, _varbase_creator +from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode, _varbase_creator, in_dygraph_mode, _in_legacy_dygraph from paddle.fluid.framework import device_guard, default_main_program, dygraph_only, _dygraph_tracer from paddle.fluid.framework import OpProtoHolder, Variable from paddle.fluid.initializer import Constant diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 4fcf9c5d21b26..aa959150cec3c 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -230,7 +230,10 @@ def _convert_to_place(device): device_id = int(selected_mlus[0]) place = core.MLUPlace(device_id) elif device in core.get_all_custom_device_type(): - place = core.CustomPlace(device, 0) + selected_devices = os.getenv("FLAGS_selected_{}s".format(device), + "0").split(",") + device_id = int(selected_devices[0]) + place = core.CustomPlace(device, device_id) else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 003a14799c53e..ab83e2929e4bc 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -41,6 +41,14 @@ from .collective import get_group # noqa: F401 from .collective import send # noqa: F401 from .collective import wait # noqa: F401 +from .collective import is_initialized # noqa: F401 +from .collective import destroy_process_group # noqa: F401 +from .collective import alltoall_single # noqa: F401 +from 
.collective import isend # noqa: F401 +from .collective import irecv # noqa: F401 +from .collective import batch_isend_irecv # noqa: F401 +from .collective import P2POp # noqa: F401 +from .collective import reduce_scatter # noqa: F401 from .auto_parallel import shard_op # noqa: F401 from .auto_parallel import shard_tensor # noqa: F401 @@ -59,33 +67,11 @@ from .sharding import * # noqa: F401 __all__ = [ # noqa - "spawn", - "launch", - "scatter", - "broadcast", - "ParallelEnv", - "new_group", - "init_parallel_env", - "gloo_init_parallel_env", - "gloo_barrier", - "gloo_release", - "QueueDataset", - "split", - "CountFilterEntry", - "ShowClickEntry", - "get_world_size", - "get_group", - "all_gather", - "InMemoryDataset", - "barrier", - "all_reduce", - "alltoall", - "send", - "reduce", - "recv", - "ReduceOp", - "wait", - "get_rank", - "ProbabilityEntry", - "ParallelMode", + "spawn", "launch", "scatter", "broadcast", "ParallelEnv", "new_group", + "init_parallel_env", "gloo_init_parallel_env", "gloo_barrier", + "gloo_release", "QueueDataset", "split", "CountFilterEntry", + "ShowClickEntry", "get_world_size", "get_group", "all_gather", + "InMemoryDataset", "barrier", "all_reduce", "alltoall", "send", "reduce", + "recv", "ReduceOp", "wait", "get_rank", "ProbabilityEntry", "ParallelMode", + "is_initialized", "isend", "irecv", "reduce_scatter" ] diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py index e70b29dbe3931..e17f83eb41907 100644 --- a/python/paddle/distributed/auto_parallel/cluster.py +++ b/python/paddle/distributed/auto_parallel/cluster.py @@ -16,6 +16,7 @@ import json from enum import IntEnum from enum import unique +import paddle @unique @@ -138,7 +139,7 @@ def __repr__(self): class Link: default_hop = 1 - default_nic_bandwith = 24 + default_nic_bandwidth = 24 def __init__(self, source, target): self._src = source @@ -411,6 +412,174 @@ def __init__(self): self._alpha_latency = None self._rank_to_device_id = {} self._device_id_to_rank = {} + # This property only be valid when the cluster consists of machines, + # which have the same number accelerators. 
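# Illustrative usage of the default-cluster helpers added below (a sketch,
# node/device counts are hypothetical): build a homogeneous two-machine,
# eight-GPU-per-machine cluster without a cluster JSON file.
#
#   cluster = Cluster()
#   cluster.gen_default_config_cluster(node_count=2, device_count=8)
#   cluster.get_num_machines()             # 2
#   cluster.get_num_devices_per_machine()  # 8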
+ self._num_devices_per_machine = None + + def gen_default_config_cluster(self, + gpu_model="V100", + cpu_model="6271C", + node_count=1, + device_count=1, + gpu_memory=32, + cpu_memory=503, + inter_bandwidth=24, + intra_bandwidth=235, + gpu_dp_gflops=7800, + gpu_sp_gflops=15700, + cpu_dp_gflops=75, + cpu_sp_gflops=150): + """Generate cluster by default config.""" + gpu_models = ["V100", "A100", "H100", "A2", "A10", "A16", "A30", "A40"] + xpu_models = ["XPU"] + npu_models = ["NPU"] + dcu_models = ["DCU"] + all_gpu_models = gpu_models + xpu_models + npu_models + dcu_models + assert gpu_model in all_gpu_models + self._num_devices_per_machine = device_count + + def _convert_to_type(gpu_model): + type = None + if gpu_model in gpu_models: + type = "GPU" + elif gpu_model in xpu_models: + type = "XPU" + elif gpu_model in npu_models: + type = "NPU" + elif gpu_model in dcu_models: + type = "DCU" + assert type is not None + + return type + + def _convert_to_model(gpu_model, gpu_memory): + model = None + if gpu_model == "V100": + model = "Tesla V100-SXM2-" + str(gpu_memory) + "GB" + assert model is not None + + return model + + def _convert_to_cpu_info(cpu_model): + arch, vendor, model = None, None, None + if cpu_model == "6271C": + arch = "x86_64" + vendor = "GenuineIntel" + model = "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G" + elif cpu_model == "6148": + arch = "x86_64" + vendor = "GenuineIntel" + model = "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40G" + assert arch is not None + assert vendor is not None + assert model is not None + + return arch, vendor, model + + cluster_info = {} + cluster_info["machines"] = [] + global_id = 0 + global_id_to_device_type = {} + global_id_to_node = {} + # NOTE: It will support NPU, XPU, DCU models in the future, it is just a fake value now + for i in range(node_count): + machine = {} + # NOTE: The hostname is host_0, host_1, ... 
+ machine["hostname"] = "host_" + str(i) + # NOTE: The addr is localhost, if need actual addr, it should be reset manually + machine["addr"] = "127.0.0.1" + # NOTE: The port is a default value + machine["port"] = 60009 + machine["links"] = [] + + devices = [] + local_id = 0 + + for j in range(device_count): + device = {} + global_id = global_id if i == 0 and j == 0 else global_id + 1 + + local_id += 1 + type = _convert_to_type(gpu_model) + model = _convert_to_model(gpu_model, gpu_memory) + dp_gflops = gpu_dp_gflops + sp_gflops = gpu_dp_gflops + memory = gpu_memory + + device["global_id"] = global_id + device["local_id"] = local_id + device["type"] = type + device["model"] = model + device["memory"] = memory + device["sp_gflops"] = sp_gflops + device["dp_gflops"] = dp_gflops + global_id_to_device_type[global_id] = type + global_id_to_node[global_id] = i + devices.append(device) + + # add cpu device and nic device, just one cpu + cpu_device = {} + arch, vendor, model = _convert_to_cpu_info(cpu_model) + sp_gflops = cpu_sp_gflops + dp_gflops = cpu_dp_gflops + global_id += 1 + local_id = 0 + memory = cpu_memory + type = "CPU" + cpu_device["arch"] = arch + cpu_device["vendor"] = vendor + cpu_device["model"] = model + cpu_device["sp_gflops"] = sp_gflops + cpu_device["dp_gflops"] = dp_gflops + cpu_device["global_id"] = global_id + cpu_device["local_id"] = local_id + cpu_device["memory"] = memory + cpu_device["type"] = type + global_id_to_node[global_id] = i + global_id_to_device_type[global_id] = type + devices.append(cpu_device) + + nic_device = {} + global_id += 1 + + # add NIC + type = "NIC" + width = 12.5 + ip = "127.0.0.1" + local_id = 0 + nic_device["type"] = type + nic_device["local_id"] = type + nic_device["global_id"] = global_id + global_id_to_device_type[global_id] = type + global_id_to_node[global_id] = i + devices.append(nic_device) + machine["devices"] = devices + cluster_info["machines"].append(machine) + + # build link + for i in range(0, global_id + 1): + for j in range(0, global_id + 1): + if i == j: + continue + node_id_i = global_id_to_node[i] + node_id_j = global_id_to_node[j] + device_type_i = global_id_to_device_type[i] + device_type_j = global_id_to_device_type[j] + link = {} + source_global_id = i + target_global_id = j + link["source_global_id"] = source_global_id + link["target_global_id"] = target_global_id + # the same node and device_type, set intra_bandwidth, NVL + if node_id_i == node_id_j and device_type_i == device_type_j: + link["type"] = "NVL" + link["bandwidth"] = intra_bandwidth + else: + link["type"] = "PHB" + link["bandwidth"] = inter_bandwidth + cluster_info["machines"][node_id_i]["links"].append(link) + + self._build_from_dict(cluster_info) @property def rank_to_device_id(self): @@ -473,9 +642,7 @@ def get_device(self, device_global_id): device = machine.devices[device_global_id] return device - def build_from_file(self, json_file_path): - with open(json_file_path) as json_file: - cluster_info = json.load(json_file) + def _build_from_dict(self, cluster_info): machines_info = cluster_info["machines"] for machine_info in machines_info: machine_id = self._generate_machine_id() @@ -533,6 +700,11 @@ def build_from_file(self, json_file_path): else: self._alpha_latecy = None + def build_from_file(self, json_file_path): + with open(json_file_path) as json_file: + cluster_info = json.load(json_file) + self._build_from_dict(cluster_info) + def _generate_machine_id(self): cur_machine_id = self._num_machines self._num_machines += 1 @@ -556,7 +728,7 @@ def 
get_beta(self, source_device_id, target_device_id): bandwidth = None # None means the source and target are not connected directly, set NIC in default if link is None: - bandwidth = Link.default_nic_bandwith + bandwidth = Link.default_nic_bandwidth else: bandwidth = link.bandwidth @@ -608,6 +780,15 @@ def get_involved_machine_count(self, device_ids): assert count > 0 return count + def get_num_machines(self): + return len(self._machines) + + def get_num_devices_per_machine(self): + # Only return the number of accelerators of each machine. + # All machines must has the same number of devices and same type of devices. + assert self._num_devices_per_machine + return self._num_devices_per_machine + def __str__(self): str = "" for machine in self.machines.values(): @@ -616,3 +797,29 @@ def __str__(self): def __repr__(self): return self.__str__() + + +def get_default_cluster(): + cluster = Cluster() + local_device_count = os.getenv("PADDLE_LOCAL_SIZE") + if local_device_count is None: + local_device_count = 1 + else: + local_device_count = int(local_device_count) + global_device_count = os.getenv("PADDLE_GLOBAL_SIZE") + if global_device_count is None: + node_count = 1 + else: + global_device_count = int(global_device_count) + assert global_device_count % local_device_count == 0 + node_count = int(global_device_count) // local_device_count + print("Node Count: ", + node_count, + "Local Device Size: ", + local_device_count, + "World size: ", + paddle.distributed.get_world_size(), + flush=True) + cluster.gen_default_config_cluster(node_count=node_count, + device_count=local_device_count) + return cluster diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py index 4455d6f66483b..deac76e45a8b0 100644 --- a/python/paddle/distributed/auto_parallel/cost/base_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/base_cost.py @@ -17,8 +17,12 @@ import paddle -from ..cluster import LinkType +from ..utils import _get_comm_group, _get_corresponding_rank from ..process_group import get_process_group +from ..cluster import LinkType +from ..dist_tensor import DistributedTensor +from ..utils import _get_idx_in_axis +from ..dist_tensor import DistributedTensor COMM_OP_TYPE = [ "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum", @@ -28,33 +32,22 @@ _g_op_cost_factory = {} -def build_comm_desc(op_type, group_ranks, dtype, shape, attrs=None): - desc = {} - desc["op"] = op_type - desc["group_ranks"] = group_ranks - desc["inputs"] = {"X": [(dtype, shape)]} - if attrs is not None: - desc["attrs"] = attrs - return desc - +def build_comp_desc_from_op(op): + """Build the description of computation op.""" + # NOTE: The desc is for serial op. 
+ from ..reshard import get_var_with_recursion -def _parse_op_to_desc(op, dist_context=None): desc = {} - desc["op"] = op.type + # The desc of concat op is {"op": "concat", "inputs": {"X": [(paddle.float32, [20, 20]), (paddle.float32, [20, 20])]}, "outputs": {"Out": [(paddle.float32, [20, 40])], "attrs": {"axis": -1}}} vars = op.block.vars + desc["op"] = op.type input_desc = OrderedDict() for input_name in op.input_names: var_name_list = op.input(input_name) var_desc = [] for var_name in var_name_list: - var = vars[var_name] - shape = None - if dist_context is not None: - dist_tensor = dist_context.get_dist_tensor_for_program(var) - shape = dist_tensor.local_sizes() - else: - shape = var.shape - assert shape is not None + var = get_var_with_recursion(var_name, op.block, op.block.program) + shape = var.shape var_desc.append((var.dtype, shape)) input_desc[input_name] = var_desc desc["inputs"] = input_desc @@ -64,14 +57,8 @@ def _parse_op_to_desc(op, dist_context=None): var_name_list = op.output(out_name) var_desc = [] for var_name in var_name_list: - var = vars[var_name] - shape = None - if dist_context is not None: - dist_tensor = dist_context.get_dist_tensor_for_program(var) - shape = dist_tensor.local_sizes() - else: - shape = var.shape - assert shape is not None + var = get_var_with_recursion(var_name, op.block, op.block.program) + shape = var.shape var_desc.append((var.dtype, shape)) output_desc[out_name] = var_desc desc["outputs"] = output_desc @@ -82,19 +69,101 @@ def _parse_op_to_desc(op, dist_context=None): return desc -def parse_to_desc(op=None, dist_op=None, dist_context=None): - desc = None - if op is None and dist_op is not None and dist_context is not None: - desc = _parse_op_to_desc(op=dist_op.serial_op, - dist_context=dist_context) - elif op is not None and dist_op is None and dist_context is None: - desc = _parse_op_to_desc(op) - - return desc - - -def parse_desc_to_str(desc): - +def build_comp_desc_from_dist_op(dist_op, dist_context): + """Build descriptions of computation op distributed on the processes.""" + from ..reshard import get_var_with_recursion + + op_descs = {} + op = dist_op.serial_op + dist_attr = dist_op.dist_attr + process_mesh = dist_attr.process_mesh + assert process_mesh, "Process mesh must not be None." + processes = process_mesh.processes + for process in processes: + desc = {} + desc["op"] = op.type + attr_desc = op.all_attrs() + # NOTE: The attrs of desc is replica of serial op, there may be a bug if shape need to be partitioned involved in attrs. + desc["attrs"] = attr_desc + input_desc = OrderedDict() + output_desc = OrderedDict() + + # Get partitioned shape of input + for input_name in op.input_names: + var_name_list = op.input(input_name) + var_desc = [] + for var_name in var_name_list: + var = get_var_with_recursion(var_name, op.block, + op.block.program) + # Use op input_dims_mapping + dims_mapping = dist_attr.get_input_dims_mapping(var_name) + global_sizes = var.shape + # NOTE: When support uneven partition, the shard_sizes will be got from dist_attr. 
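# Worked example of the local-shape computation below (hypothetical values):
# a tensor with global_sizes [32, 1024], dims_mapping [0, -1] and a process
# mesh topology [4, 2] is sharded only along its first axis, so every rank
# gets the local shape [8, 1024].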
+ shard_sizes = None + topology = process_mesh.topology + shape = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, process, + shard_sizes) + var_desc.append((var.dtype, shape)) + + # For special op such as embedding and its grad op + if op.type == "c_embedding" or op.type == "lookup_table_v2" or op.type == "c_embedding_grad" or op.type == "lookup_table_v2_grad": + if input_name == "W": + embedding_row_dim_mapping = dist_attr.get_input_dims_mapping( + op.input(input_name)[0])[0] + relative_idx = _get_idx_in_axis( + processes, dist_attr.process_mesh.topology, + embedding_row_dim_mapping, process) + per_part_size = shape[0] + relative_idx = relative_idx * per_part_size + desc["attrs"]["start_index"] = relative_idx + + input_desc[input_name] = var_desc + desc["inputs"] = input_desc + + for out_name in op.output_names: + var_name_list = op.output(out_name) + var_desc = [] + for var_name in var_name_list: + # Use op output_dims_mapping + var = get_var_with_recursion(var_name, op.block, + op.block.program) + dist_attr = dist_op.dist_attr + dims_mapping = dist_attr.get_output_dims_mapping(var_name) + process_mesh = dist_attr.process_mesh + global_sizes = var.shape + shard_sizes = None + processes = process_mesh.processes + topology = process_mesh.topology + shape = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, process, + shard_sizes) + var_desc.append((var.dtype, shape)) + + # For special op such as fill_constant_batch_size_like + if op.type == "fill_constant_batch_size_like": + # Modify shape attr according to how output are partitioned + out_name = var_name_list[0] + dims_mapping = dist_attr.get_output_dims_mapping(out_name) + process_mesh_shape = dist_attr.process_mesh.topology + shape_list = op.attr("shape") + # Modify target shape + for idx, axis in enumerate(dims_mapping): + if axis >= 0: + shape_list[idx] = shape_list[ + idx] // process_mesh_shape[axis] + desc["attrs"]["shape"] = shape_list + output_desc[out_name] = var_desc + + desc["outputs"] = output_desc + + op_descs[process] = desc + + return op_descs + + +def build_comp_desc_str_for_predict(desc): + # NOTE: The description format may change in the future. def _parse_dtype(dtype): dtype_str = "" if dtype == paddle.float32: @@ -135,8 +204,208 @@ def _parse_dtype(dtype): shape_str = "[" + ",".join(shape_list) + "]" desc_str_list += [dtype_str, dims_str, shape_str] desc_str = "_".join(desc_str_list) + attrs = desc["attrs"] + parse_result = (desc_str, attrs) + return parse_result + + +def build_comm_desc_from_dist_op(op_type, + dist_op, + ctx, + var_names, + attrs=None, + parallel_axis=None, + group_ranks=None): + """Build descriptions of communication op distributed on the processes.""" + from ..reshard import get_var_with_recursion + + specific_op_type = [] + dist_attr = dist_op.dist_attr + assert dist_attr, "Dist attr must not be None." + process_mesh = dist_attr.process_mesh + assert process_mesh, "Process mesh must not be None." + + processes = process_mesh.processes + op_descs = {} + for process in processes: + rank_id = process + desc = {} + desc["op"] = op_type + op_attrs = None + comm_group_ranks = None + + if op_type not in specific_op_type: + serial_op = dist_op.serial_op + input_list = [] + # The var_names usually contain just one item. 
+ for var_name in var_names: + dist_attr = dist_op.dist_attr + has_found = False + # Find var_name in serial op input or output + for name in dist_op.serial_op.input_arg_names: + # If a tensor is the input of multi ops, sum the grad of all ops, so the name will be varname@RENAME@block@0 and so on. + if var_name in name: + var_name = name + has_found = True + break + + if not has_found: + for name in dist_op.serial_op.output_arg_names: + if var_name in name: + var_name = name + has_found = True + break + assert has_found + var = get_var_with_recursion(var_name, serial_op.block, + serial_op.block.program) + + dims_mapping = dist_attr.get_input_dims_mapping( + var_name + ) if var_name in dist_op.serial_op.input_arg_names else dist_attr.get_output_dims_mapping( + var_name) + global_sizes = var.shape + shard_sizes = None + topology = process_mesh.topology + shape = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, process, + shard_sizes) + input_list.append((var.dtype, shape)) + + # NOTE: The input_name of comm ops used usually is X. + desc["inputs"] = {"X": input_list} + + # Get comm group by parallel_axis or the given group_ranks. + if parallel_axis is not None: + process_mesh_shape = process_mesh.topology + process_mesh_group = process_mesh.processes + comm_group_ranks = _get_comm_group(process_mesh_group, + process_mesh_shape, + parallel_axis, rank_id) + elif group_ranks is not None: + comm_group_ranks = group_ranks + else: + raise ValueError( + "The parallel_axis and group_ranks can not be None in the same." + ) + + if attrs is not None: + assert isinstance(attrs, dict) + op_attrs = attrs + else: + op_attrs = {} + + desc["attrs"] = op_attrs + desc["group_ranks"] = comm_group_ranks + + op_descs[rank_id] = desc + + return op_descs + + +def build_comm_desc(op_type, group_ranks, dtype, shape, attrs=None): + """Build a comm desc directly.""" + desc = {} + desc["op"] = op_type + desc["group_ranks"] = group_ranks + desc["inputs"] = {"X": [(dtype, shape)]} + desc["attrs"] = attrs + return desc + - return desc_str +def build_comm_costs_from_descs(op_cost_class, ctx, processes, descs, cluster): + """Build comm costs by descriptions""" + comm_context = CommContext(cluster) + group_ranks_list = [] + comm_op_cost_list = [] + for process in processes: + desc = descs[process] + group_ranks = desc["group_ranks"] + if group_ranks not in group_ranks_list: + group_ranks_list.append(group_ranks) + comm_op_cost = op_cost_class(op_desc=desc, + comm_context=comm_context) + comm_op_cost_list.append(comm_op_cost) + return comm_op_cost_list + + +def build_comp_costs_from_descs(op_cost_class, ctx, processes, descs, cluster): + """Build comp costs by descriptions.""" + costs = {} + for process in processes: + costs[process] = op_cost_class(op_desc=descs[process], cluster=cluster) + return costs + + +def build_dp_costs(result, dist_op, ctx, var_names, attrs, parallel_axis, + cluster): + """DP cost contains a allreduce_sum op cost and a scale op cost""" + # The costs will be appended in the given result. 
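# Concretely, the entries appended to ``result`` are: first the list of
# c_allreduce_sum costs (one per distinct communication group), then, for
# each group, a per-rank dict of scale costs whose ``scale`` attr is
# 1.0 / dp_degree (e.g. 0.25 for a data-parallel group of 4 ranks).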
+ from ..reshard import get_var_with_recursion + + dist_attr = dist_op.dist_attr + process_mesh = dist_attr.process_mesh + processes = process_mesh.processes + assert len(var_names) == 1 + vars = dist_op.serial_op.block.vars + var_name = var_names[0] + has_found = False + for name in dist_op.serial_op.input_arg_names: + if var_name in name: + var_name = name + has_found = True + break + + if not has_found: + for name in dist_op.serial_op.output_arg_names: + if var_name in name: + var_name = name + has_found = True + break + if not has_found: + return + + c_allreduce_sum_descs = build_comm_desc_from_dist_op( + "c_allreduce_sum", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + comm_cost_list = build_comm_costs_from_descs( + _g_op_cost_factory["c_allreduce_sum"], ctx, processes, + c_allreduce_sum_descs, cluster) + result.append(comm_cost_list) + + # The scale op just on the group_ranks + for comm_cost in comm_cost_list: + group_ranks = comm_cost.group_ranks + dp_degree = len(group_ranks) + scale_costs = {} + op_type = "scale" + for rank in group_ranks: + desc = {} + desc["op"] = op_type + desc["inputs"] = {} + dims_mapping = dist_attr.get_input_dims_mapping( + var_name) if dist_attr.get_input_dims_mapping( + var_name + ) is not None else dist_attr.get_output_dims_mapping(var_name) + var = get_var_with_recursion(var_name, dist_op.serial_op.block, + dist_op.serial_op.block.program) + global_sizes = var.shape + shard_sizes = None + topology = process_mesh.topology + shape = DistributedTensor.get_local_sizes(global_sizes, + dims_mapping, topology, + processes, rank, + shard_sizes) + desc["inputs"]["X"] = [(var.dtype, shape)] + attrs = {"scale": 1.0 / dp_degree} + desc["attrs"] = attrs + scale_op_cost = _g_op_cost_factory["scale"](op_desc=desc, + cluster=cluster) + scale_costs[rank] = scale_op_cost + result.append(scale_costs) class CommContext: @@ -174,6 +443,8 @@ def _post_init(self): # set default self.base_ring = 8.4 self.base_tree = 0. + # self.base_inter_ring = 9.6 + # self.base_inter_tree = 28 # NVL in default self.intra_ring = 3.4 self.intra_tree = 28 @@ -441,6 +712,8 @@ def comm_context(self): @property def comm_count(self): + from ..reshard import get_var_with_recursion + if self._comm_count is None: dtype = None shape = None @@ -448,7 +721,8 @@ def comm_count(self): vars = self.op.block.vars # NOTE: The tensor communicated input_name is "X" in default. 
Otherwise, this function should be overrided var_name = self.op.input("X")[0] - var = vars[var_name] + var = get_var_with_recursion(var_name, self.op.block, + self.program) dtype = var.dtype shape = var.shape elif self.op_desc is not None: @@ -464,9 +738,10 @@ def comm_count(self): factor = 1 elif dtype == paddle.float16: factor = 2 + elif dtype == paddle.bool: + factor = 8 else: - raise TypeError( - "This dtype {} is not supported now".format(dtype)) + raise ValueError("Unsupported comm dtype {}".format(dtype)) comm_count = reduce(lambda x, y: x * y, shape) * factor self._comm_count = comm_count diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 2f57b0ac0e415..04b7f6aded7a5 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -125,6 +125,9 @@ def __init__(self, # A flag indicates whether the used parallelism is data parallel self._data_parallel = False + # flag whether using `to_static` + self._dygraph_mode = True + @property def serial_main_program(self): return self._serial_main_program diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 5e4a8c7d04033..72a377603edc7 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -15,13 +15,14 @@ import copy import logging from collections import defaultdict +import socket import paddle import paddle.utils as utils -import paddle.distributed.auto_parallel as auto from paddle import fluid, static from paddle.io import Dataset +from paddle.jit import to_static from paddle.metric import Metric from paddle.static import InputSpec from paddle.fluid import core @@ -29,14 +30,15 @@ from paddle.fluid.layers.utils import flatten from paddle.fluid.executor import global_scope, _to_name_str from paddle.fluid.backward import append_backward -from paddle.fluid.framework import Operator +from paddle.fluid.framework import Operator, Parameter, _non_static_mode from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed import fleet from paddle.distributed.utils import get_logger from paddle.distributed.passes import new_pass, PassContext -# from .cluster import Cluster, get_default_cluster +from ..collective import _get_global_env +from .cluster import Cluster, get_default_cluster from .planner_v2 import Planner from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator @@ -60,8 +62,8 @@ def __init__(self, self.inputs_spec = self._validate_spec(inputs_spec) self.labels_spec = self._validate_spec(labels_spec) self.cluster = cluster - # if self.cluster is None: - # self.cluster = get_default_cluster() + if self.cluster is None: + self.cluster = get_default_cluster() self.strategy = strategy if self.strategy is None: self.strategy = fleet.DistributedStrategy() @@ -72,7 +74,6 @@ def __init__(self, self._saver = DistributedSaver() self._logger = get_logger(logging.INFO) - self._default_strategy = None self._orig_main_prog = static.default_main_program() self._orig_startup_prog = static.default_startup_program() self._orig_dist_context = get_default_distributed_context() @@ -84,6 +85,12 @@ def __init__(self, self._feed_vars = {} self._fetch_vars = {} self._planners = {} + self._mode_init_states = { + "train": False, + "eval": False, + "predict": False + } + 
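+ # Records whether prepare/parallelization has already run for each mode; evaluate() and predict() lazily prepare their mode when the flag is still False.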
self._dygraph_mode = False def prepare(self, optimizer=None, @@ -99,6 +106,7 @@ def prepare(self, " or `paddle.fluid.optimizer.Optimizer`." ) self._optimizer = optimizer + self._all_ranks = all_ranks if loss and not isinstance(loss, paddle.nn.Layer) and not callable(loss): @@ -114,23 +122,108 @@ def prepare(self, metric.__class__.__name__) self._metrics = to_list(metrics) self._gradient_scale = gradient_scale - self._planned_mode = None - self._modes = ['train', 'eval', 'predict'] - self._build() + self._prepare_single_mode("train") + def _prepare_single_mode(self, mode): + self._modes = [mode] + self._build(self._modes[0]) # Do auto parallel process for mode in self._modes: # Do the planning process self._plan(mode) for mode in self._modes: # Do the parallel process - self._parallel(mode, all_ranks) + self._parallel(mode, self._all_ranks) + # Init comm and startup program self._initialize(mode) + self._mode_init_states[mode] = True + + def _build(self, mode): + + if _non_static_mode() or self._dygraph_mode: + self._dygraph_mode = True + self._logger.info("Building model with 'to_static' method.") + + # build forward main program + self.static_model = to_static(self.model, + input_spec=self.inputs_spec) + inputs = self.static_model.forward.inputs + outputs = self.static_model.forward.outputs + forward_main_prog = self.static_model.forward.main_program + forward_startup_prog = self.static_model.forward.concrete_program.startup_program + self.concrete_program = self.static_model.forward.concrete_program + + # build loss main program + outputs_spec = [] + outputs_name = [] + for out in outputs: + outputs_spec.append(InputSpec(out.shape, out.dtype, out.name)) + outputs_name.append(out.name) + if isinstance(self._loss, paddle.nn.Layer): + self.static_loss = to_static(self._loss.forward, + input_spec=outputs_spec + + self.labels_spec) + loss_main_prog = self.static_loss.main_program + elif callable(self._loss): + self.static_loss = to_static(self._loss, + input_spec=outputs_spec + + self.labels_spec) + loss_main_prog = self.static_loss.main_program + + # build startup program + for param in self.concrete_program.parameters: + Parameter(name=param.name, + desc=param, + type=param.type, + shape=param.shape, + dtype=param.dtype, + stop_gradient=param.stop_gradient, + block=forward_startup_prog.global_block()) + + paddle.enable_static() + + # NOTE: pure program will loss dist_attr + # feeded_var_names = [var.name for var in inputs] + # main_prog_0 = main_prog_0._prune_with_input( + # feeded_var_names=feeded_var_names, targets=outputs) + + labels = [] + losses = [] + metrics = [] + # concat forward and loss prog + if mode != 'predict' and self._loss: + forward_block = forward_main_prog.global_block() + loss_block = loss_main_prog.global_block() + for idx, op in enumerate(loss_block.ops): + op_desc = forward_block.desc.append_op() + op_desc.copy_from(op.desc) + for in_name in op.input_arg_names: + if in_name in outputs_name: + continue + in_var = forward_block._clone_variable( + loss_block.vars[in_name], force_persistable=False) + if loss_block.vars[in_name].is_data: + labels.append(in_var) + for out_name in op.output_arg_names: + out_var = forward_block._clone_variable( + loss_block.vars[out_name], force_persistable=False) + if idx == len(loss_block.ops) - 1: + losses.append(out_var) + forward_block._sync_with_cpp() + serial_main_prog = forward_main_prog + serial_startup_prog = forward_startup_prog + # update metrics op in program + with static.program_guard(serial_main_prog, serial_startup_prog), \ 
+ utils.unique_name.guard(): + if mode != "predict": + for metric in self._metrics: + metrics.extend( + to_list(metric.compute(*(outputs + labels)))) - def _build(self): - for mode in self._modes: + else: + # build program in static mode serial_main_prog = self._serial_main_progs.get(mode, None) if serial_main_prog is not None: return @@ -154,31 +247,26 @@ def _build(self): metrics.extend( to_list(metric.compute(*(outputs + labels)))) - default_ctx = get_default_distributed_context() - if not default_ctx.has_annotation or self._default_strategy: - # We build the world process group because the data parallel - # needs all ranks by default. - new_process_group(list(range(self._nranks))) - default_ctx.data_parallel = True - - # self._feed_vars[mode] = {"inputs": inputs, "labels": labels} - feed_vars = {"inputs": inputs, "labels": labels} - - # self._fetch_vars[mode] = { - # "outputs": flatten(outputs), - # "loss": losses, - # "metrics": metrics - # } - fetch_vars = { - "outputs": flatten(outputs), - "loss": losses, - "metrics": metrics - } - - self._dist_contexts[mode] = DistributedContext( - serial_main_prog, serial_startup_prog, self._optimizer, losses, - feed_vars, fetch_vars, self.cluster, self.strategy) - self._dist_contexts[mode].gradient_scale = self._gradient_scale + default_ctx = get_default_distributed_context() + if not default_ctx.has_annotation: + # We build the world process group because the data parallel + # needs all ranks by default. + new_process_group(list(range(self._nranks))) + default_ctx.data_parallel = True + + feed_vars = {"inputs": inputs, "labels": labels} + + fetch_vars = { + "outputs": flatten(outputs), + "loss": losses, + "metrics": metrics + } + + self._dist_contexts[mode] = DistributedContext( + serial_main_prog, serial_startup_prog, self._optimizer, losses, + feed_vars, fetch_vars, self.cluster, self.strategy) + self._dist_contexts[mode].gradient_scale = self._gradient_scale + self._dist_contexts[mode]._dygraph_mode = self._dygraph_mode def _plan(self, mode): if self._planned_mode is None: @@ -235,15 +323,45 @@ def _initialize(self, mode): # Traverse different rank programs and traverse each op of them, # instantiate communication by process_mapping. 
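+ # Only the groups that contain the current rank are instantiated; all other groups are skipped on this rank.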
all_process_groups = get_all_process_groups() + + # NOTE: add the comm init control in the future for auto search for process_group in all_process_groups: if self._cur_rank not in process_group.ranks: continue process_group.instantiate() - # initialize self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): self._place = fluid.CUDAPlace(ParallelEnv().dev_id) + + if self._dygraph_mode: + paddle.disable_static() + main_program = self._dist_main_progs[mode][self._cur_rank] + for param in self.concrete_program.parameters: + # create var in scope and share parameters to scope + if param.name not in main_program.global_block().vars: + continue + # get param_var's dist_attr + var = main_program.global_block().vars[param.name] + var_dist_attr = self._dist_contexts[ + mode].get_tensor_dist_attr_for_program(var) + dist_attr = { + "dims_mapping": var_dist_attr.dims_mapping, + "process_shape": var_dist_attr.process_mesh.topology, + "process_group": var_dist_attr.process_mesh.processes + } + # slice param_value with dist_attr + # share sliced_param_value with param_tensor in global_scope + from .converter import Converter + param_tensor = global_scope().var(param.name).get_tensor() + sliced_param = Converter.slice_with_dist_attr( + param.numpy(), dist_attr) + shared_tensor = paddle.to_tensor(sliced_param, + place=self._place) + param_tensor._share_data_with( + shared_tensor.value().get_tensor()) + paddle.enable_static() + if self._executor is None: self._executor = paddle.static.Executor(self._place) uninitialized = [] @@ -267,14 +385,20 @@ def fit(self, return_numpy=True): # TODO: callbacks # TODO: evaluate after training + + if not self._mode_init_states['train']: + raise Exception( + "train program is not initialized yet, please call engine.prepare() before calling fit() funtion." + ) + self.mode = 'train' assert self.mode in self._dist_main_progs, \ "train model is not ready, please call `engine.prepare()` first." train_dataloader = self._create_dataloader(train_data, batch_size, epochs, steps_per_epoch) - usr_fetch = self._to_map_fetch(fetches) - fetch_loss = self._inner_fetch(self.fetch_vars["loss"]) + usr_fetch = self._validate_fetches(fetches) + fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) fetch_list, fetch_map = self._fetch_map(fetch_loss, usr_fetch) for epoch in range(epochs): @@ -292,8 +416,7 @@ def fit(self, user_outs = outs[len(fetch_loss):] user_fetch_list = fetch_list[len(fetch_loss):] for i, out in enumerate(user_outs): - train_logs["train_" + - fetch_map[user_fetch_list[i]]] = out[0] + train_logs["train_" + fetch_map[user_fetch_list[i]]] = out self._logger.info(train_logs) def evaluate(self, @@ -303,13 +426,16 @@ def evaluate(self, use_program_cache=False, return_numpy=True): self.mode = 'eval' + if not self._mode_init_states[self.mode]: + self._prepare_single_mode(self.mode) + assert self.mode in self._dist_main_progs, \ "eval model is not ready, please call `engine.prepare()` first." 
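+ # Build the eval dataloader, then merge user fetches with the internal loss/metric fetches; _validate_fetches keeps only variables that exist in this rank's local program.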
eval_dataloader = self._create_dataloader(eval_data, batch_size) - usr_fetch = self._to_map_fetch(fetches) - fetch_loss = self._inner_fetch(self.fetch_vars["loss"]) - fetch_metrics = self._inner_fetch(self.fetch_vars["metrics"]) + usr_fetch = self._validate_fetches(fetches) + fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) + fetch_metrics = self._validate_fetches(self.fetch_vars["metrics"]) inner_fetch = dict(fetch_loss, **fetch_metrics) fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) @@ -321,7 +447,7 @@ def evaluate(self, return_numpy=return_numpy) # inner fetches if fetch_loss: - eval_logs["eval_loss"] = outs[0] + eval_logs["eval_loss"] = outs[0][0] # Metric if fetch_metrics: metric_out = outs[len(fetch_loss):len(inner_fetch)] @@ -331,9 +457,9 @@ def evaluate(self, for i, res in enumerate(to_list(results)): eval_logs["eval_" + metric.name()[i]] = res # usr fetches - usr_out = outs[len(inner_fetch):] + usr_outs = outs[len(inner_fetch):] usr_fetch_list = fetch_list[len(inner_fetch):] - for i, out in enumerate(usr_out): + for i, out in enumerate(usr_outs): eval_logs["eval_" + fetch_map[usr_fetch_list[i]]] = out # logger self._logger.info(eval_logs) @@ -345,12 +471,15 @@ def predict(self, use_program_cache=False, return_numpy=True): self.mode = 'predict' + if not self._mode_init_states[self.mode]: + self._prepare_single_mode(self.mode) + assert self.mode in self._dist_main_progs, \ "predict model is not ready, please call `engine.prepare()` first." test_dataloader = self._create_dataloader(test_data, batch_size) - usr_fetch = self._to_map_fetch(fetches) - fetch_outputs = self._inner_fetch(self.fetch_vars["outputs"]) + usr_fetch = self._validate_fetches(fetches) + fetch_outputs = self._validate_fetches(self.fetch_vars["outputs"]) fetch_list, fetch_map = self._fetch_map(fetch_outputs, usr_fetch) outputs = [] @@ -362,42 +491,11 @@ def predict(self, return_numpy=return_numpy) outputs.append(outs[:len(fetch_outputs)]) for i, out in enumerate(outs): - predict_logs["pred_" + fetch_map[fetch_list[i]]] = out[0] + predict_logs["pred_" + fetch_map[fetch_list[i]]] = out self._logger.info(predict_logs) return outputs - def _local_var(self, var): - var_name = _to_name_str(var) - return var_name in self.main_program.global_block().vars - - def _to_map_fetch(self, fetches): - if not fetches: - return {} - if isinstance(fetches, dict): - fetch_var_names = list(map(_to_name_str, fetches.values())) - usr_fetches = dict(zip(fetch_var_names, list(fetches.keys()))) - elif isinstance(fetches, list): - fetch_var_names = list(map(_to_name_str, fetches)) - usr_fetches = dict(zip(fetch_var_names, fetch_var_names)) - return dict(filter(lambda x: self._local_var(x[0]), - usr_fetches.items())) - - def _inner_fetch(self, fetch_vars): - fetch_list = list( - map(lambda x: x.name, list(filter(self._local_var, fetch_vars)))) - inner_fetches = dict(zip(fetch_list, fetch_list)) - return inner_fetches - - def _fetch_map(self, inner_fetch, usr_fetch): - # replace inner fetch name if usr set for it - for iname in inner_fetch: - if iname in usr_fetch: - inner_fetch[iname] = usr_fetch[iname] - usr_fetch.pop(iname) - fetches = dict(inner_fetch, **usr_fetch) - return list(fetches.keys()), fetches - def _create_dataloader(self, dataset, batch_size, @@ -468,26 +566,35 @@ def _validate_spec(self, specs): .format(i, spec)) return specs - def _set_data_parallel(self, var): - if self._nranks == 1: - self._default_strategy = 'serial' - auto.shard_tensor(var, - dist_attr={ - "process_mesh": [0], - "dims_mapping": 
- [-1 for _ in range(len(var.shape))] - }) + def _is_local_var(self, var): + var_name = _to_name_str(var) + return var_name in self.main_program.global_block().vars + + def _validate_fetches(self, fetches): + # 1. Check user-defined fetches type + # 2. Prepare fetches_dict like {user_defined_name: var_name} + if not fetches: + return {} + if isinstance(fetches, dict): + fetch_var_names = list(map(_to_name_str, fetches.values())) + fetches_dict = dict(zip(fetch_var_names, list(fetches.keys()))) + elif isinstance(fetches, list): + fetch_var_names = list(map(_to_name_str, fetches)) + fetches_dict = dict(zip(fetch_var_names, fetch_var_names)) else: - self._default_strategy = 'dp' - auto.shard_tensor(var, - dist_attr={ - "process_mesh": - list(range(self._nranks)), - "dims_mapping": - [0] + [-1 for _ in range(len(var.shape) - 1)] - }) - - return var + raise TypeError("'fetches' only support 'dict' and 'list', " + "but got '{}'".format(str(type(fetches)))) + return dict( + filter(lambda x: self._is_local_var(x[0]), fetches_dict.items())) + + def _fetch_map(self, inner_fetch, usr_fetch): + # replace inner fetch name if usr set for it + for iname in inner_fetch: + if iname in usr_fetch: + inner_fetch[iname] = usr_fetch[iname] + usr_fetch.pop(iname) + fetches = dict(inner_fetch, **usr_fetch) + return list(fetches.keys()), fetches def _get_data_parallel_info(self, var, dist_context): # get data parallel world size and current data parallel rank diff --git a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py index 0a4bfb1213d46..108b99fdce613 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py @@ -113,12 +113,11 @@ def backward(ctx, *args, **kwargs): filter_vars.append(varname) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(backward_op.desc) set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) dist_op_desc.set_input('X', filter_vars) dist_op_desc.set_output('Out', filter_vars) - main_block._sync_with_cpp() # sync result group = new_process_group(world_process_group.ranks) @@ -137,7 +136,7 @@ def backward(ctx, *args, **kwargs): attrs={ "in_dtype": inf_var.dtype, "out_dtype": inf_var_int32.dtype, - OP_ROLE_KEY: OpRole.Backward + OP_ROLE_KEY: OpRole.Optimize }) allreduce_op = main_block.append_op(type='c_allreduce_max', inputs={'X': inf_var_int32}, @@ -145,7 +144,7 @@ def backward(ctx, *args, **kwargs): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Backward + OP_ROLE_KEY: OpRole.Optimize }) cast_op2 = main_block.append_op(type='cast', inputs={'X': inf_var_int32}, @@ -153,9 +152,8 @@ def backward(ctx, *args, **kwargs): attrs={ "in_dtype": inf_var_int32.dtype, "out_dtype": inf_var.dtype, - OP_ROLE_KEY: OpRole.Backward + OP_ROLE_KEY: OpRole.Optimize }) - main_block._sync_with_cpp() for op in [cast_op1, allreduce_op, cast_op2]: new_op_dist_attr = OperatorDistributedAttribute() diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index a2b1b7826d51f..9d9d5371aca3e 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -363,7 
+363,7 @@ def forward(ctx, *args, **kwargs): output_name) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): @@ -371,8 +371,6 @@ def forward(ctx, *args, **kwargs): for output_name in src_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) - main_block._sync_with_cpp() - # data parallel synchronization for primtive operators from paddle.incubate.autograd import prim_enabled if prim_enabled(): @@ -431,8 +429,6 @@ def forward(ctx, *args, **kwargs): op_attr.set_input_dims_mapping(param.name, dims_mapping) ctx.set_op_dist_attr_for_program(new_op, op_attr) - startup_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): @@ -461,7 +457,7 @@ def backward(ctx, *args, **kwargs): output_name) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(backward_op.desc) # Refer to the related dist op set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) @@ -470,8 +466,6 @@ def backward(ctx, *args, **kwargs): for output_name in backward_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) - main_block._sync_with_cpp() - # check if need gradient allreduce # if there is a non-gradient & non-parameter input and its batch dimension is splited, # we need insert gradient allreduce for the gradient of parameter in its output @@ -552,8 +546,6 @@ def backward(ctx, *args, **kwargs): dims_mapping) ctx.set_op_dist_attr_for_program(op, op_attr) - main_block._sync_with_cpp() - register_distributed_operator_impl( "default", DistributedDefaultImpl0("replicate_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 2272400e60ddf..aa463398139ba 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -222,7 +222,10 @@ def forward(ctx, *args, **kwargs): 'W': [Weight_var] }, outputs={'Out': [intermediate_var_0]}, - attrs={"start_index": relative_idx}) + attrs={ + "start_index": relative_idx, + OP_ROLE_KEY: src_op.attr('op_role') + }) if intermediate_var_0.shape != ref_shape: intermediate_var_0.desc.set_shape(ref_shape) @@ -235,6 +238,7 @@ def forward(ctx, *args, **kwargs): 'ring_id': group.id, 'use_calc_stream': True, 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if Out_var.shape != ref_shape: Out_var.desc.set_shape(ref_shape) @@ -308,7 +312,6 @@ def forward(ctx, *args, **kwargs): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) - startup_block._sync_with_cpp() @staticmethod def backward(ctx, *args, **kwargs): @@ -408,8 +411,7 @@ def backward(ctx, *args, **kwargs): set_comm_op_dist_attr_for_program(c_identity_op, dist_attr.process_mesh, out_grad_dist_attr, ctx) - main_block._sync_with_cpp() - c_embedding_grad_op_desc = main_block.desc.append_op() + c_embedding_grad_op_desc = main_block.append_op(type='nop').desc c_embedding_grad_op_desc.set_type("c_embedding_grad") c_embedding_grad_op_desc.set_input('Ids', [Ids_var.name]) c_embedding_grad_op_desc.set_input('W', [Weight_var.name]) @@ -418,7 +420,6 @@ def backward(ctx, *args, **kwargs): c_embedding_grad_op_desc.set_output('W@GRAD', 
[Weight_grad.name]) c_embedding_grad_op_desc._set_attr('start_index', relative_idx) c_embedding_grad_op_desc._set_attr(OP_ROLE_KEY, OpRole.Backward) - main_block._sync_with_cpp() c_embedding_grad_op = main_block.ops[-1] assert c_embedding_grad_op.type == "c_embedding_grad" @@ -442,6 +443,7 @@ def backward(ctx, *args, **kwargs): dp_group = new_process_group(group_ranks) if need_gradient_allreduce: + added_ops = [] W_Grad_var = main_block.var(kwargs['W@GRAD'][0]) allreduce_op = main_block.append_op(type='c_allreduce_sum', inputs={'X': [W_Grad_var]}, @@ -451,19 +453,24 @@ def backward(ctx, *args, **kwargs): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Backward }) - scale_op = main_block.append_op(type='scale', - inputs={'X': W_Grad_var}, - outputs={'Out': W_Grad_var}, - attrs={ - 'scale': 1.0 / dp_degree, - OP_ROLE_KEY: OpRole.Backward - }) + added_ops.append(allreduce_op) + + if ctx.gradient_scale: + scale_op = main_block.append_op(type='scale', + inputs={'X': W_Grad_var}, + outputs={'Out': W_Grad_var}, + attrs={ + 'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward + }) + added_ops.append(scale_op) + main_block._sync_with_cpp() dims_mapping = ctx.get_tensor_dist_attr_for_program( W_Grad_var).dims_mapping process_mesh = dist_attr.process_mesh - for op in [allreduce_op, scale_op]: + for op in added_ops: op_attr = OperatorDistributedAttribute() op_attr.process_mesh = process_mesh op_attr.set_output_dims_mapping(W_Grad_var.name, dims_mapping) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py index 763e47802b333..27e8983707b72 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py @@ -118,7 +118,6 @@ def forward(ctx, *args, **kwargs): shape_list[idx] = shape_list[idx] // process_mesh_shape[axis] op._set_attr("shape", shape_list) - main_block._sync_with_cpp() @staticmethod def backward(ctx, *args, **kwargs): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 427932a77fbcd..4e9aefd168c4f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -38,7 +38,7 @@ def copy_op_with_new_input_output(ctx, block, src_op, **kwargs): - dist_op_desc = block.desc.append_op() + dist_op_desc = block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): @@ -48,7 +48,6 @@ def copy_op_with_new_input_output(ctx, block, src_op, **kwargs): assert input_name in kwargs dist_op_desc.set_output(output_name, kwargs[output_name]) - block._sync_with_cpp() return dist_op_desc @@ -387,8 +386,6 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): matmul_op_desc = copy_op_with_new_input_output(ctx, main_block, backward_op, **kwargs) - main_block._sync_with_cpp() - # check if need gradient allreduce need_gradient_allreduce = False @@ -405,6 +402,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): dp_group = new_process_group(group_ranks) if need_gradient_allreduce and is_parameter_related(Y_var.name, main_block): + added_ops = [] Y_Grad_var = main_block.var(kwargs['Y@GRAD'][0]) allreduce_op = 
main_block.append_op(type='c_allreduce_sum', inputs={'X': [Y_Grad_var]}, @@ -414,19 +412,24 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Backward }) - scale_op = main_block.append_op(type='scale', - inputs={'X': Y_Grad_var}, - outputs={'Out': Y_Grad_var}, - attrs={ - 'scale': 1.0 / dp_degree, - OP_ROLE_KEY: OpRole.Backward - }) + added_ops.append(allreduce_op) + + if ctx.gradient_scale: + scale_op = main_block.append_op(type='scale', + inputs={'X': Y_Grad_var}, + outputs={'Out': Y_Grad_var}, + attrs={ + 'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward + }) + added_ops.append(scale_op) + main_block._sync_with_cpp() dims_mapping = ctx.get_tensor_dist_attr_for_program( Y_Grad_var).dims_mapping process_mesh = dist_attr.process_mesh - for op in [allreduce_op, scale_op]: + for op in added_ops: op_attr = OperatorDistributedAttribute() op_attr.process_mesh = process_mesh op_attr.set_output_dims_mapping(Y_Grad_var.name, dims_mapping) @@ -462,7 +465,6 @@ def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) - startup_block._sync_with_cpp() class DistributedMatmul(DistributedOperatorImplContainer): @@ -617,6 +619,7 @@ def forward(ctx, *args, **kwargs): 'ring_id': group.id, 'use_calc_stream': True, 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if intermediate_var_0.shape != ref_shape_x: intermediate_var_0.desc.set_shape(ref_shape_x) @@ -629,6 +632,7 @@ def forward(ctx, *args, **kwargs): 'transpose_X': False, 'transpose_Y': False, 'alpha': 1, + OP_ROLE_KEY: src_op.attr('op_role') } inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} matmul_op = main_block.append_op(type='matmul', @@ -814,6 +818,7 @@ def forward(ctx, *args, **kwargs): 'transpose_X': False, 'transpose_Y': False, 'alpha': 1, + OP_ROLE_KEY: src_op.attr('op_role') } inputs = {'X': X_var, 'Y': Weight_var} @@ -853,7 +858,8 @@ def forward(ctx, *args, **kwargs): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - 'use_model_parallel': True + 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if Out_var.shape != ref_shape: Out_var.desc.set_shape(ref_shape) @@ -1137,6 +1143,7 @@ def forward(ctx, *args, **kwargs): 'ring_id': group.id, 'use_calc_stream': True, 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role'), }) if intermediate_var_0.shape != ref_shape_x: intermediate_var_0.desc.set_shape(ref_shape_x) @@ -1145,7 +1152,11 @@ def forward(ctx, *args, **kwargs): ['float16', 'float32', 'float64'], 'linear') check_dtype(intermediate_var_0.dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') - attrs = {'trans_x': False, 'trans_y': False} + attrs = { + 'trans_x': False, + 'trans_y': False, + OP_ROLE_KEY: src_op.attr('op_role') + } inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} matmul_v2_op = main_block.append_op(type='matmul_v2', inputs=inputs, @@ -1322,7 +1333,11 @@ def forward(ctx, *args, **kwargs): 'linear') check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') - attrs = {'trans_x': False, 'trans_y': False} + attrs = { + 'trans_x': False, + 'trans_y': False, + OP_ROLE_KEY: src_op.attr('op_role') + } inputs = {'X': X_var, 'Y': Weight_var} # infer out var shape with op dist attr @@ -1361,7 +1376,8 @@ def forward(ctx, *args, **kwargs): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - 'use_model_parallel': True + 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if
Out_var.shape != ref_shape: Out_var.desc.set_shape(ref_shape) @@ -1646,6 +1662,7 @@ def forward(ctx, *args, **kwargs): 'ring_id': group.id, 'use_calc_stream': True, 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if intermediate_var_0.shape != ref_shape_x: intermediate_var_0.desc.set_shape(ref_shape_x) @@ -1657,7 +1674,8 @@ def forward(ctx, *args, **kwargs): # attrs = {'trans_x': False, 'trans_y': False} attrs = { "x_num_col_dims": src_op.desc.attr("x_num_col_dims"), - "y_num_col_dims": src_op.desc.attr("y_num_col_dims") + "y_num_col_dims": src_op.desc.attr("y_num_col_dims"), + OP_ROLE_KEY: src_op.attr('op_role') } inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} mul_op = main_block.append_op(type='mul', @@ -1838,7 +1856,8 @@ def forward(ctx, *args, **kwargs): # attrs = {'trans_x': False, 'trans_y': False} attrs = { "x_num_col_dims": src_op.desc.attr("x_num_col_dims"), - "y_num_col_dims": src_op.desc.attr("y_num_col_dims") + "y_num_col_dims": src_op.desc.attr("y_num_col_dims"), + OP_ROLE_KEY: src_op.attr('op_role') } inputs = {'X': X_var, 'Y': Weight_var} @@ -1878,7 +1897,8 @@ def forward(ctx, *args, **kwargs): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - 'use_model_parallel': True + 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if Out_var.shape != ref_shape: Out_var.desc.set_shape(ref_shape) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py index 4629e4bef930e..7eea4bea49f35 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -248,7 +248,7 @@ def forward(ctx, *args, **kwargs): # rename input kwargs['X'] = [allgather_out.name] # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): @@ -260,8 +260,6 @@ def forward(ctx, *args, **kwargs): allgather_out.name, allgather_out_dist_attr.dims_mapping) ctx.set_op_dist_attr_for_program(pnorm_op, op_dist_attr) - main_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): @@ -305,7 +303,7 @@ def backward(ctx, *args, **kwargs): new_X_var_dist_attr = ctx.get_tensor_dist_attr_for_program(new_X_var) ctx.set_tensor_dist_attr_for_program(new_X_grad, new_X_var_dist_attr) # replicate op in dist program with new kwargs - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(backward_op.desc) # Refer to the related dist op set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) @@ -319,7 +317,6 @@ def backward(ctx, *args, **kwargs): op_dist_attr.set_output_dims_mapping(new_X_grad.name, new_X_var_dist_attr.dims_mapping) ctx.set_op_dist_attr_for_program(p_norm_grad_op, op_dist_attr) - main_block._sync_with_cpp() # 2. 
insert slice op process_mesh_shape = op_dist_attr.process_mesh.topology @@ -359,7 +356,6 @@ def backward(ctx, *args, **kwargs): slice_op_dist_attr.set_output_dims_mapping(X_grad_var.name, X_grad_var_dims_mapping) ctx.set_op_dist_attr_for_program(slice_op, slice_op_dist_attr) - main_block._sync_with_cpp() register_distributed_operator_impl("p_norm", diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py index 6d750562c96d9..bdd105ef64c30 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py @@ -109,14 +109,13 @@ def forward(ctx, *args, **kwargs): output_name) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): dist_op_desc.set_input(input_name, kwargs[input_name]) for output_name in src_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) - main_block._sync_with_cpp() # batch dimension synchronization var_name = src_op.output_arg_names[0] diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index 47a783a5f6d71..790e97cf4e170 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -177,7 +177,7 @@ def forward(ctx, *args, **kwargs): idx] = shape_list[idx] // process_mesh_shape[axis] # create op - new_op_desc = main_block.desc.append_op() + new_op_desc = main_block.append_op(type='nop').desc new_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) @@ -187,8 +187,6 @@ def forward(ctx, *args, **kwargs): new_op_desc.set_output('Out', [Out_var.name]) new_op_desc._set_attr('shape', shape_list) - main_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): DistributedDefaultImpl0.backward(ctx, *args, **kwargs) @@ -335,7 +333,7 @@ def forward(ctx, *args, **kwargs): idx] = shape_list[idx] // process_mesh_shape[axis] # create op - new_op_desc = main_block.desc.append_op() + new_op_desc = main_block.append_op(type='nop').desc new_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) @@ -345,8 +343,6 @@ def forward(ctx, *args, **kwargs): new_op_desc.set_output('Out', [Out_var.name]) new_op_desc._set_attr('shape', shape_list) - main_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): DistributedDefaultImpl0.backward(ctx, *args, **kwargs) @@ -486,7 +482,7 @@ def forward(ctx, *args, **kwargs): idx] = shape_list[idx] // process_mesh_shape[axis] # create op - new_op_desc = main_block.desc.append_op() + new_op_desc = main_block.append_op(type='nop').desc new_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) @@ -496,8 +492,6 @@ def forward(ctx, *args, **kwargs): new_op_desc.set_output('Out', [Out_var.name]) new_op_desc._set_attr('shape', shape_list) - main_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): 
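+ # No specialized backward is needed here; the op falls back to the default replicated implementation.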
DistributedDefaultImpl0.backward(ctx, *args, **kwargs) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py index 9666f882200e5..cbbcaef5ee47f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py @@ -127,12 +127,11 @@ def backward(ctx, *args, **kwargs): filter_vars.append(varname) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(backward_op.desc) set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) dist_op_desc.set_input('X', filter_vars) dist_op_desc.set_output('Out', filter_vars) - main_block._sync_with_cpp() register_distributed_operator_impl( diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index dce3908e75a62..005e51dfce723 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -13,11 +13,16 @@ # limitations under the License. import copy +import time +import logging from collections import defaultdict +import paddle from paddle.fluid import program_guard from paddle.fluid.backward import append_backward +from paddle.fluid.framework import _non_static_mode from paddle.distributed.passes import new_pass +from paddle.distributed.utils import get_logger from .reshard import Resharder from .partitioner import Partitioner @@ -39,6 +44,7 @@ def __init__(self, mode, completer, dist_context): assert self._dist_context._is_initialized self._pass_context = self._dist_context.pass_context self._strategy = self._dist_context.strategy + self._logger = get_logger(logging.INFO) def parallel_all(self): world_process_group = get_world_process_group() @@ -59,38 +65,65 @@ def parallel(self, rank): serial_startup_program, serial_loss) # Apply pre optimization passes + time0 = time.time() self._apply_pre_optimization(serial_main_program, serial_startup_program, serial_loss, serial_optimizer, params_grads) - + self._logger.info( + "within parallel apply_pre_optimization time: {}, mode {}". + format(time.time() - time0, self._mode)) # Do logical partition + time0 = time.time() partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( serial_main_program, serial_startup_program, params_grads) + self._logger.info( + "within parallel partitioner time: {}, mode {}".format( + time.time() - time0, self._mode)) # Generate optimizer + time0 = time.time() self._generate_optimizer(dist_main_prog, dist_startup_prog, serial_optimizer, dist_params_grads) + self._logger.info( + "within parallel optimizer time: {}, mode {}".format( + time.time() - time0, self._mode)) # Do reshard process + time0 = time.time() set_grad_var_shape(dist_main_prog, self._dist_context) resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, dist_params_grads) resharder.reshard() + self._logger.info( + "within parallel reshard time: {}, mode {}".format( + time.time() - time0, self._mode)) # Apply post optimization passes + time0 = time.time() self._apply_post_optimization(dist_main_prog, dist_startup_prog, rank, dist_params_grads) + self._logger.info( + "within parallel apply_post_optimization time: {}, mode {}". 
+ format(time.time() - time0, self._mode)) else: # Apply pre optimization passes # self._apply_pre_optimization(serial_main_program, # serial_startup_program, None, None, # None) # Do logical partition + time0 = time.time() partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( serial_main_program, serial_startup_program, []) # Do reshard process + self._logger.info( + "within parallel partitioner time: {}, mode {}".format( + time.time() - time0, self._mode)) + time0 = time.time() resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, [], 1) resharder.reshard() + self._logger.info( + "within parallel reshard time: {}, mode {}".format( + time.time() - time0, self._mode)) # Clone program for test if self._mode != 'train': dist_main_prog = dist_main_prog.clone(for_test=True) @@ -110,9 +143,14 @@ def _generate_backward(self, main_program, startup_program, loss): def _generate_optimizer(self, main_program, startup_program, optimizer, params_grads): + if self._dist_context._dygraph_mode: + paddle.disable_static() + optimizer = copy.deepcopy(optimizer) + paddle.enable_static() + else: + optimizer = copy.deepcopy(optimizer) with program_guard(main_program, startup_program): - optimizer_ops = copy.deepcopy(optimizer).apply_gradients( - params_grads) + optimizer_ops = optimizer.apply_gradients(params_grads) self._completer.complete_update_annotation(main_program) return optimizer_ops diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 9056ab34fa711..97ff881ef95bf 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -264,10 +264,12 @@ def partition_block(self, ref_block, target_block): self._dist_context, **kinputs, **koutputs, **{"grad_var_to_var": grad_var_to_var}) elif is_optimize_op(op): + # NOTE: BACKWARD_ONLY_DIST_OPS's op_role must 2 because of 1F1B PASS kinputs, koutputs = dist_op_context.prepare_context(op) - dist_op_impl = get_distributed_operator_impl_container( - "default").get_impl(0) - dist_op_impl.backward(self._dist_context, **kinputs, **koutputs) + dist_op_opt_impl = _get_dist_op_backward_implement( + op, self._dist_context, forward_op_id2forward_op) + dist_op_opt_impl.backward(self._dist_context, **kinputs, + **koutputs) else: raise NotImplementedError( "partitioner only support forward and backward, optimize ops, but got {}" diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index d583dcb32eb22..74cb6930e0392 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License +from collections import OrderedDict + import paddle import paddle.fluid.core as core from ..collective import _get_global_env @@ -130,16 +132,23 @@ def instantiate(self): else: assert False, ("No CUDA device found") - # TODO(shenliang03): This is a temporary solution to solve the problem of - # hang caused by cross-creation of new_group - tmp = paddle.to_tensor( - [1], dtype="int32") if _non_static_mode() else fill_constant( - [0], dtype="int32", value="1") - paddle.distributed.all_reduce(tmp, use_calc_stream=True) - paddle.distributed.wait(tmp) + # TODO(shenliang03): This is a temporary 
solution to solve the problem of + # hang caused by cross-creation of new_group + paddle.framework._in_legacy_dygraph() + paddle.set_device('gpu:%d' % + paddle.distributed.ParallelEnv().dev_id) + tmp = paddle.to_tensor( + [1], dtype="int32") if _non_static_mode() else fill_constant( + [0], dtype="int32", value="1") + paddle.distributed.all_reduce(tmp, use_calc_stream=True, group=self) + paddle.distributed.wait(tmp, group=self) + paddle.enable_static() self._is_instantiate = True + def is_member(self): + return True + # def __eq__(self, other): # if not isinstance(other, ProcessGroup): # return False @@ -158,5 +167,5 @@ def __str__(self): # Note that Process group 0 is reserved for representing all ranks. # At the beginning, group 0 is empty and new ranks will be added automatically. -_g_process_group_map = {} +_g_process_group_map = OrderedDict() _g_process_group_map[0] = ProcessGroup(0, []) diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index e220b654e700a..c4f9ad8b6bc84 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1065,7 +1065,7 @@ def set_grad_var_shape(program, dist_context): "softmax", "cross_entropy2", "dropout", "tanh", ["slice_grad", "c_allgather"], "assign", "matmul_v2_grad_grad", "elementwise_add_grad_grad", "shape", "sqrt", - "fused_softmax_mask_upper_triangle_grad" + "fused_softmax_mask_upper_triangle" ] if op.type in need_set_shape_list: for forward_op in block.ops: @@ -1096,11 +1096,9 @@ def set_grad_var_shape(program, dist_context): def is_forward_op(op): - ref_role1 = int(core.op_proto_and_checker_maker.OpRole.Forward) - ref_role2 = int(core.op_proto_and_checker_maker.OpRole.Loss) op_role = int(op.attr('op_role')) - return OP_ROLE_KEY in op.attr_names and (op_role == ref_role1 - or op_role == ref_role2) + return OP_ROLE_KEY in op.attr_names and (op_role == int(OpRole.Forward) + or op_role == int(OpRole.Loss)) def is_backward_op(op): @@ -1113,9 +1111,14 @@ def is_optimize_op(op): int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) +def is_lr_sched_op(op): + return OP_ROLE_KEY in op.attr_names and \ + int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize.LRSched) + + def is_loss_op(op): return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) == (int(core.op_proto_and_checker_maker.OpRole.Forward) | int(core.op_proto_and_checker_maker.OpRole.Loss)) + int(op.all_attrs()[OP_ROLE_KEY]) == (int(OpRole.Forward) | int(OpRole.Loss)) def is_prim_op(op): diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index cb634a4b6ac1a..62b18298f11e0 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -36,6 +36,7 @@ import paddle.fluid.core as core from paddle import _C_ops import paddle.fluid.dygraph_utils as dygraph_utils +import contextlib __all__ = [] @@ -136,6 +137,10 @@ def _get_global_env(): # Dict[name, Group] _group_map_by_name = {} +# backend map by group : the map of all backend from their groups +# Dict[group, backend] +_group_map_backend = {} + # Name of the default group for init_parallel_env _default_group_name = "_default_pg" @@ -175,9 +180,8 @@ def _get_group_map_by_name(): def _get_default_group(): global _group_map_by_name - assert _default_group_name in _group_map_by_name, ( - "Call paddle.distributed.init_parallel_env first " - "to initialize the distributed environment.") + assert is_initialized(), ("Call 
paddle.distributed.init_parallel_env first " + "to initialize the distributed environment.") return _get_group_map_by_name()[_default_group_name] @@ -193,10 +197,29 @@ def _set_group_map_by_name(name, group): _group_map_by_name[name] = group +def _set_group_map_backend(group, backend): + global _group_map_backend + assert group not in _group_map_backend + _group_map_backend[group] = backend + + def _new_ring_id(): return len(_get_group_map()) + max(_get_global_env().nrings, 9) +def _get_reduce_op(reduce_op, func_name): + if reduce_op == ReduceOp.SUM: + return core.ReduceOp.SUM + elif reduce_op == ReduceOp.MAX: + return core.ReduceOp.MAX + elif reduce_op == ReduceOp.MIN: + return core.ReduceOp.MIN + elif reduce_op == ReduceOp.PROD: + return core.ReduceOp.PRODUCT + else: + raise ValueError("Unknown reduce_op type for {}.".format(func_name)) + + def get_group(id=0): """ @@ -400,6 +423,7 @@ def new_group(ranks=None, backend=None): group = Group(rank, size, id=gid, ranks=ranks, pg=pg, name=group_name) _group_map_by_name[group_name] = group _group_map[gid] = group + _group_map_backend[group] = backend # TODO(shenliang03): This is a temporary solution to solve the problem of # hang caused by tcp @@ -462,6 +486,75 @@ def new_group(ranks=None, backend=None): return gp +def is_initialized(): + """ + + Check whether the distributed environment has been initialized + + Returns (bool): `True` if distributed environment has been initialized, otherwise `False`. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + + print(paddle.distributed.is_initialized()) + # False + + paddle.distributed.init_parallel_env() + print(paddle.distributed.is_initialized()) + # True + + """ + global _group_map_by_name + return _default_group_name in _group_map_by_name + + +def destroy_process_group(group=None): + """ + Destroy a given group for communication + + Args: + group (ProcessGroup, optional): The group to be destroyed. All of process groups, including + the default group, will be destroyed and the distributed + environment will be deinitialized. + + Returns : None + + Examples: + .. code-block:: python + + # required: distributed + import paddle + + paddle.distributed.init_parallel_env() + group = paddle.distributed.new_group([0, 1]) + + paddle.distributed.destroy_process_group(group) + print(paddle.distributed.is_initialized()) + # True + paddle.distributed.destroy_process_group() + print(paddle.distributed.is_initialized()) + # False + + """ + global _group_map + global _group_map_by_name + + pg = _get_default_group() if group is None else group + assert _group_map.get(pg.id, None) is not None, "Invalid group." 
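+ # With no group given, every group is destroyed and the global maps are cleared; otherwise only the given group's entries are removed.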
+ + if group is None: + _group_map.clear() + _group_map_by_name.clear() + _group_map_backend.clear() + else: + del _group_map[pg.id] + del _group_map_by_name[pg.name] + del _group_map_backend[pg] + + def wait(tensor, group=None, use_calc_stream=True): """ @@ -663,16 +756,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): return if in_dygraph_mode(): - if op == ReduceOp.SUM: - op_type = core.ReduceOp.SUM - elif op == ReduceOp.MAX: - op_type = core.ReduceOp.MAX - elif op == ReduceOp.MIN: - op_type = core.ReduceOp.MIN - elif op == ReduceOp.PROD: - op_type = core.ReduceOp.PRODUCT - else: - raise ValueError("Unknown reduce_op type for allreduce.") + op_type = _get_reduce_op(op, "all_reduce") group = _get_default_group() if group is None else group task = group.process_group.allreduce(tensor, op_type) if use_calc_stream: @@ -768,16 +852,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): return if in_dygraph_mode(): - if op == ReduceOp.SUM: - op_type = core.ReduceOp.SUM - elif op == ReduceOp.MAX: - op_type = core.ReduceOp.MAX - elif op == ReduceOp.MIN: - op_type = core.ReduceOp.MIN - elif op == ReduceOp.PROD: - op_type = core.ReduceOp.PRODUCT - else: - raise ValueError("Unknown reduce_op type for reduce.") + op_type = _get_reduce_op(op, "reduce") group = _get_default_group() if group is None else group gdst = group.get_group_rank(dst) assert gdst >= 0, ("dst rank out of group, need global rank") @@ -1781,10 +1856,10 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): Args: in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type should be float16, float32, float64, int32 or int64. - out_tensor_list (Tensor): A list of output Tensors. The data type of its elements should be the same as the + out_tensor_list (list): A list of output Tensors. The data type of its elements should be the same as the data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. + use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True. Returns: None. @@ -1867,6 +1942,98 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): out_tensor_list.extend(paddle.split(out, nranks, 0)) +def alltoall_single(in_tensor, + out_tensor, + in_split_sizes=None, + out_split_sizes=None, + group=None, + use_calc_stream=True): + """ + Scatter a single input tensor to all participators and gather the received tensors in out_tensor. + + .. note:: + ``alltoall_single`` is only supported in eager mode. + + Args: + in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32 or int64. + out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor. + in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` + must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. + out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor`` + must be divisible by group size and ``out_tensor`` will be gathered averagely from all participators. Default: None. 
+ group (Group, optional): The group instance return by ``new_group`` or None for global default group. Default: None. + use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True. + + Returns: + None, if ``use_calc_stream`` is set to ``True``; ``Task`` of ``group``, if ``use_calc_stream`` is set to ``False``. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + size = dist.get_world_size() + + # case 1 + input = paddle.arange(2, dtype='int64') + rank * 2 + # input for rank 0: [0, 1] + # input for rank 1: [2, 3] + + output = paddle.empty([2], dtype='int64') + dist.alltoall_single(input, output) + # output for rank 0: [0, 2] + # output for rank 1: [1, 3] + + # case 2 + in_split_sizes = [i + 1 for i in range(size)] + # in_split_sizes for rank 0: [1, 2] and for rank 1: [1, 2] + out_split_sizes = [rank + 1 for i in range(size)] + # out_split_sizes for rank 0: [1, 1] and for rank 1: [2, 2] + + input = paddle.ones([sum(in_split_sizes), size], dtype='float32') * rank + # input for rank 0: [[0., 0.], [0., 0.], [0., 0.]] + # input for rank 1: [[1., 1.], [1., 1.], [1., 1.]] + output = paddle.empty([(rank + 1) * size, size], dtype='float32') + + group = dist.new_group([0, 1]) + task = dist.alltoall_single(input, + output, + in_split_sizes, + out_split_sizes, + use_calc_stream=False, + group=group) + task.wait() + # output for rank 0: [[0., 0.], [1., 1.]] + # output for rank 1: [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] + + """ + if group is not None and not group.is_member(): + return + + assert in_dygraph_mode(), "Only suppport alltoall_single in eager mode." + # _check_single_tensor + + group = _get_default_group() if group is None else group + in_split_sizes = [] if in_split_sizes is None else in_split_sizes + out_split_sizes = [] if out_split_sizes is None else out_split_sizes + + task = group.process_group.alltoall_single(in_tensor, out_tensor, + in_split_sizes, out_split_sizes) + if use_calc_stream: + task.wait() + return + else: + return task + + +def _get_group_rank(global_rank, group=None): + return global_rank if group is None else group.get_group_rank(global_rank) + + def send(tensor, dst=0, group=None, use_calc_stream=True): """ Send a tensor to the receiver. @@ -1899,7 +2066,7 @@ def send(tensor, dst=0, group=None, use_calc_stream=True): """ if group is not None and not group.is_member(): return - + dst = _get_group_rank(dst, group) if in_dygraph_mode(): group = _get_default_group() if group is None else group task = group.process_group.send(tensor, dst) @@ -1962,6 +2129,7 @@ def recv(tensor, src=0, group=None, use_calc_stream=True): if group is not None and not group.is_member(): return + src = _get_group_rank(src, group) if in_dygraph_mode(): group = _get_default_group() if group is None else group task = group.process_group.recv(tensor, src) @@ -1991,3 +2159,390 @@ def recv(tensor, src=0, group=None, use_calc_stream=True): 'dtype': tensor.dtype, 'use_calc_stream': use_calc_stream, }) + + +def _check_single_tensor(tensor, tensor_name): + if not isinstance(tensor, (core.eager.Tensor, paddle.Tensor)): + raise RuntimeError("Invalid function argument. 
Expected parameter {}" + " to be of type paddle.Tensor, but it's {}".format( + tensor_name, type(tensor))) + + +def _check_tensor_list(tensor_list, tensor_name): + if not isinstance(tensor_list, list) or \ + not all(isinstance(t, (core.eager.Tensor, paddle.Tensor)) for t in tensor_list): + raise RuntimeError("Invalid function argument. Expected parameter {}" + " to be of type paddle.Tensor".format(tensor_name)) + + +def isend(tensor, dst, group=None): + """ + Send a tensor to the receiver asynchronously. + + Args: + tensor (Tensor): The Tensor to send. Its data type + should be float16, float32, float64, int32 or int64. + dst (int): The destination rank. + group (Group, optional): The group instance returned by new_group or None for global default group. Default: None. + + Returns: + A distributed task object. + + Warning: + This API only supports the dygraph mode. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + data = paddle.to_tensor([7, 8, 9]) + task = paddle.distributed.isend(data, dst=1) + else: + data = paddle.to_tensor([1, 2, 3]) + task = paddle.distributed.irecv(data, src=0) + + task.wait() + + print(data) + # paddle.tensor([7, 8, 9]) # Rank-0 + # paddle.tensor([7, 8, 9]) # Rank-1 + + """ + _check_single_tensor(tensor, "tensor") + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + group = _get_default_group() if group is None else group + group_dst_rank = group.get_group_rank(dst) + assert group_dst_rank >= 0, ("dst rank out of group, need global rank") + return group.process_group.send(tensor, group_dst_rank) + else: + raise RuntimeError("Don't support static graph mode currently.") + + +def irecv(tensor, src=None, group=None): + """ + Receive a tensor from the sender asynchronously. + + Args: + tensor (Tensor): The Tensor to receive. Its data type + should be float16, float32, float64, int32 or int64. + src (int): The source rank id. + group (Group, optional): The group instance returned by new_group or None for global default group. Default: None. + + Returns: + A distributed task object. + + Warning: + This API only supports the dygraph mode. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + data = paddle.to_tensor([7, 8, 9]) + task = paddle.distributed.isend(data, dst=1) + else: + data = paddle.to_tensor([1, 2, 3]) + task = paddle.distributed.irecv(data, src=0) + + task.wait() + + print(data) + # paddle.tensor([7, 8, 9]) # Rank-0 + # paddle.tensor([7, 8, 9]) # Rank-1 + """ + _check_single_tensor(tensor, "tensor") + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + group = _get_default_group() if group is None else group + group_src_rank = group.get_group_rank(src) + assert group_src_rank >= 0, ("src rank out of group, need global rank") + return group.process_group.recv(tensor, group_src_rank) + else: + raise RuntimeError("Don't support static graph mode currently.") + + +class P2POp(object): + """ + A class that describes the point-to-point operations for "batch_isend_irecv". + + This class specifies the type of P2P operation, the communication buffer, the peer rank, + and the group. Instances of this class will be passed to + ``paddle.distributed.batch_isend_irecv`` for point-to-point communication.
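# Editor's note: the all_reduce/reduce/reduce_scatter hunks in this file replace
# repeated if/elif chains with a shared _get_reduce_op helper whose body is not
# shown in this patch. The sketch below is an assumption of what such a helper
# plausibly looks like, built only from the branches that were removed
# (ReduceOp.* -> core.ReduceOp.*); the name _get_reduce_op_sketch is illustrative.
from paddle.distributed import ReduceOp
from paddle.fluid import core

def _get_reduce_op_sketch(reduce_op, func_name):
    # Map the public ReduceOp enum onto the core enum consumed by the
    # process-group backend; unknown values raise, as the old branches did.
    mapping = {
        ReduceOp.SUM: core.ReduceOp.SUM,
        ReduceOp.MAX: core.ReduceOp.MAX,
        ReduceOp.MIN: core.ReduceOp.MIN,
        ReduceOp.PROD: core.ReduceOp.PRODUCT,
    }
    if reduce_op not in mapping:
        raise ValueError("Unknown reduce_op type for {}.".format(func_name))
    return mapping[reduce_op]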
+ + Args: + op (callable): A function to send data to or receive data from a peer process. + The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``. + tensor (Tensor): Tensor to send or receive. + peer (int): The destination or source rank. + group (Group, optional): The group instance return by new_group or None for global + default group. Default: None. + + """ + + def __init__(self, op, tensor, peer, group=None): + if op not in [isend, irecv]: + raise RuntimeError("Invalid ``op`` function. Expected ``op`` " + "to be of type ``paddle.distributed.isend`` or " + "``paddle.distributed.irecv``.") + _check_single_tensor(tensor, "tensor") + + self.op = op + self.tensor = tensor + self.peer = peer + self.group = _get_default_group() if group is None else group + + +@contextlib.contextmanager +def _with_batch_p2p_guard(backend): + if backend == "nccl": + core.ProcessGroupNCCL.group_start() + try: + yield + finally: + if backend == "nccl": + core.ProcessGroupNCCL.group_end() + + +def _check_p2p_op_list(p2p_op_list): + """ + Helper to check that the ``p2p_op_list`` is a list of P2POp instances and + all ops use the same backend. + """ + if not isinstance(p2p_op_list, list) or not all( + isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list): + raise RuntimeError("Invalid ``p2p_op_list``. Each op is expected to " + "to be of type ``paddle.distributed.P2POp``.") + + backend = _group_map_backend[p2p_op_list[0].group] + if not all(backend == _group_map_backend[p2p_op.group] + for p2p_op in p2p_op_list): + raise RuntimeError("All groups need to use the same backend.") + + +def batch_isend_irecv(p2p_op_list): + """ + Send or Receive a batch of tensors asynchronously and return a list of requests. + + Process each of the point-to-point operations in ``p2p_op_list`` and return the + corresponding tasks. NCCL are currently supported. + + Args: + p2p_op_list: A list of point-to-point operations(type of each operator is + ``paddle.distributed.P2POp``). The order of the isend/irecv in the list + matters and it needs to match with corresponding isend/irecv on the + remote end. + + Returns: + A list of distributed tasks returned by calling the corresponding + op in the op_list. + + Warning: + This API only supports the dygraph mode. + + Examples: + .. 
code-block:: python + + # required: distributed + + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + send_t = paddle.arange(2) + rank + # paddle.tensor([0, 1]) # Rank-0 + # paddle.tensor([1, 2]) # Rank-1 + + recv_t = paddle.empty(shape=[2], dtype=send_t.dtype) + + send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size) + recv_op = dist.P2POp(dist.irecv, recv_t, (rank - 1 + world_size) % world_size) + + tasks = dist.batch_isend_irecv([send_op, recv_op]) + + for task in tasks: + task.wait() + + print(recv_t) + # paddle.tensor([1, 2]) # Rank-0 + # paddle.tensor([0, 1]) # Rank-1 + """ + _check_p2p_op_list(p2p_op_list) + group = p2p_op_list[0].group + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + group = _get_default_group() if group is None else group + backend = _group_map_backend[group] + tasks = [] + with _with_batch_p2p_guard(backend): + for p2p_op in p2p_op_list: + op = p2p_op.op + tensor = p2p_op.tensor + peer = p2p_op.peer + comm_group = p2p_op.group + task = op(tensor, peer, comm_group) + if task is not None: + tasks.append(task) + return tasks + else: + raise RuntimeError("Don't support static graph mode currently.") + + +def reduce_scatter(tensor, + tensor_list, + op=ReduceOp.SUM, + group=None, + use_calc_stream=True): + """ + Reduces, then scatters a list of tensors to all processes in a group + + Args: + tensor (Tensor): Output tensor. + tensor_list (list[Tensor]): List of tensors to reduce and scatter. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. + group (Group, optional): The group instance return by new_group or None for global + default group. Default: None. + use_calc_stream (bool, optional): Whether this op should be an async op. + + Returns: + Async task handle, if use_calc_stream is set to False. + None, if use_calc_stream or if not part of the group. + + Warning: + This API only supports the dygraph mode. + + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + t1 = paddle.to_tensor([0, 1]) + t2 = paddle.to_tensor([2, 3]) + else: + t1 = paddle.to_tensor([4, 5]) + t2 = paddle.to_tensor([6, 7]) + + tensor_list = [t1, t2] + + output = paddle.empty(shape=[2], dtype=tensor_list[0].dtype) + dist.reduce_scatter(output, tensor_list) + + print(output) + # [4, 6] # Rank-0 + # [8, 10] # Rank-1 + + """ + _check_single_tensor(tensor, "tensor") + _check_tensor_list(tensor_list, "tensor_list") + + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + op_type = _get_reduce_op(op, "reduce_scatter") + group = _get_default_group() if group is None else group + + temp = paddle.concat(tensor_list, axis=0) + task = group.process_group._reduce_scatter_base(tensor, temp, op_type) + if use_calc_stream: + task.wait() + return None + else: + return task + else: + raise RuntimeError("Don't support static graph mode currently.") + + +def _reduce_scatter_base(output, + input, + op=ReduceOp.SUM, + group=None, + use_calc_stream=True): + """ + Reduces, then scatters a flattened tensor to all processes in a group. + + Args: + output (Tensor): Output tensor. 
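# Editor's note: a single-process illustration (plain numpy, not distributed
# code) of what the reduce_scatter example above computes: the per-rank tensor
# lists are summed element-wise across ranks, then rank i keeps the i-th chunk.
import numpy as np

rank0_list = [np.array([0, 1]), np.array([2, 3])]   # tensor_list on rank 0
rank1_list = [np.array([4, 5]), np.array([6, 7])]   # tensor_list on rank 1

reduced = [a + b for a, b in zip(rank0_list, rank1_list)]  # ReduceOp.SUM across ranks
print(reduced[0])  # what rank 0 receives: [4 6]
print(reduced[1])  # what rank 1 receives: [ 8 10]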
+ input (Tensor): Input tensor that is of size output tensor size times world size + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream (False). + Default to True. + Returns: + Async task handle, if use_calc_stream is set to False. + None, if use_calc_stream or if not part of the group. + + Examples: + .. code-block:: python + + # required: distributed + + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + input = paddle.arange(4) + rank + # [0, 1, 2, 3] # Rank-0 + # [1, 2, 3, 4] # Rank-1 + + output = paddle.empty(shape=[2], dtype=input.dtype) + paddle.distributed.collective._reduce_scatter_base(output, input) + print(output) + # [1, 3] # Rank-0 + # [5, 7] # Rank-1 + + """ + _check_single_tensor(output, "output") + _check_single_tensor(input, "input") + + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + op_type = _get_reduce_op(op, "_reduce_scatter_base") + group = _get_default_group() if group is None else group + task = group.process_group._reduce_scatter_base(output, input, op_type) + if use_calc_stream: + task.wait() + return None + else: + return task + else: + raise RuntimeError("Don't support static graph mode currently.") diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 17c7f5a9bbc4a..6f917d9f89d6a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -54,25 +54,29 @@ def __init__(self): def _recv_shape_dtype(self, group): # recv len(shape) dims = paddle.to_tensor([0]) - paddle.distributed.recv(dims, src=0, group=group) + src_rank = group.ranks[0] + + paddle.distributed.recv(dims, src=src_rank, group=group) dims = dims.item() # recv shape shape = paddle.to_tensor([0] * dims) - paddle.distributed.recv(shape, src=0, group=group) + paddle.distributed.recv(shape, src=src_rank, group=group) # recv dtype dtype = paddle.to_tensor([0]) - paddle.distributed.recv(dtype, src=0, group=group) + paddle.distributed.recv(dtype, src=src_rank, group=group) # recv stop_gradient stop_grad = paddle.to_tensor([0]) - paddle.distributed.recv(stop_grad, src=0, group=group) + paddle.distributed.recv(stop_grad, src=src_rank, group=group) return shape.numpy().tolist(), dtype.item(), stop_grad.item() def recv_meta(self, group): tensor_type = paddle.to_tensor([0]) - paddle.distributed.recv(tensor_type, src=0, group=group) + src_rank = group.ranks[0] + + paddle.distributed.recv(tensor_type, src=src_rank, group=group) tensor_type = tensor_type.item() if tensor_type == 0: @@ -83,7 +87,7 @@ def recv_meta(self, group): elif tensor_type == 1: num = paddle.to_tensor([0]) - paddle.distributed.recv(num, src=0, group=group) + paddle.distributed.recv(num, src=src_rank, group=group) num = num.item() shapes = [] dtypes = [] @@ -101,34 +105,38 @@ def recv_meta(self, group): def _send_dims_shape_dtype(self, tensor, group): # send len(shape) dims = paddle.to_tensor(len(tensor.shape)) - paddle.distributed.send(dims, dst=1, group=group) + dst_rank = group.ranks[1] + + 
paddle.distributed.send(dims, dst=dst_rank, group=group) # send shape shape = paddle.to_tensor(tensor.shape) - paddle.distributed.send(shape, dst=1, group=group) + paddle.distributed.send(shape, dst=dst_rank, group=group) # send dtype dtype = paddle.to_tensor(paddle_2_number(tensor.dtype)) - paddle.distributed.send(dtype, dst=1, group=group) + paddle.distributed.send(dtype, dst=dst_rank, group=group) # send trainable stop_grad = paddle.to_tensor(int(tensor.stop_gradient)) - paddle.distributed.send(stop_grad, dst=1, group=group) + paddle.distributed.send(stop_grad, dst=dst_rank, group=group) def send_meta(self, tensor, group): + dst_rank = group.ranks[1] + if isinstance(tensor, (paddle.Tensor, core.eager.Tensor)): tensor_type = paddle.to_tensor([0]) # send tensor type - paddle.distributed.send(tensor_type, dst=1, group=group) + paddle.distributed.send(tensor_type, dst=dst_rank, group=group) self._send_dims_shape_dtype(tensor, group) elif isinstance(tensor, tuple): tensor_type = paddle.to_tensor([1]) # send tensor type - paddle.distributed.send(tensor_type, dst=1, group=group) + paddle.distributed.send(tensor_type, dst=dst_rank, group=group) nums = paddle.to_tensor(len(tensor)) - paddle.distributed.send(nums, dst=1, group=group) + paddle.distributed.send(nums, dst=dst_rank, group=group) for d in tensor: assert isinstance(d, (paddle.Tensor, core.eager.Tensor)) @@ -166,6 +174,7 @@ def send_partial(tensor, rank_id=0, group=None, use_calc_stream=True): + # dst: local rank in group if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id @@ -176,7 +185,7 @@ def send_partial(tensor, dst, 'num', nranks, 'id', rank_id) else: return paddle.distributed.send(tensor.detach(), - dst=dst, + dst=group.ranks[dst], group=group, use_calc_stream=use_calc_stream) @@ -187,6 +196,7 @@ def recv_partial(tensor, rank_id=0, group=None, use_calc_stream=True): + # src: local rank in group if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id @@ -198,7 +208,7 @@ def recv_partial(tensor, tensor.shape) else: paddle.distributed.recv(tensor.detach(), - src=src, + src=group.ranks[src], group=group, use_calc_stream=use_calc_stream) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py index 39e92f8878028..f13739960b38a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -210,9 +210,10 @@ def _grad_scale(self): scale=self._world_size_scaling) # Scale grads of params - for param in self._trainable_params: - if param.name in self._param_grads and param.grad is not None: - param.grad.scale_(scale=self._world_size_scaling) + with paddle.no_grad(): + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) # param._reset_grad_inplace_version(True) # Scale grads of master params with offload strategy diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index 902c8189b1720..3e8f0de3e69d5 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -76,6 +76,10 @@ def is_legacy_mode(self): def get_envs(self): return self.envs.copy() + def set_envs(self, env={}): + env = {k: v for 
k, v in env.items() if isinstance(v, str)} + self.envs.update(env) + def _enable_plugin(self): for pl in plugins.enabled_plugins: pl(self) diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 1f43679d748f1..bc628be59dc22 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -49,6 +49,8 @@ def __init__(self, ctx): jid=self.ctx.args.job_id) self.pod = Pod() + self.ctx.set_envs({"POD_NAME": self.pod.name}) + self.join_server = None def deploy_pod(self): @@ -104,17 +106,18 @@ def watch(self) -> bool: self.ctx.logger.info("Pod {}".format(status)) self.ctx.logger.error("Container failed !!!\n{}".format(fc[0])) fc[0].tail() - self.pod.stop() if self.ctx.args.elastic_level <= 0: + self.pod.stop(timeout=3) return True else: + self.pod.stop(timeout=30) return False # peer failure if self.ctx.status.is_restarting( ) and self.master.get_status() != self.ctx.status.COMPLETED: - self.pod.stop() + self.pod.stop(timeout=30) return False def stop(self, sigint=None): @@ -123,7 +126,7 @@ def stop(self, sigint=None): self.watcher.stop() self.master.stop() - self.pod.stop(sigint) + self.pod.stop(timeout=30) def finalize(self): self.pod.join() @@ -133,17 +136,16 @@ def finalize(self): sys.exit(self.pod.exit_code) def signal_handler(self, sigint, frame): - self.ctx.logger.info("Terminating with signal {}".format(sigint)) - if hasattr(self, 'sigint'): self.ctx.logger.info("Force quit in 10 seconds...") - time.sleep(11) + self.pod.stop(timeout=10) sys.exit(sigint) + self.ctx.logger.info("Terminating with signal {}".format(sigint)) + self.sigint = sigint self.ctx.status.done() - self.stop(sigint) - time.sleep(1) + self.stop(sigint=sigint) self.ctx.logger.info("Exit with signal {}".format(sigint)) sys.exit(sigint) diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py index 8e8d31f86dd9f..825be9c36888c 100644 --- a/python/paddle/distributed/launch/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -316,5 +316,5 @@ def get_status(self): def stop(self): if hasattr(self, 'beat_thread'): self.ctx.status.done() - # TODO(kuizhiqing) thread should exit + # daemon thread #self.beat_thread.join() diff --git a/python/paddle/distributed/launch/controllers/watcher.py b/python/paddle/distributed/launch/controllers/watcher.py index 6e8a2cc4e8781..4b8e346e7908f 100644 --- a/python/paddle/distributed/launch/controllers/watcher.py +++ b/python/paddle/distributed/launch/controllers/watcher.py @@ -93,4 +93,6 @@ def _save_gpu_log(self, util_key): def stop(self): if hasattr(self, "proc"): - self.proc.join() + # daemon without join + # self.proc.join() + pass diff --git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py index 8f515d9e6f38b..e0f580da0ac45 100644 --- a/python/paddle/distributed/launch/job/container.py +++ b/python/paddle/distributed/launch/job/container.py @@ -131,7 +131,11 @@ def terminate(self, force=False): return self._proc.terminate(force) def wait(self, timeout=None): - self._proc.wait(timeout) + try: + self._proc.wait(timeout) + return True + except Exception: + return False @property def exit_code(self): diff --git a/python/paddle/distributed/launch/job/pod.py b/python/paddle/distributed/launch/job/pod.py index cda400f0a324a..c99b2db547a26 100644 --- 
a/python/paddle/distributed/launch/job/pod.py +++ b/python/paddle/distributed/launch/job/pod.py @@ -116,14 +116,26 @@ def deploy(self): self._restart += 1 - def stop(self, sigint=0): + def stop(self, sigint=15, timeout=None): for c in self._containers: - force = True if sigint == 9 else False - c.terminate(force) + if isinstance(sigint, int) and timeout is None: + c.send_signal(sigint) + else: + c.terminate() + + if isinstance(timeout, int): + if not self.join(timeout): + for c in self._containers: + c.terminate(force=True) + return False + else: + return True - def join(self): + def join(self, timeout=None): for c in self._containers: - c.wait(None) + if not c.wait(timeout): + return False + return True @property def status(self): diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index 4c1b99df178ea..fccb352c2a3cd 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -98,11 +98,11 @@ def launch(): The ``training_script_args`` includes arguments required by IPU distributed launch and illustrated as below. ``Examples 10`` has provided a example of paddle.distributed.launch with IPUs. - - ``--hosts``: The hosts for IPU distributd training. + - ``--hosts``: The hosts for IPU distributd training. Each host is able to include multiple processes. - - ``--nproc_per_host``: The number of processes launched per host. + - ``--nproc_per_host``: The number of processes launched per host. Each process is able to include multiple replicas. - - ``--ipus_per_replica``: The number of IPUs requested per replica. + - ``--ipus_per_replica``: The number of IPUs requested per replica. Each replica is able to include multiple IPUs. - ``--ipu_partition``: The partition name of IPU devices. @@ -110,7 +110,7 @@ def launch(): - ``training_script``: The full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``. - - ``training_script_args``: The args of the IPU distributed training program/script. + - ``training_script_args``: The args of the IPU distributed training program/script. e.g., ``--lr=0.1``. Returns: - ``None`` @@ -253,9 +253,11 @@ def launch(): .. code-block:: bash :name: code-block-example-bash10 - # With the following command, the job will begin to run the distributhed program with IPUs. - # Only support and require the `device_num` as the arg and `ipu` as the launch script. - # Please Check the details about the following args of the launch scripte from `utils/ipu_launch.py`. 
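# Editor's note: the new Pod.stop()/join() above implement a common "graceful
# then forceful" shutdown: send a signal, wait up to `timeout` seconds, then
# force-kill whatever is still alive. A stand-alone sketch of that pattern with
# plain subprocess (not the launcher's Container/Pod API):
import subprocess
import sys

def stop_gracefully(proc, timeout=30):
    proc.terminate()                     # polite request first, like stop(sigint=15)
    try:
        proc.wait(timeout=timeout)       # like Pod.join(timeout)
        return True
    except subprocess.TimeoutExpired:
        proc.kill()                      # escalate, like terminate(force=True)
        proc.wait()
        return False

child = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
print(stop_gracefully(child, timeout=3))  # False: the child outlived the timeout and was killed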
+ # With the following command, the job will begin to run the distributhed program with IPUs + # Require `devices` as the number of IPUs + # Require `training_script` to be set as `ipu` + # Require `training_script_args` as the arguments of IPU distributed training instead of the arguments of the training program/script + # Please Check the `IPU Parameters` for details python -m paddle.distributed.launch --devices 4 ipu --hosts=localhost --nproc_per_host=2 --ipus_per_replica=1 --ipu_partition=pod16 --vipu_server=127.0.0.1 train.py """ diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 52d19ae52b2ba..e95b771fe6f6a 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -42,6 +42,7 @@ from paddle.distributed.collective import _set_default_store from paddle.distributed.collective import _new_process_group_impl from paddle.distributed.collective import Group +from paddle.distributed.collective import _set_group_map_backend __all__ = [] @@ -257,6 +258,7 @@ def train(): name=_default_group_name) _set_group_map_by_name(_default_group_name, group) _set_group_map(0, group) + _set_group_map_backend(group, backend) parallel_helper._set_parallel_ctx(True) paddle.distributed.barrier(group=group) diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index f0d02451141ae..7afba8c0f1377 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -452,7 +452,7 @@ def _check_and_update_gradient(params_grads, loss_scaling, dist_context): inputs = {'X': grads, 'Scale': loss_scaling} outputs = {'Out': grads, 'FoundInfinite': found_inf} - attrs = {'op_role': OpRole.Backward} + attrs = {'op_role': OpRole.Optimize} new_op = main_block.append_op(type='check_finite_and_unscale', inputs=inputs, outputs=outputs, @@ -732,7 +732,7 @@ def _update_loss_scaling(self, grads, found_inf): 'incr_ratio': self.get_attr("incr_ratio"), 'decr_ratio': self.get_attr("decr_ratio"), 'stop_update': self.get_attr("stop_update"), - 'op_role': OpRole.Backward + 'op_role': OpRole.Optimize } new_op = main_block.append_op(type='update_loss_scaling', diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index 394d71706c4c4..717f8fa27f2df 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -21,20 +21,13 @@ from paddle.fluid import layers from paddle.fluid.framework import program_guard, device_guard from .pass_base import PassBase, PassType, register_pass -from paddle.distributed.fleet.meta_optimizers.common import OpRole -from paddle.distributed.auto_parallel.utils import set_var_dist_attr +from paddle.distributed.auto_parallel.utils import set_var_dist_attr, is_optimize_op, OpRole, OP_ROLE_KEY from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping from paddle.distributed.auto_parallel.process_group import get_world_process_group world_process_group = get_world_process_group() -def _is_the_optimizer_op(op): - OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() - return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) - - def _remove_and_get_optimizer_op(main_program, dist_context): # 1 create tmp block # 2 mv optimizer op from 
global program to tmp block @@ -43,9 +36,8 @@ def _remove_and_get_optimizer_op(main_program, dist_context): temp_block = main_program._create_block() removed_op_idx = [] optimize_ops_desc = [] - skip_ops = ["increment", "elementwise_mod", "equal"] for idx, op in enumerate(main_block.ops): - if _is_the_optimizer_op(op) and op.type not in skip_ops: + if is_optimize_op(op): # append optimizer op to tmp block new_op_desc = temp_block.desc.append_op() new_op_desc.copy_from(op.desc) @@ -57,7 +49,8 @@ def _remove_and_get_optimizer_op(main_program, dist_context): dist_context.del_dist_op_for_program(op) for idx in removed_op_idx[::-1]: - main_block._remove_op(idx) + main_block._remove_op(idx, sync=False) + main_block._sync_with_cpp() return optimize_ops_desc @@ -65,7 +58,7 @@ def _remove_and_get_optimizer_op(main_program, dist_context): def _remove_op_role_var(param, grad): op_maker = core.op_proto_and_checker_maker op = grad.op - if op.has_attr(op_maker.kOpRoleVarAttrName()): + if op and op.has_attr(op_maker.kOpRoleVarAttrName()): op._remove_attr(op_maker.kOpRoleVarAttrName()) @@ -109,7 +102,7 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): outputs={'Out': [step_var]}, attrs={ 'step': float(1.0), - 'op_role': OpRole.Optimize + OP_ROLE_KEY: OpRole.Backward }) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( increment_op, world_process_group.ranks, [-1], dist_context) @@ -123,7 +116,8 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): attrs={ 'axis': -1, 'use_mkldnn': False, - 'op_role': OpRole.Optimize + OP_ROLE_KEY: + OpRole.Backward }) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( elementwise_mod_op, world_process_group.ranks, [-1], dist_context) @@ -134,7 +128,7 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): 'Y': zero_var }, outputs={'Out': cond_var}, - attrs={'op_role': OpRole.Optimize}) + attrs={OP_ROLE_KEY: OpRole.Backward}) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( equal_op, world_process_group.ranks, [-1], dist_context) @@ -143,7 +137,6 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): def _append_gradient_merge_backward_op( main_program, startup_program, params_grads: List[Tuple[Any, Any]], - cond_var_name: str, dist_context) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]: main_block = main_program.global_block() startup_block = startup_program.global_block() @@ -201,7 +194,7 @@ def _append_gradient_merge_backward_op( attrs={ 'axis': -1, 'use_mkldnn': False, - 'op_role': OpRole.Optimize + OP_ROLE_KEY: OpRole.Backward }) new_params_to_grads.append([param, gradient_merge_var]) grad_to_gradient_merge[grad.name] = gradient_merge_var.name @@ -233,8 +226,7 @@ def true_apply_gradient(): 'bias': 0.0, 'bias_after_scale': False }) - new_grad.op._set_attr(op_maker.kOpRoleAttrName(), - OpRole.Optimize) + new_grad.op._set_attr(OP_ROLE_KEY, OpRole.Optimize) # append optimizer ops for op_desc in optimize_ops_desc: @@ -272,29 +264,27 @@ def true_apply_gradient(): dtype=new_grad.dtype, value=0.0, out=new_grad) - new_grad.op._set_attr(op_maker.kOpRoleAttrName(), - op_maker.OpRole.Optimize) + new_grad.op._set_attr(OP_ROLE_KEY, op_maker.OpRole.Optimize) layers.cond(cond_var, true_fn=true_apply_gradient, false_fn=None) cond_op = main_program.global_block().ops[-1] - cond_op._set_attr('op_role', OpRole.Optimize) + cond_op._set_attr(OP_ROLE_KEY, OpRole.Optimize) def parse_program(main_program, startup_program, params_grads, k_steps, avg, dist_context): - # 1 create gradient_merge_cond - cond_var = _get_gm_cond_var(main_program, 
k_steps, dist_context) - - # 2 remove optimizer_op from main_program + # 1 remove optimizer_op from main_program optimize_ops_desc = _remove_and_get_optimizer_op(main_program, dist_context) # back to block 0 main_program._rollback() - # 3 append gradient merge backward op to main_program + # 2 append gradient merge backward op to main_program new_params_to_grads, grad_to_gradient_merge = _append_gradient_merge_backward_op( - main_program, startup_program, params_grads, cond_var.name, - dist_context) + main_program, startup_program, params_grads, dist_context) + + # 3 create gradient_merge_cond + cond_var = _get_gm_cond_var(main_program, k_steps, dist_context) # 4 create ConditionalBlock and append gradient merge optimizer ops _create_cond_block_and_update_optimizer(main_program, cond_var, diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index c37ac87da71b8..5ed01a0114421 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -2211,12 +2211,6 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): check_type(target_gradients, 'target_gradients', (framework.Variable, list, tuple, type(None)), 'paddle.static.gradients') - - from ..incubate.autograd.primx import _gradients - from ..incubate.autograd.utils import prim_enabled - if prim_enabled(): - return _gradients(targets, inputs, target_gradients) - outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) return _as_list(outs) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 06f206c36d111..1f81afbed64d7 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -16,6 +16,7 @@ import os import six import sys +import warnings from .. import compat as cpt from . import framework from .framework import _get_paddle_place, _get_paddle_place_list @@ -373,6 +374,12 @@ def _compile_data_parallel(self, places, use_device, scope=None): else: self._exec_strategy.num_threads = len(places) * 2 + if "FLAGS_use_cinn" in core.globals() and core.globals( + )["FLAGS_use_cinn"] and self._exec_strategy.num_threads != 1: + warnings.warn("At present, when CINN is turned on, each process can " \ + "only contain one thread, so reset the number of threads to 1 here.") + self._exec_strategy.num_threads = 1 + if self._build_strategy.num_trainers > 1: assert self._is_data_parallel, \ "If you use multi-trainer to train the model, you should use "\ @@ -499,6 +506,192 @@ def _get_places(self, place, place_list): return place_list +class IpuDynamicPatcher(object): + """ + Patcher for IPU dynamic2static support. + """ + + patcher_cache = [] + + def __init__(self): + pass + + @staticmethod + def convert_concrete_program(ipu_strategy, + concrete_program, + class_instance=None): + """ + Convert the ConcreteProgram to IPUConcreteProgram. 
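# Editor's note: the _get_gm_cond_var() ops above (increment -> elementwise_mod
# -> equal) encode the gradient-merge trigger "run the optimizer only every
# k_steps micro-batches". In plain Python the schedule they build is simply:
k_steps = 4          # illustrative accumulation interval
step = 0
for micro_batch in range(10):
    step += 1                        # increment op
    cond = (step % k_steps) == 0     # elementwise_mod + equal ops
    # gradients are accumulated every iteration; the conditional block holding
    # the moved optimizer ops only runs when cond is True
    if cond:
        print("apply optimizer after micro-batch", micro_batch)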
+ """ + from ..fluid.dygraph.base import switch_to_static_graph + from ..fluid import backward + from ..fluid.initializer import Constant + from ..fluid.framework import device_guard + import paddle + + inputs = concrete_program.inputs + outputs = concrete_program.outputs + startup_program = concrete_program.startup_program + + scope = paddle.static.global_scope() + + @switch_to_static_graph + def append_backward_desc(): + program = concrete_program.main_program + + # backward with optimizer to add backward graph to program + backward.gradients_with_optimizer(program, ipu_strategy._optimizer) + + # initialize backward parameters + exe = paddle.static.Executor(paddle.CPUPlace()) + startup_program = paddle.static.default_startup_program() + exe.run(startup_program) + + return program + + if ipu_strategy.enable_fp16: + class_instance.to(dtype="float16") + + # copy the bias and filters + for param_or_buffer in concrete_program.parameters: + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + src_tensor = param_or_buffer.value().get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + + # TODO(czr): feed and fetch list needs to consider more type + if class_instance: + feed_list = [elem.name for elem in inputs[1:] if elem is not None] + else: + feed_list = [elem.name for elem in inputs if elem is not None] + fetch_list = [elem.name for elem in outputs] + + if ipu_strategy.is_training: + concrete_program.main_program = append_backward_desc() + # copy optimizer parameters + optimizer = ipu_strategy._optimizer + for k, v in optimizer._accumulators.items(): + for param_name, var_tmp in v.items(): + var = optimizer.helper.create_global_variable( + name=var_tmp.name, + persistable=True, + dtype=var_tmp.dtype, + type=var_tmp.type, + shape=var_tmp.shape, + belong_to_optimizer=True) + device = optimizer._get_device_for_param(param_name) + with device_guard(device): + optimizer.helper.set_variable_initializer( + var, initializer=Constant(value=0.0)) + param_or_lr_tensor = scope.find_var( + var_tmp.name).get_tensor() + optim_tensor = var.value().get_tensor() + param_or_lr_tensor._share_data_with(optim_tensor) + optimizer._accumulators[k][param_name] = var + + @switch_to_static_graph + def func_compile(): + if ipu_strategy.enable_fp16: + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {"cumsum"} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + concrete_program.main_program, + amp_list, + use_fp16_guard=False) + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + concrete_program.main_program, + to_fp16_var_names=to_fp16_var_names) + + program = IpuCompiledProgram(concrete_program.main_program, + ipu_strategy=ipu_strategy, + scope=scope).compile( + feed_list, fetch_list) + return program + + main_program = func_compile() + concrete_program.main_program = main_program + return concrete_program + + @staticmethod + def patch_program_cache(ipu_strategy): + """ Monkey patch ProgramCache discriptor to support dynamic2static in IPU. + + Args: + ipu_strategy: The ipu_strategy used in dynamic graph. 
+ + Returns: + None + """ + from ..fluid.dygraph.dygraph_to_static.program_translator import ProgramCache + from ..fluid.dygraph.dygraph_to_static.program_translator import CacheKey + from ..fluid.dygraph.dygraph_to_static import logging_utils + from ..fluid.dygraph.dygraph_to_static.program_translator import MAX_TRACED_PROGRAM_COUNT + from ..fluid.dygraph.dygraph_to_static.partial_program import partial_program_from + + old_getter = ProgramCache.__getitem__ + + def patch_getter(self, item): + if not isinstance(item, CacheKey): + raise ValueError( + 'type(item) should be CacheKey, but received %s' % + type(item).__name__) + item_id = hash(item) + self._recent_key = item_id + if item_id not in self._caches or ipu_strategy.need_compile: + if item_id in self._caches: + logging_utils.warn( + "ipu_strategy changes detected. Please sync weights.") + if self._caches and not ipu_strategy.need_compile: + logging_utils.warn( + "dynamic2static on IPU doesn't support multiple caches. Please make sure " + "dynamic inputs are not used.") + concrete_program, _ = self._build_once(item) + concrete_program = IpuDynamicPatcher.convert_concrete_program( + ipu_strategy, concrete_program, item.class_instance) + + self._caches[item_id] = (concrete_program, + partial_program_from(concrete_program)) + # Note: raise warnings if the number of traced programs is more than `max_tracing_count` + current_tracing_count = len(self._caches) + if current_tracing_count > MAX_TRACED_PROGRAM_COUNT: + logging_utils.warn( + "Current traced program number: {} > `max_tracing_count`:{}. Too many cached programs will bring expensive overhead. " + "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors." + .format(current_tracing_count, + MAX_TRACED_PROGRAM_COUNT)) + + return self._caches[item_id] + + setattr(ProgramCache, '__getitem__', patch_getter) + IpuDynamicPatcher.patcher_cache.append( + [ProgramCache, '__getitem__', old_getter]) + + @staticmethod + def patch_lr_scheduler(ipu_strategy): + from paddle.optimizer.lr import LRScheduler + # For IPU dynamic graph usage, lr_var is not synced in the executor as static mode does. + # Manually set lr to ipu_strategy to update the lr. + old_step = LRScheduler.step + + def patch_step(self, epoch=None): + old_step(self, epoch) + ipu_strategy.set_options({"lr": self.last_lr}) + + setattr(LRScheduler, 'step', patch_step) + IpuDynamicPatcher.patcher_cache.append([LRScheduler, 'step', old_step]) + + @staticmethod + def register_patch(ipu_strategy): + IpuDynamicPatcher.patch_program_cache(ipu_strategy) + IpuDynamicPatcher.patch_lr_scheduler(ipu_strategy) + + @staticmethod + def release_patch(): + for module, key, attr in IpuDynamicPatcher.patcher_cache: + setattr(module, key, attr) + + class IpuStrategy(object): """ Help users precisely control the graph building in :code:`paddle.static.IpuCompiledProgram` . @@ -535,10 +728,121 @@ def __init__(self): self._ipu_strategy.set_options(default_options) self.has_custom_ops = False self.custom_op_names = [] + self.need_compile = True else: raise RuntimeError( "Can not use IpuStrategy in non IPU compiled environment, please re-compile with WITH_IPU=ON." ) + from paddle import in_dynamic_mode + if in_dynamic_mode(): + self.register_patch() + + def register_patch(self): + """ + Register patch functions to support dynamic-to-static on IPU. This operation would break the dy2static functionality on CPU. + Use `release_patch` to release the patch. + + Examples: + ..
code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + ipu_strategy = static.IpuStrategy() + + ipu_strategy.register_patch() + """ + IpuDynamicPatcher.register_patch(self) + + def release_patch(self): + """ + Release the registered IPU functions. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + ipu_strategy = static.IpuStrategy() + + ipu_strategy.release_patch() + """ + IpuDynamicPatcher.release_patch() + + def set_optimizer(self, optimizer): + """ + Set optimizer to ipu_strategy in dynamic mode. + + Args: + optimizer (Optimizer): Optimizer to be used in training. + + Returns: + None. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + linear = paddle.nn.Linear(10, 10) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=linear.parameters()) + ipu_strategy = static.IpuStrategy() + ipu_strategy.set_optimizer(optimizer) + """ + from paddle import in_dynamic_mode + if in_dynamic_mode(): + self._optimizer = optimizer + optimizer_attrs = self.parse_optimizer(optimizer) + self._ipu_strategy.set_options(optimizer_attrs) + else: + raise RuntimeError("Only needs to set optimizer in dynamic mode.") + + def parse_optimizer(self, optimizer): + """ + Parse optimizer attributes for IPU dynamic to static support. Currently only support parse lr. + + Args: + optimizer (Optimizer): Optimizer to be parsed. + + Returns: + Dict. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + linear = paddle.nn.Linear(10, 10) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=linear.parameters()) + ipu_strategy = static.IpuStrategy() + attrs = ipu_strategy.parse_optimizer(optimizer) + """ + + def get_lr(): + from paddle.optimizer.lr import LRScheduler + if isinstance(optimizer._learning_rate, float): + return {"lr": optimizer._learning_rate} + elif isinstance(optimizer._learning_rate, LRScheduler): + return {"lr": optimizer._learning_rate()} + + attr_fn = [get_lr] + optimizer_attrs = {"is_dynamic": True} + for fn in attr_fn: + optimizer_attrs.update(fn()) + return optimizer_attrs def set_graph_config(self, num_ipus=1, @@ -736,6 +1040,10 @@ def set_options(self, options): ipu_strategy.set_options(options) """ self._ipu_strategy.set_options(options) + # check whether to recompile program with updated ipu options. 
+ recompile_white_list = {'lr'} + if options.keys() - recompile_white_list: + self.need_compile = True def get_option(self, option): """ @@ -1043,4 +1351,6 @@ def compile(self, feed_list, fetch_list): if not hasattr(program, 'org_program'): program.org_program = self._program + self._ipu_strategy.need_compile = False + return program diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index f1da3990a36be..a46a0d12fddea 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -963,10 +963,10 @@ def _update_program(self): else: scale_dict = self._quantized_threshold for key, val in scale_dict.items(): - utils.set_variable_data(self._scope, self._place, key + ".scale", + utils.set_variable_data(self._scope, self._place, key + "@scale", np.array([val], dtype=np.float32)) utils.set_variable_data(self._scope, self._place, - key + ".quant_dequant.scale", + key + ".quant_dequant@scale", np.array([val], dtype=np.float32)) if not self._onnx_format: diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 622d54343f6a0..2f155ca0edfc2 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -63,8 +63,8 @@ def __init__(self, self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set( [-1]) self._scale_immutable_ops = [ - 'transpose2', 'reshape2', 'pool2d', 'slice', 'nearest_interp', - 'nearest_interp_v2' + 'transpose2', 'reshape2', 'pool2d', 'slice', 'shape', + 'nearest_interp', 'nearest_interp_v2' ] self._scale_ops = ['scale'] self._conv_ops = ['conv2d', 'depthwise_conv2d'] @@ -247,7 +247,7 @@ def _update_scales(graph): waiting_for_scale = set() for op in graph.all_op_nodes(): if op.name() in self._scale_immutable_ops: - if op.name() == 'slice': + if op.name() == 'slice' or op.name() == 'shape': input_name = op.input("Input")[0] else: input_name = op.input("X")[0] @@ -264,6 +264,14 @@ def _update_scales(graph): elif output_name in self._var_quant_scales: self._var_quant_scales[ input_name] = self._var_quant_scales[output_name] + elif op.name() == 'concat': + output_name = op.output("Out")[0] + if output_name in self._var_quant_scales: + input_names = op.input("X") + for input_name in input_names: + self._var_quant_scales[ + input_name] = self._var_quant_scales[ + output_name] elif op.name() in self._scale_ops: input_name = op.input("X")[0] output_name = op.output("Out")[0] @@ -595,13 +603,6 @@ def _compute_lstm_weight_scales(wx_name, wh_name): _compute_lstm_weight_scales("WeightX", "WeightH") return graph - def _find_avg_pooling_ids(self, graph): - for op in graph.all_op_nodes(): - if op.name() in self._pool_ops: - if op.op().attr("pooling_type") == "avg": - self._op_ids_to_skip.add(op.id()) - return self._op_ids_to_skip - def _update_relu_output_scales(self, graph): def _set_unsigned_scale(graph, ops, op_out_name, predicate): @@ -651,11 +652,9 @@ def _quantize_fp32_graph(self, graph): 'reshape_transpose_matmul_mkldnn_fuse_pass') graph = self._apply_pass( graph, 'reshape_transpose_matmul_v2_mkldnn_fuse_pass') - graph = self._apply_pass( - graph, 'cpu_quantize_placement_pass', - ['quantize_enabled_op_types', 'quantize_excluded_op_ids'], - 
[self._ops_to_quantize, - self._find_avg_pooling_ids(graph)]) + graph = self._apply_pass(graph, 'cpu_quantize_placement_pass', + ['quantize_enabled_op_types'], + [self._ops_to_quantize]) graph = self._apply_pass( graph, 'cpu_quantize_pass', ['quant_var_scales', 'data_layout'], [self._var_quant_scales, diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 3a316e9192e39..e2502e7f5d447 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -906,7 +906,7 @@ def _quantized_scale_name(self, var_name): """ Return the scale name of quantized variable for the input `var_name`. """ - return "%s.scale" % (var_name) + return "%s@scale" % (var_name) def _is_skip_quant(self, graph, op_node): """ @@ -1246,8 +1246,8 @@ def _original_var_name(self, var_name): return var_name[:-len('.quantized')] if var_name.endswith('.dequantized'): return var_name[:-len('.dequantized')] - if var_name.endswith('.scale'): - return var_name[:-len('.scale')] + if var_name.endswith('@scale'): + return var_name[:-len('@scale')] else: return var_name @@ -1440,11 +1440,18 @@ def apply(self, graph): [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: continue - scale_node = graph.create_persistable_node( - name=self._scale_name(in_node.name()), - var_type=core.VarDesc.VarType.LOD_TENSOR, - shape=[1], - var_dtype=in_node.dtype()) + try: + graph._find_node_by_name( + graph.all_var_nodes(), + self._scale_name(in_node.name())) + continue + except: + scale_node = graph.create_persistable_node( + name=self._scale_name(in_node.name()), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=in_node.dtype()) + data_type = 'float64' if in_node.dtype() \ == core.VarDesc.VarType.FP64 else 'float32' _init_var_node(scale_node, np.ones([1], dtype=data_type), @@ -1705,7 +1712,7 @@ def _inser_quant_dequant_moving_average_abs_max_op(self, graph, var_node, shape=var_node.shape(), var_dtype=var_node.dtype()) scale_in_node = graph.create_persistable_node( - name="{}.quant_dequant.scale".format(var_node.name()), + name="{}.quant_dequant@scale".format(var_node.name()), var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[1], var_dtype=var_node.dtype()) @@ -1922,7 +1929,7 @@ def _quantized_scale_name(self, var_name): """ Return the scale name of quantized variable for the input `var_name`. 
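# Editor's note: the quantization passes above rename the per-variable scale
# suffix from ".scale" to "@scale" (and ".quant_dequant.scale" to
# ".quant_dequant@scale"). A stand-alone check mirroring the renamed
# _quantized_scale_name/_original_var_name helpers; the variable name used
# below is illustrative.
def quantized_scale_name(var_name):
    return "%s@scale" % var_name

def original_var_name(var_name):
    if var_name.endswith('@scale'):
        return var_name[:-len('@scale')]
    return var_name

name = "conv2d_0.w_0"                  # illustrative Paddle-style variable name
scale_name = quantized_scale_name(name)
print(scale_name)                      # conv2d_0.w_0@scale
assert original_var_name(scale_name) == name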
""" - return "%s.scale" % (var_name) + return "%s@scale" % (var_name) def _zero_point_name(self, var_name): """ diff --git a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py index 36302aea187af..d69241d6cb982 100644 --- a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py +++ b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py @@ -68,7 +68,7 @@ def train_lenet(lenet, reader, optimizer): out = lenet(img) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py index d8887e1964128..1102ddb0074a2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_graph.py +++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py @@ -46,7 +46,7 @@ def conv_block(): act="relu") prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return [img, label], avg_loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 0bb246f9ac923..2c18eff983e4c 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -118,7 +118,7 @@ def func_qat(self): out = lenet(img) acc = fluid.layers.accuracy(out, label) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() adam.minimize(avg_loss) lenet.clear_gradients() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py index e40816f39545a..6a3e35007dd46 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py @@ -115,7 +115,7 @@ def model_train(self, model, batch_num=-1, batch_size=32, use_amp=False): out = model(img) acc = fluid.layers.accuracy(out, label) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) scaled_loss = scaler.scale(avg_loss) scaled_loss.backward() @@ -125,7 +125,7 @@ def model_train(self, model, batch_num=-1, batch_size=32, use_amp=False): out = model(img) acc = fluid.layers.accuracy(out, label) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() adam.minimize(avg_loss) diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py index 28706d34c63fd..fbb1adefa1111 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py @@ -45,7 +45,7 @@ def conv_net(img, label): act="relu") prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py 
b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index c42777d673a7d..ce06bd63a8628 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -41,7 +41,7 @@ def linear_fc(num): for _ in six.moves.xrange(num): hidden = fluid.layers.fc(hidden, size=128, act='relu') loss = fluid.layers.cross_entropy(input=hidden, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -92,7 +92,7 @@ def conv_bn_layer(input, pool_stride=2) fc = fluid.layers.fc(input=pool, size=10) loss = fluid.layers.cross_entropy(input=fc, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -116,7 +116,7 @@ def conv_net(img, label, quant_skip_pattern): with fluid.name_scope(quant_skip_pattern): prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss @@ -620,7 +620,7 @@ def conv_bn_layer(input, pool_add = fluid.layers.elementwise_add(x=pool1, y=pool2, act='relu') fc = fluid.layers.fc(input=pool_add, size=10) loss = fluid.layers.cross_entropy(input=fc, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py index 0f4c450cfa98d..2e78d4ea8cba3 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py @@ -53,7 +53,7 @@ def conv_net(img, label): hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py index 80fe720504efd..d2a5383024338 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py @@ -48,7 +48,7 @@ def conv_net(img, label): hidden = fluid.layers.fc(input=conv_pool_1, size=100, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py index ec9ab8820a613..9ee4f3681588d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py +++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py @@ -55,7 +55,7 @@ def conv_net(img, label): hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py index 
60dfde6b45c37..35787c02eef3e 100644 --- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -136,7 +136,7 @@ def train(net_type, use_cuda, save_dirname, is_local): logits = fluid.layers.fc(input=net, size=classdim, act="softmax") cost, predict = fluid.layers.softmax_with_cross_entropy( logits, label, return_softmax=True) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) acc = fluid.layers.accuracy(input=predict, label=label) # Test program @@ -460,7 +460,7 @@ def decorate_with_data_loader(self): logits = fluid.layers.fc(input=net, size=10, act="softmax") cost, predict = fluid.layers.softmax_with_cross_entropy( logits, label, return_softmax=True) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.Lamb(learning_rate=0.001) amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists( diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py index dd900ff428135..a7472f3bce5ac 100644 --- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py +++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py @@ -32,7 +32,7 @@ def linear_fc(num): for _ in six.moves.xrange(num): hidden = fluid.layers.fc(hidden, size=128, act='relu') loss = fluid.layers.cross_entropy(input=hidden, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -63,7 +63,7 @@ def conv_bn_layer(input, hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu') fc = fluid.layers.fc(input=hidden, size=10) loss = fluid.layers.cross_entropy(input=fc, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -83,7 +83,7 @@ def conv_net(img, label): act="relu") prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index bbc61d34613da..4bb1ed72b7b9f 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -87,7 +87,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py index 3d5ca1c136816..57d952fd6bb73 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py @@ -18,9 +18,10 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class AssertTransformer(gast.NodeTransformer): +class AssertTransformer(BaseTransformer): """ A class transforms python assert to convert_assert. 
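# Editor's note: the test changes above mechanically migrate from the legacy
# fluid.layers.mean to the 2.x API paddle.mean, which computes the same mean
# over all elements in these usages. A minimal dygraph check:
import paddle

loss = paddle.to_tensor([1.0, 2.0, 3.0])
avg_loss = paddle.mean(loss)   # replaces fluid.layers.mean(loss)
print(float(avg_loss))         # 2.0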
""" diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index aa01945ac849e..a9e8f447e998c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -20,6 +20,7 @@ # See details in https://github.com/serge-sans-paille/gast/ import os from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer from paddle.fluid.dygraph.dygraph_to_static.early_return_transformer import EarlyReturnTransformer from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer @@ -34,6 +35,7 @@ from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import LoopTransformer from paddle.fluid.dygraph.dygraph_to_static.print_transformer import PrintTransformer from paddle.fluid.dygraph.dygraph_to_static.return_transformer import ReturnTransformer +from paddle.fluid.dygraph.dygraph_to_static.create_variable_transformer import CreateVariableTransformer from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.tensor_shape_transformer import TensorShapeTransformer @@ -58,7 +60,7 @@ def apply_optimization(transformers): transformers.insert(3, BreakTransformOptimizer) -class DygraphToStaticAst(gast.NodeTransformer): +class DygraphToStaticAst(BaseTransformer): """ Main class to transform Dygraph to Static Graph """ @@ -95,6 +97,7 @@ def transfer_from_node_type(self, node_wrapper): BreakContinueTransformer, # break/continue in loops ReturnTransformer, # return in functions LogicalTransformer, # logical and/or/not + CreateVariableTransformer, # create undefined var for if / while / for LoopTransformer, # for/while -> while_op IfElseTransformer, # if/else -> cond_op AssertTransformer, # assert statement diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py new file mode 100644 index 0000000000000..9df7e8d9b4f41 --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py @@ -0,0 +1,614 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.utils import gast +from paddle.fluid import unique_name +from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name +from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.utils import create_assign_node +from paddle.fluid.dygraph.dygraph_to_static.utils import ORIGI_INFO +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_INDEX_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_TUPLE_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_TUPLE_INDEX_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_VAR_LEN_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_VAR_NAME_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_ZIP_TO_LIST_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_TARGET_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_ITERATOR_PREFIX + + +class BaseTransformer(gast.NodeTransformer): + + def visit(self, node): + if not isinstance(node, gast.AST): + msg = ('Expected "gast.AST", but got "{}".').format(type(node)) + raise ValueError(msg) + origin_info = getattr(node, ORIGI_INFO, None) + + result = super(BaseTransformer, self).visit(node) + + iter_result = result + if iter_result is not node and iter_result is not None: + if not isinstance(iter_result, (list, tuple)): + iter_result = (iter_result, ) + if origin_info is not None: + for n in iter_result: + setattr(n, ORIGI_INFO, origin_info) + + return result + + +class RenameTransformer(BaseTransformer): + + def __init__(self, node): + assert isinstance( + node, gast.AST), "RenameTransformer only accepts gast.AST as input" + self.root = node + self.old_name = "" + self.new_name = "" + + def rename(self, old_name, new_name): + self.old_name = old_name + self.new_name = new_name + self.visit(self.root) + + def visit_Name(self, node): + self.generic_visit(node) + if node.id == self.old_name: + node.id = self.new_name + return node + + def visit_Attribute(self, node): + self.generic_visit(node) + attr_full_name = get_attribute_full_name(node) + if attr_full_name == self.old_name: + new_name_node = gast.parse(self.new_name).body[0].value + return new_name_node + return node + + +class NameNodeReplaceTransformer(BaseTransformer): + """ + This class replaces specified gast.Name node by replace_node. + """ + + def __init__(self, root_node, target_name, replace_node): + assert isinstance(target_name, str) + + # NOTE(liym27): + # Use gast.Name to replace gast.Name, otherwise, errors may occur. + # + # For examples: + # If using a gast.Subscript to replace gast.Name, and the original gast.Name + # is in the arguments of FunctionDef, an exception will be raised. + # + # ``` + # def func(x[i])) # x[i] can not be a argument + # # ... + # ``` + + assert isinstance(replace_node, gast.Name) + self.target_name = target_name + self.replace_node = replace_node + + self.visit(root_node) + + def visit_Name(self, node): + if node.id == self.target_name: + return self.replace_node + return node + + def visit_Nonlocal(self, node): + names = node.names + + def replace(s): + if s == self.target_name: return self.replace_node.id + return s + + node.names = list(map(replace, names)) + return node + + +class ForLoopTuplePreTransformer(BaseTransformer): + """ pre-process of for loop. 
+ >>> for A in B: + >>> C + + will be changed into : + + >>> UUID_iterator = _jst.Indexable(B) # make iterator-only to indexable list. + >>> for UUID_target in UUID_iterator: + >>> A = _jst.Unpack(UUID_target, structure) + >>> C + + make the later loop_transform have unified type: + >>> for target in iter: + >>> body + """ + + def __init__(self, wrapper_root): + self.wrapper_root = wrapper_root + self.root = wrapper_root.node + + def transform(self): + self.visit(self.root) + + def visit_For(self, node): + self.generic_visit(node) + tuple_target = unique_name.generate(FOR_ITER_TARGET_PREFIX) + tuple_iterator = unique_name.generate(FOR_ITER_ITERATOR_PREFIX) + origin_tuple_node = node.target + assign_iterator_node = gast.parse( + f"{tuple_iterator} = _jst.Indexable({ast_to_source_code(node.iter).strip()})" + ).body[0] + node.target = gast.Name(id=tuple_target, + ctx=gast.Store(), + annotation=None, + type_comment=None) + node.iter = gast.Name(id=tuple_iterator, + ctx=gast.Load(), + annotation=None, + type_comment=None) + node.body[0:0] = self.tuple_to_stmts(origin_tuple_node, tuple_target) + # return a list will insert a list of node replace the original for node. + return [assign_iterator_node, node] + + def tuple_node_to_unpack_structure(self, node): + """ Create a sequence to represents the structure of nest. + For example: `a, (b,c), [d,e,f]` is represented by + `[1, [1,1], [1,1,1]]`. the `1` is just a notation. + + Specially, `a` is represented by `1`. + """ + ret = [] + if not isinstance(node, (gast.Tuple, gast.List)): + return 1 + for element in node.elts: + ret.append(self.tuple_node_to_unpack_structure(element)) + return ret + + def tuple_to_stmts(self, node, tuple_name): + structure_str = str(self.tuple_node_to_unpack_structure(node)) + node_str = ast_to_source_code(node).strip() + assign_node_str = f"{node_str} = _jst.Unpack({tuple_name}, {structure_str})" + assign_node = gast.parse(assign_node_str).body[0] + return [assign_node] + + +class SplitAssignTransformer(BaseTransformer): + """ + This class transforms sequence assignments and multi-target assignments to normal assignments. 
+ """ + + def __init__(self, ast_node): + assert isinstance(ast_node, gast.AST) + self.ast_root = ast_node + + def transform(self): + self.visit(self.ast_root) + + def visit_Assign(self, node): + target_nodes = node.targets + if len(target_nodes) == 1: + node = self._parse_sequence_assign(node) + else: + node = self._parse_multi_target_assign(node) + return node + + def _parse_sequence_assign(self, node): + """ + a, b = c, d + -> + a = c + b = d + """ + assert isinstance(node, gast.Assign) + + target_nodes = node.targets + value_node = node.value + if not isinstance(target_nodes[0], (gast.List, gast.Tuple)): + return node + if not isinstance(value_node, (gast.List, gast.Tuple)): + return node + + targets = node.targets[0].elts + values = node.value.elts + if len(targets) != len(values): + return node + + new_nodes = [] + for target, value in zip(targets, values): + assign_node = gast.Assign(targets=[target], value=value) + new_nodes.append(assign_node) + + return new_nodes + + def _parse_multi_target_assign(self, node): + """ + Example 1: + a = b = c + -> + b = c + a = b + + Example 2: + a, b = c, d = x + -> + c,d = x + a = c + b = d + """ + assert isinstance(node, gast.Assign) + + target_nodes = node.targets + value_node = node.value + new_nodes = [] + for target in reversed(target_nodes): + assign_node = gast.Assign(targets=[target], value=value_node) + # NOTE: Because assign_node can be sequence assign statement like `a,b = c,d`, + # it's necessary to visit this new assign_node + parsed_node = self.visit_Assign(assign_node) + if not isinstance(parsed_node, list): + parsed_node = [parsed_node] + + new_nodes.extend(parsed_node) + value_node = target + + return new_nodes + + +class ForNodeVisitor(object): + """ + This class parses python for statement, get transformed 3 statement components of for node + three key statements: + 1). init_stmts: list[node], prepare nodes of for loop, may not only one + 2). cond_stmt: node, condition node to judge whether continue loop + 3). body_stmts: list[node], updated loop body, sometimes we should change + the original statement in body, not just append new statement + + In this process, the semantics of for does not change. + + Now only can parse 3 type statements (Here var is VarBase(Tensor) or python variable): + 1). for x in range(var[*]|var.numpy()[*]) + 2). for x in var|var.numpy() + 3). for i, x enumerate(var|var.numpy()) + """ + + def __init__(self, for_node): + assert isinstance( + for_node, gast.For + ), "Input node for the initialization of ForNodeVisitor is not gast.For node." + # 1. original for node + self.node = for_node + + # 2. gast.For node main parts + self.target = for_node.target + # NOTE: type may be Node or list[Node] + self.iter_args = for_node.iter if self.is_for_iter( + ) else for_node.iter.args + self.body = for_node.body + + # 3. 
key shared node or names + # - x: + # - for x in range(***) + # - for x in var|var.numpy() + # - for i, x enumerate(var|var.numpy()) + self.iter_var_name = self._get_iter_var_name() + + # - created index var to slice Variable: __for_loop_var_index_0 + # - for x in var|var.numpy() + # - for i, x enumerate(var|var.numpy()) + self.iter_idx_name = unique_name.generate(FOR_ITER_INDEX_PREFIX) + + # - created shape var to build loop condition: __for_loop_var_len_0 + # - for x in var|var.numpy() + # - for i, x enumerate(var|var.numpy()) + # - for x in var + self.iter_var_len_name = unique_name.generate(FOR_ITER_VAR_LEN_PREFIX) + # - created zip to list var : __for_loop_iter_zip_0 + self.iter_zip_to_list_name = unique_name.generate( + FOR_ITER_ZIP_TO_LIST_PREFIX) + + # - var.numpy()/var + # - for x in var|var.numpy() + # - for i, x enumerate(var|var.numpy()) + self.iter_node = self._get_iter_node() + + # - enumeate i: + # - for i, x enumerate(var|var.numpy()) + self.enum_idx_name = self._get_enum_idx_name() + + # - range/enumerate args length + self.args_length = None + + def parse(self): + self._args_check() + if self.is_for_range_iter(): + return self._parse_for_range_stmts() + elif self.is_for_iter(): + return self._parse_for_stmts() + elif self.is_for_enumerate_iter(): + return self._parse_for_enumerate_stmts() + else: + return None + + def is_for_range_iter(self): + return isinstance(self.node.iter, gast.Call) and isinstance( + self.node.iter.func, + gast.Name) and self.node.iter.func.id == "range" + + def is_for_iter(self): + if isinstance(self.node.iter, + (gast.Name, gast.Attribute, gast.List, gast.Tuple)): + return True + elif isinstance(self.node.iter, gast.Call) and isinstance( + self.node.iter.func, + gast.Attribute) and self.node.iter.func.attr == 'numpy': + return True + elif isinstance(self.node.iter, gast.Subscript): + return True + else: + return False + + def is_for_enumerate_iter(self): + return isinstance(self.node.iter, gast.Call) and isinstance( + self.node.iter.func, + gast.Name) and self.node.iter.func.id == "enumerate" + + def _args_check(self): + if self.is_for_range_iter(): + self.args_length = len(self.iter_args) + assert self.args_length >= 1 and self.args_length <= 3, "range() function takes 1 to 3 arguments" + elif self.is_for_enumerate_iter(): + self.args_length = len(self.iter_args) + assert self.args_length >= 1 and self.args_length <= 2, "enumerate() function takes 1 to 2 arguments" + else: + self.args_length = None + + def _parse_for_range_stmts(self): + init_stmts = [] + init_stmts.append(self._build_index_init_node()) + + compare_node = self._build_compare_node() + step_node = self._build_step_node() + cond_stmt = self._build_cond_stmt(step_node, compare_node) + + body_stmts = self.body + body_stmts.append(self._build_index_increase_node(step_node)) + + return init_stmts, cond_stmt, body_stmts + + def _parse_for_stmts(self): + init_stmts = [] + init_stmts.extend(self._build_iter_node()) + init_stmts.append(self._build_index_init_node()) + init_stmts.append(self._build_var_len_assign_node()) + + compare_node = self._build_compare_node() + step_node = self._build_step_node() + cond_stmt = self._build_cond_stmt(step_node, compare_node) + + body_stmts = self.body + + # NOTE(liym27): Here add a gast.Assign, and the target of it is gast.Name. + # In NameNodeReplaceTransformer, using gast.Name to replace gast.Name is safe. 
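To make the init_stmts / cond_stmt / body_stmts decomposition concrete, this is roughly what ForNodeVisitor aims for on a range-based loop, written out by hand in plain Python (the generated names such as __for_loop_var_len_0 come from unique_name in the real pass; nothing below is the literal emitted code):

# original dygraph-style loop
total = 0
for i in range(2, 10, 2):
    total += i

# the same loop in init_stmts / cond_stmt / body_stmts form
total = 0
i = 2                # init_stmts: index initialised from the range() lower bound
while i < 10:        # cond_stmt: compare the index against the upper bound
    total += i       # body_stmts: the original body ...
    i += 2           # ... with the index increase appended at the end
assert total == sum(range(2, 10, 2))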
+ target_node, assign_node = self._build_assign_var_slice_node() + body_stmts[0:0] = [assign_node] + for body_node in body_stmts: + NameNodeReplaceTransformer(body_node, self.iter_var_name, + target_node) + body_stmts.append(self._build_index_increase_node(step_node)) + + return init_stmts, cond_stmt, body_stmts + + def _parse_for_enumerate_stmts(self): + init_stmts = [] + init_stmts.extend(self._build_iter_node()) + init_stmts.append(self._build_index_init_node()) + init_stmts.append(self._build_var_len_assign_node()) + init_stmts.append(self._build_enum_init_node()) + + compare_node = self._build_compare_node() + step_node = self._build_step_node() + cond_stmt = self._build_cond_stmt(step_node, compare_node) + + body_stmts = self.body + + target_node, assign_node = self._build_assign_var_slice_node() + body_stmts[0:0] = [assign_node] + for body_node in body_stmts: + NameNodeReplaceTransformer(body_node, self.iter_var_name, + target_node) + + body_stmts.append(self._build_index_increase_node(step_node)) + body_stmts.append(self._build_enum_increase_node()) + + return init_stmts, cond_stmt, body_stmts + + def _build_index_init_node(self): + if self.is_for_range_iter(): + if self.args_length == 1: + index_init_value_str = '0' + else: + index_init_value_str = ast_to_source_code( + self.iter_args[0]).strip() + + index_init_var_name = self.iter_var_name + else: + index_init_value_str = '0' + index_init_var_name = self.iter_idx_name + + index_init_node_source_str = "{target} = {value}".format( + target=index_init_var_name, value=index_init_value_str) + + index_init_node = gast.parse(index_init_node_source_str).body[0] + + return index_init_node + + def _build_var_len_assign_node(self): + # get the length of iterable variable + if isinstance(self.iter_node, gast.Call) and isinstance( + self.iter_node.func, + gast.Attribute) and self.iter_node.func.attr == 'numpy': + iter_var_name = ast_to_source_code( + self.iter_node.func.value).strip() + else: + iter_var_name = ast_to_source_code(self.iter_node).strip() + + convert_len_node_source_str = '{} = _jst.Len({})'.format( + self.iter_var_len_name, iter_var_name) + + convert_len_node = gast.parse(convert_len_node_source_str).body[0] + + return convert_len_node + + def _build_iter_node(self): + """ + Process special cases for iter_node inclue: + - Case 1 (for zip): + + - for i, val in enumerate(zip(x, y)) # original code: + + - __for_loop_iter_zip_0 = list(zip(x, y)) + - for i, val in enumerate(__for_loop_iter_zip_0) + """ + new_nodes = [] + if isinstance(self.iter_node, gast.Call) and isinstance( + self.iter_node.func, gast.Name): + if self.iter_node.func.id == 'zip': + iter_var_name = ast_to_source_code(self.iter_node).strip() + zip_to_list_str = "{target} = list({value})".format( + target=self.iter_zip_to_list_name, value=iter_var_name) + zip_to_list_node = gast.parse(zip_to_list_str).body[0] + new_nodes.append(zip_to_list_node) + + self.iter_node = gast.Name(id=self.iter_zip_to_list_name, + ctx=gast.Load(), + annotation=None, + type_comment=None) + + return new_nodes + + def _build_enum_init_node(self): + if self.is_for_enumerate_iter() and self.args_length != 1: + init_value_str = ast_to_source_code(self.iter_args[1]).strip() + else: + init_value_str = '0' + + enum_init_node_source_str = "{} = {}".format(self.enum_idx_name, + init_value_str) + enum_init_node = gast.parse(enum_init_node_source_str).body[0] + return enum_init_node + + def _build_compare_node(self): + if self.is_for_range_iter(): + compare_node = self.iter_args[ + 0] if 
self.args_length == 1 else self.iter_args[1] + else: + compare_node = gast.Name(id=self.iter_var_len_name, + ctx=gast.Load(), + annotation=None, + type_comment=None) + return compare_node + + def _build_step_node(self): + if self.is_for_range_iter(): + step_node = self.iter_args[ + 2] if self.args_length == 3 else gast.Constant(value=1, + kind=None) + else: + step_node = gast.Constant(value=1, kind=None) + return step_node + + def _build_cond_stmt(self, step_node, compare_node): + if not isinstance(step_node, (gast.Constant, gast.UnaryOp)): + raise NotImplementedError( + "Dynamic-to-Static only supports the step value is a constant or negative constant in 'for-range' statements, " + "such as '2', '-3'. But received: '{}'. Please fix code to be compatible with Dynamic-to-Static." + .format(ast_to_source_code(step_node).strip())) + + if isinstance(step_node, gast.UnaryOp) or step_node.value < 0: + # eg: + # range(max, min, -2) + # -> + # i > min + return gast.Compare(left=gast.Name( + id=self.iter_var_name + if self.is_for_range_iter() else self.iter_idx_name, + ctx=gast.Load(), + annotation=None, + type_comment=None), + ops=[gast.Gt()], + comparators=[compare_node]) + else: + # eg: + # range(min, max, 2) + # -> + # i < max + return gast.Compare(left=gast.Name( + id=self.iter_var_name + if self.is_for_range_iter() else self.iter_idx_name, + ctx=gast.Load(), + annotation=None, + type_comment=None), + ops=[gast.Lt()], + comparators=[compare_node]) + + def _build_index_increase_node(self, step_node): + return gast.AugAssign(target=gast.Name( + id=self.iter_var_name + if self.is_for_range_iter() else self.iter_idx_name, + ctx=gast.Store(), + annotation=None, + type_comment=None), + op=gast.Add(), + value=step_node) + + def _build_assign_var_slice_node(self): + var_slice_str = "{}[{}]".format( + ast_to_source_code(self.iter_node).strip(), self.iter_idx_name) + var_slice_node = gast.parse(var_slice_str).body[0].value + new_iter_var_name = unique_name.generate(FOR_ITER_VAR_NAME_PREFIX) + target_node, assign_node = create_assign_node(new_iter_var_name, + var_slice_node) + return target_node, assign_node + + def _build_enum_increase_node(self): + return gast.AugAssign(target=gast.Name(id=self.enum_idx_name, + ctx=gast.Store(), + annotation=None, + type_comment=None), + op=gast.Add(), + value=gast.Constant(value=1, kind=None)) + + def _get_iter_var_name(self): + if self.is_for_range_iter(): + return self.target.id + elif self.is_for_iter(): + return self.target.id + elif self.is_for_enumerate_iter(): + return self.target.elts[1].id + return None + + def _get_iter_node(self): + if self.is_for_iter(): + return self.iter_args + elif self.is_for_enumerate_iter(): + return self.iter_args[0] + return None + + def _get_enum_idx_name(self): + if self.is_for_enumerate_iter(): + return self.target.elts[0].id + return None diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py index acf2c3ec09b5d..2293071c7cd17 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py @@ -17,9 +17,10 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static import utils +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class BasicApiTransformer(gast.NodeTransformer): +class BasicApiTransformer(BaseTransformer): """ Class 
to transform basic API from dygraph to static graph. """ @@ -98,7 +99,7 @@ def _update_class_node_dict(self, node): return False -class ToTensorTransformer(gast.NodeTransformer): +class ToTensorTransformer(BaseTransformer): """ Class to transform paddle.to_tensor and paddle.to_variable to paddle.assign """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py index b85a2137dad81..b63fe6eea5af2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py @@ -18,9 +18,10 @@ from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list -from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor from paddle.fluid.dygraph.dygraph_to_static.utils import BaseNodeVisitor -from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node +from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_bool_node +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import ForNodeVisitor __all__ = ['BreakContinueTransformer'] @@ -28,7 +29,7 @@ CONTINUE_NAME_PREFIX = '__continue' -class ForToWhileTransformer(gast.NodeTransformer): +class ForToWhileTransformer(BaseTransformer): """ Transform python for loop into while loop and add condition node in the loop test @@ -140,7 +141,7 @@ def visit_Break(self, node): self._replace_if_stmt(loop_node_index, first_block_index, variable_name) # 4. For 'break' add break into condition of the loop. - assign_false_node = create_fill_constant_node(variable_name, False) + assign_false_node = create_bool_node(variable_name, False) self._add_stmt_before_cur_node(loop_node_index, assign_false_node) cond_var_node = gast.UnaryOp(op=gast.Not(), @@ -177,7 +178,7 @@ def visit_Continue(self, node): self._replace_if_stmt(loop_node_index, first_block_index, variable_name) # 4. 
For 'continue', set continue to False at the beginning of each loop - assign_false_node = create_fill_constant_node(variable_name, False) + assign_false_node = create_bool_node(variable_name, False) loop_node.body.insert(0, assign_false_node) def _remove_stmts_after_break_continue(self, break_continue_node, @@ -221,7 +222,7 @@ def _replace_break_continue_in_stmt_list(self, stmt_list, i = index_in_list(stmt_list, break_continue_node) if i == -1: return False - assign_true_node = create_fill_constant_node(break_continue_name, True) + assign_true_node = create_bool_node(break_continue_name, True) stmt_list[i:] = [assign_true_node] return True diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index b14977ced1db5..15b909f3d3d84 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -18,11 +18,12 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import is_paddle_api +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer PDB_SET = "pdb.set_trace" -class CallTransformer(gast.NodeTransformer): +class CallTransformer(BaseTransformer): """ This class transforms function calls into Static Graph Ast. """ @@ -39,7 +40,7 @@ def _no_need_convert_call(self, node): Determines whether a function needs to be transformed by `convert_call`. It doesn't need to be transformed when a function satisfies the following conditions: 1. It's a api of paddle - 2. It's a python builtin function not include `len` and `zip` + 2. It's a python builtin function not include `len`, `zip`, `range` and `enumerate` """ assert isinstance(node, gast.Call) if is_paddle_api(node): @@ -47,11 +48,16 @@ def _no_need_convert_call(self, node): func_str = ast_to_source_code(node.func).strip() try: - from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import is_builtin_len, is_builtin, is_builtin_zip + from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import is_builtin + need_convert_builtin_func_list = { + 'len', + 'zip', + 'range', + 'enumerate', + } is_builtin = eval("is_builtin({})".format(func_str)) - is_builtin_len = eval("is_builtin_len({})".format(func_str)) - is_builtin_zip = eval("is_builtin_zip({})".format(func_str)) - return is_builtin and not is_builtin_len and not is_builtin_zip + need_convert = func_str in need_convert_builtin_func_list + return is_builtin and not need_convert except Exception: return False diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py index 3b2d9be99ff00..a297d5cf56ed1 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py @@ -17,9 +17,10 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class CastTransformer(gast.NodeTransformer): +class CastTransformer(BaseTransformer): """ This class transforms type casting into Static Graph Ast. 
""" diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index e660a64ab363c..5bb75bda8de97 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -28,6 +28,7 @@ from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len, convert_zip +from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_range, convert_enumerate from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static @@ -64,25 +65,22 @@ def __init__(self, not_convert=False): self.not_convert = not_convert -def is_builtin(func): - if isinstance(func, types.BuiltinFunctionType): +def is_builtin(func, name=None): + """ predict whether a function is a builtin function with name={name}. + if name == None, then any builtin function will return True + """ + + def name_judge(): + return name is None or func.__name__ == name + + if isinstance(func, types.BuiltinFunctionType) and name_judge(): return True - elif func in six.moves.builtins.__dict__.values(): + elif func in six.moves.builtins.__dict__.values() and name_judge(): return True else: return False -def is_builtin_len(func): - if isinstance(func, types.BuiltinFunctionType) and func.__name__ == 'len': - return True - return False - - -def is_builtin_zip(func): - return is_builtin(func) and func.__name__ == 'zip' - - def is_unsupported(func): """ Checks whether the func is supported by dygraph to static graph. @@ -165,12 +163,18 @@ def dyfunc(x): .format(func)) return func - if is_builtin_len(func): + if is_builtin(func, "len"): return convert_len - if is_builtin_zip(func): + if is_builtin(func, "zip"): return convert_zip + if is_builtin(func, "range"): + return convert_range + + if is_builtin(func, "enumerate"): + return convert_enumerate + if is_builtin(func) or is_unsupported(func): return func diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index cbb4655f354a5..e0b46fe2341a3 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -12,19 +12,61 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import re +import paddle from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable from paddle.fluid.framework import core, Variable from paddle.fluid.layers import Assert, Print +from paddle.fluid.layers import range as paddle_range from paddle.fluid.layers import array_length, array_read, array_write, create_array from paddle.fluid.layers import assign, fill_constant, slice, reduce_all, reduce_any from paddle.fluid.layers import cast, control_flow, logical_and, logical_not, logical_or, nn from paddle.fluid.layers.control_flow import cond, while_loop, less_than, increment from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_VAR_NAME -from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar +from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, Dygraph2StaticException -def convert_while_loop(cond, body, loop_vars): +def indexable(x, code=None): + if isinstance(x, Variable): return x + if hasattr(x, '__len__') and hasattr(x, '__getitem__'): return x + if hasattr(x, '__iter__'): + return [i for i in x] + else: + raise RuntimeError("X can't be convert into indexable.") + + +def unpack_by_structure(target, structure): + """ unified unpack interface for paddle and python. + """ + if isinstance(target, Variable): + return _unpack_by_structure_paddle(target, structure) + else: + return _unpack_by_structure_python(target, structure) + + +def _unpack_by_structure_python(target, structure): + """ TODO(xiongkun): analysis the differences between python and paddle unpack. + """ + return _unpack_by_structure_paddle(target, structure) + + +def _unpack_by_structure_paddle(target, structure): + if structure == 1: + return target + ret = [] + for idx, ele in enumerate(structure): + if ele == 1: + ret.append(target[idx]) + continue + if isinstance(ele, list): + ret.append(unpack_by_structure(target[idx], ele)) + continue + assert False, "structure element must be 1 or list" + return ret + + +def convert_while_loop(cond, body, getter, setter): """ A function representation of a Python ``while`` statement. @@ -39,26 +81,47 @@ def convert_while_loop(cond, body, loop_vars): # NOTE: It may be slower if cond is very expensive, but usually cond is just O(1). # If loop_vars is changed during cond callable, then it causes bug, but current logical_and/logical_not/... doesn't change the loop_vars. - pred = cond(*loop_vars) + pred = cond() if isinstance(pred, Variable): - loop_vars = _run_paddle_while_loop(cond, body, loop_vars) + _run_paddle_while(cond, body, getter, setter) else: - loop_vars = _run_py_while(cond, body, loop_vars) - - return loop_vars + _run_py_while(cond, body, getter, setter) -def _run_paddle_while_loop(cond, body, loop_vars): +def _run_paddle_while(cond, body, getter, setter): # NOTE: loop_vars of Paddle op `control_flow.while_loop` must be Paddle Tensors. - loop_vars = [to_static_variable(var) for var in loop_vars] - loop_vars = control_flow.while_loop(cond, body, loop_vars) + def new_body_fn(*args): + """ wrap the body() and add return value for `while_loop` + """ + body() + return getter() + + def new_cond_fn(*args): + """ cond is a zero-args function, which is not + compatible with `while_loop`. + """ + return cond() + + # UndefinedVar will become data layer not check variable with value=NO_VALUE_MAGIC. 
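The structure argument passed to _jst.Unpack mirrors the nesting of the original loop target, with 1 marking a plain name, so for example `for a, (b, c) in ...` uses the structure [1, [1, 1]]. Below is a stand-alone re-implementation of the plain-Python unpacking path, for illustration only (the real helper additionally special-cases paddle Variables):

def unpack_by_structure(target, structure):
    # 1 means "take this element as-is"; a nested list means "recurse"
    if structure == 1:
        return target
    ret = []
    for idx, ele in enumerate(structure):
        if ele == 1:
            ret.append(target[idx])
        elif isinstance(ele, list):
            ret.append(unpack_by_structure(target[idx], ele))
        else:
            raise AssertionError("structure element must be 1 or list")
    return ret

# equivalent to: a, (b, c) = (10, (20, 30))
a, (b, c) = unpack_by_structure((10, (20, 30)), [1, [1, 1]])
assert (a, b, c) == (10, 20, 30)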
+ loop_vars = [ + to_static_variable(var) if not isinstance(var, UndefinedVar) else var + for var in getter() + ] + setter(loop_vars) # change the non-local var to variable + # variable maybe modified to inner var. change it into + loop_vars = control_flow.while_loop(new_cond_fn, new_body_fn, loop_vars) + setter(loop_vars) # change the non-local var to variable return loop_vars -def _run_py_while(cond, body, loop_vars): - while cond(*loop_vars): - loop_vars = body(*loop_vars) - return loop_vars +def _run_py_while(cond, body, getter, setter): + while True: + pred = cond() + if isinstance(pred, Variable): + raise Dygraph2StaticException( + "python while pred change from bool to variable.") + if not pred: break + body() def convert_logical_and(x_func, y_func): @@ -225,17 +288,32 @@ def _run_paddle_cond(pred, true_fn, false_fn, get_args, set_args, def new_true_fn(): set_args(init_args) - outs = true_fn() - _check_no_undefined_var(outs, return_name_ids, 'if_body') - return outs + ret = true_fn() + # IfExpr will return a non-None return value, so we just return ret. + # We assume normal return has no return value. + if ret is None: return get_args() + else: return ret def new_false_fn(): set_args(init_args) - outs = false_fn() - _check_no_undefined_var(outs, return_name_ids, 'else_body') - return outs - - cond_outs = control_flow.cond(pred, new_true_fn, new_false_fn) + ret = false_fn() + if ret is None: return get_args() + else: return ret + + try: + cond_outs = control_flow.cond(pred, new_true_fn, new_false_fn, None, + return_name_ids) + except Exception as e: + if re.search("Unsupported return type of true_fn and false_fn in cond", + str(e)): + raise Dygraph2StaticException( + "Your if/else have different return type. TODO: add link to modifty. {}" + .format(str(e))) + if re.search("Incompatible return values of", str(e)): + raise Dygraph2StaticException( + "Your if/else have different number of return value. TODO: add link to modifty. {}" + .format(str(e))) + raise e return _recover_args_state(cond_outs, get_args, set_args, return_name_ids) @@ -245,8 +323,7 @@ def _run_py_ifelse(pred, true_fn, false_fn, get_args, set_args, Evaluate python original branch function if-else. """ py_outs = true_fn() if pred else false_fn() - py_outs = _remove_no_value_return_var(py_outs) - return _recover_args_state(py_outs, get_args, set_args, return_name_ids) + return py_outs def _remove_no_value_return_var(out): @@ -307,13 +384,14 @@ def _recover_args_state(outs, get_args, set_args, return_name_ids): init_args = get_args() # recover args state num_outs = len(return_name_ids) - num_args = 1 if not isinstance(init_args, tuple) else len(init_args) + num_args = len(init_args) assert num_outs <= num_args if num_args == 1: - final_outs = outs + final_outs = (outs, ) if not isinstance(outs, + (list, tuple)) else tuple(outs) else: - outs = (outs, ) if num_outs == 1 else outs + outs = (outs, ) if num_outs == 1 else tuple(outs) final_outs = outs + init_args[num_outs:] set_args(final_outs) @@ -344,6 +422,8 @@ def convert_len(var): 'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received %s.' % type(var)) else: + if isinstance(var, VariableTuple): + return var.__len__() return len(var) @@ -356,6 +436,44 @@ def convert_zip(*args): return zip(*args) +# TODO(xiongkun): delete when list is ready. +class VariableTuple: + """ + this class will cause enumerate can't be wrapped by other iterator change function. + this will be fixed when list is producted. 
+ VariableTuple can only deal with variables which is fixed. + """ + + def __init__(self, var, start=0): + self.var = var + self.len = convert_len(var) + self.rag = paddle_range(start, start + self.len, 1, paddle.int64) + + def __getitem__(self, idx): + return self.rag[idx], self.var[idx] + + def __len__(self): + return self.len + + +def convert_enumerate(*args): + has_variable = any(map(lambda x: isinstance(x, Variable), args)) + if has_variable: + return VariableTuple(*args) + return enumerate(*args) + + +def convert_range(*args): + has_variable = any(map(lambda x: isinstance(x, Variable), args)) + if has_variable: + if len(args) == 1: return paddle_range(0, args[0], 1, paddle.int64) + if len(args) == 2: + return paddle_range(args[0], args[1], 1, paddle.int64) + if len(args) == 3: + return paddle_range(args[0], args[1], args[2], paddle.int64) + return range(*args) + + def convert_shape(x): """ A function representation of the shape of variable. diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py new file mode 100644 index 0000000000000..8ae4c12eb8eaf --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static.utils import FunctionNameLivenessAnalysis +from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer + + +class CreateVariableTransformer(BaseTransformer): + """ + """ + + def __init__(self, wrapper_root): + assert isinstance( + wrapper_root, AstNodeWrapper + ), "Type of input node should be AstNodeWrapper, but received %s ." % type( + wrapper_root) + self.root = wrapper_root.node + FunctionNameLivenessAnalysis(self.root) + + def transform(self): + """ + Main function to transform AST. + """ + self.visit(self.root) + + def visit_FunctionDef(self, node): + #attributes = set(filter(lambda x: '.' 
in x, node.pd_scope.modified_vars())) + bodys = node.body + names = sorted(node.pd_scope.created_vars()) + for name in names: + bodys[0:0] = [create_undefined_var(name)] + return node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py index bef1efb0427cf..9cf82b020994e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py @@ -16,9 +16,10 @@ from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class EarlyReturnTransformer(gast.NodeTransformer): +class EarlyReturnTransformer(BaseTransformer): """ Transform if/else return statement of Dygraph into Static Graph. """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py index c422c5269e75d..3b868ade4e29b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py @@ -274,19 +274,25 @@ def _simplify_error_value(self): bottom_error_message = error_value_lines[empty_line_idx + 1:] revise_suggestion = self._create_revise_suggestion(bottom_error_message) - user_filepath = '' error_traceback = [] user_code_traceback_index = [] pattern = 'File "(?P.+)", line (?P.+), in (?P.+)' + + # Distinguish user code and framework code using static_info_map + static_info_map = {} + for k, v in self.origin_info_map.items(): + origin_filepath = v.location.filepath + origin_lineno = v.location.lineno + static_info_map[(origin_filepath, origin_lineno)] = k + for i in range(0, len(error_value_lines_strip), 2): if error_value_lines_strip[i].startswith("File "): re_result = re.search(pattern, error_value_lines_strip[i]) tmp_filepath, lineno_str, function_name = re_result.groups() code = error_value_lines_strip[ i + 1] if i + 1 < len(error_value_lines_strip) else '' - if i == 0: - user_filepath = tmp_filepath - if tmp_filepath == user_filepath: + + if static_info_map.get((tmp_filepath, int(lineno_str))): user_code_traceback_index.append(len(error_traceback)) error_traceback.append( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py index d8d8d0bc043dd..09125623e16a5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py @@ -19,9 +19,10 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static import utils +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class GradTransformer(gast.NodeTransformer): +class GradTransformer(BaseTransformer): """ A class transforms dygraph paddle.grad to static graph paddle.gradients. The transformation is applied to support double grad mode. 
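Looking back at the CreateVariableTransformer added above: it only prepends placeholders, and the liveness information itself comes from FunctionNameLivenessAnalysis via node.pd_scope.created_vars(). A toy, stdlib-only analogue of the hoisting step, just to show the shape of the rewrite (it walks assignments directly and uses None instead of the UndefinedVar placeholder emitted by create_undefined_var):

import ast

class HoistCreatedVars(ast.NodeTransformer):
    # toy analogue: prepend a placeholder assignment for every name the
    # function creates, so branch-local variables exist up front
    def visit_FunctionDef(self, node):
        created = sorted({t.id
                          for stmt in ast.walk(node)
                          if isinstance(stmt, ast.Assign)
                          for t in stmt.targets
                          if isinstance(t, ast.Name)})
        node.body[0:0] = [ast.parse(f"{name} = None").body[0] for name in created]
        return node

src = "def foo(x):\n    if x > 0:\n        y = x * 2\n    return y\n"
tree = ast.parse(src)
HoistCreatedVars().visit(tree)
ast.fix_missing_locations(tree)
print(ast.unparse(tree))
# def foo(x):
#     y = None
#     if x > 0:
#         y = x * 2
#     return y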
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 1935629f54e86..07d4920d43344 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -27,11 +27,14 @@ from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import create_funcDef_node, ast_to_source_code -from paddle.fluid.dygraph.dygraph_to_static.utils import create_assign_node +from paddle.fluid.dygraph.dygraph_to_static.utils import create_assign_node, FunctionNameLivenessAnalysis from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var -from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_nonlocal_stmt_node +from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_nodes +from paddle.fluid.dygraph.dygraph_to_static.utils import create_get_args_node, create_set_args_node +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_INDEX_PREFIX, FOR_ITER_TUPLE_PREFIX, FOR_ITER_TUPLE_INDEX_PREFIX, FOR_ITER_VAR_LEN_PREFIX, FOR_ITER_VAR_NAME_PREFIX, FOR_ITER_ZIP_TO_LIST_PREFIX, FOR_ITER_TARGET_PREFIX, FOR_ITER_ITERATOR_PREFIX TRUE_FUNC_PREFIX = 'true_fn' FALSE_FUNC_PREFIX = 'false_fn' @@ -40,7 +43,7 @@ ARGS_NAME = '__args' -class IfElseTransformer(gast.NodeTransformer): +class IfElseTransformer(BaseTransformer): """ Transform if/else statement of Dygraph into Static Graph. """ @@ -51,7 +54,8 @@ def __init__(self, wrapper_root): ), "Type of input node should be AstNodeWrapper, but received %s ." % type( wrapper_root) self.root = wrapper_root.node - self.static_analysis_visitor = StaticAnalysisVisitor(self.root) + FunctionNameLivenessAnalysis( + self.root) # name analysis of current ast tree. def transform(self): """ @@ -271,188 +275,6 @@ def _update_name_ids(self, new_name_ids): self.name_ids[name_id] = ctxs + self.name_ids[name_id] -def get_name_ids(nodes, after_node=None, end_node=None): - """ - Return all ast.Name.id of python variable in nodes range from - (after_node, end_node) exclusively. If after_node or end_node is None, the - range is unlimited. - """ - name_visitor = NameVisitor(after_node, end_node) - for node in nodes: - name_visitor.visit(node) - return name_visitor.name_ids - - -def parse_cond_args(parent_ids, - var_ids_dict, - modified_ids_dict=None, - ctx=gast.Load): - """ - Find out the ast.Name.id list of input by analyzing node's AST information. - """ - - # 1. filter the var fit the ctx - arg_name_ids = [ - var_id for var_id, var_ctx in six.iteritems(var_ids_dict) - if isinstance(var_ctx[0], ctx) - ] - - # 2. args should contain modified var ids in if-body or else-body - # case: - # - # ``` - # if b < 1: - # z = y - # else: - # z = x - # ``` - # - # In the above case, `z` should be in the args of cond() - if modified_ids_dict: - arg_name_ids = set(arg_name_ids) | set(modified_ids_dict) - - # 3. 
args should not contain the vars not in parent ids - # case : - # - # ``` - # x = 1 - # if x > y: - # z = [v for v in range(i)] - # ``` - # - # In the above case, `v` should not be in the args of cond() - arg_name_ids = set(arg_name_ids) & set(parent_ids) - - return arg_name_ids - - -def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, - after_ifelse_vars_dict): - """ - Find out the ast.Name list of output by analyzing node's AST information. - One of the following conditions should be satisfied while determining whether a variable is a return value: - 1. the var in parent scope is modified in If.body or If.orelse node. - 2. new var is both created in If.body and If.orelse node. - 3. new var is created only in one of If.body or If.orelse node, and it used as gast.Load firstly after gast.If node. - - For example: - x, y = 5, 10 - if x > 4: - x = x+1 - z = x*x - q = 10 - else: - y = y - 1 - z = y*y - m = 20 - n = 20 - - print(q) - n = 30 - print(n) - - - The return_ids are (x, y, z, q) for `If.body` and `If.orelse`node, because - 1. x is modified in If.body node, - 2. y is modified in If.body node, - 3. z is both created in If.body and If.orelse node, - 4. q is created only in If.body, and it is used by `print(q)` as gast.Load. - Note: - After transformed, q and z are created in parent scope. For example, - - x, y = 5, 10 - q = paddle.jit.dy2static.UndefindVar('q') - z = paddle.jit.dy2static.UndefindVar('z') - - def true_func(x, y, q): - x = x+1 - z = x*x - q = 10 - return x,y,z,q - - def false_func(x, y, q): - y = y - 1 - z = y*y - m = 20 - n = 20 - return x,y,z,q - - x,y,z,q = fluid.layers.cond(x>4, lambda: true_func(x, y), lambda: false_func(x, y, q)) - - m and n are not in return_ids, because - 5. m is created only in If.orelse, but it is not used after gast.If node. - 6. n is created only in If.orelse, and it is used by `n = 30` and `print(n)`, but it is not used as gast.Load firstly but gast.Store . - - """ - - def _is_return_var(ctxs): - for ctx in ctxs: - if isinstance(ctx, (gast.Store, gast.Param)): - return True - return False - - def _vars_with_store(ids_dict): - vars = [] - for k, ctxs in six.iteritems(ids_dict): - if _is_return_var(ctxs): - vars.append(k) - return vars - - def _modified_vars(child_dict, parent_dict): - return set( - [var for var in _vars_with_store(child_dict) if var in parent_dict]) - - def _vars_loaded(ids_dict): - """ - gast.Param is also a kind of `load` semantic. - """ - new_dict = defaultdict(list) - for k, ctxs in six.iteritems(ids_dict): - for ctx in ctxs: - if isinstance(ctx, (gast.Load, gast.Param)): - new_dict[k].append(ctx) - return new_dict - - # modified vars - body_modified_vars = _modified_vars(if_vars_dict, parent_vars_dict) - orelse_modified_vars = _modified_vars(else_vars_dict, parent_vars_dict) - modified_vars = body_modified_vars | orelse_modified_vars - - # new vars - body_new_vars = set([ - var for var in _vars_with_store(if_vars_dict) - if var not in parent_vars_dict - ]) - orelse_new_vars = set([ - var for var in _vars_with_store(else_vars_dict) - if var not in parent_vars_dict - ]) - new_vars_in_body_or_orelse = body_new_vars | orelse_new_vars - new_vars_in_one_of_body_or_orelse = body_new_vars ^ orelse_new_vars - - # 1. the var in parent scope is modified in If.body or If.orelse node. - modified_vars_from_parent = modified_vars - new_vars_in_body_or_orelse - - # 2. new var is both created in If.body and If.orelse node. - new_vars_in_body_and_orelse = body_new_vars & orelse_new_vars - - # 3. 
new var is created only in one of If.body or If.orelse node, and it used as gast.Load firstly after gast.If node. - # TODO(zhhsplendid): the _vars_loaded can be optimized as _vars_loaded_before_store. Because if a variable is stored before load, - # the value would change by the store statement, we don't have to return to change the value. However, analysis is - # complex because if the IfElse is nested and outer IfElse store statement may not run at all. We will put this optimization - # as the future TODO - used_vars_after_ifelse = set( - [var for var in _vars_loaded(after_ifelse_vars_dict)]) - new_vars_to_create = new_vars_in_one_of_body_or_orelse & used_vars_after_ifelse | new_vars_in_body_and_orelse - - # 4. generate return_ids of if/else node. - return_ids = list(modified_vars_from_parent | new_vars_in_body_and_orelse - | new_vars_to_create) - return_ids.sort() - - return return_ids, modified_vars_from_parent, new_vars_to_create - - def _valid_nonlocal_names(return_name_ids, nonlocal_names): """ All var in return_name_ids should be in nonlocal_names. @@ -483,15 +305,7 @@ def transform_if_else(node, root): """ # TODO(liym27): Consider variable like `self.a` modified in if/else node. - parent_name_ids = get_name_ids([root], end_node=node) - body_name_ids = get_name_ids(node.body) - orelse_name_ids = get_name_ids(node.orelse) - # Get after_ifelse_name_ids, which means used var names after If.body and If.orelse node. - after_ifelse_name_ids = get_name_ids([root], after_node=node) - - return_name_ids, modified_name_ids_from_parent, new_vars_to_create = parse_cond_return( - parent_name_ids, body_name_ids, orelse_name_ids, after_ifelse_name_ids) - + return_name_ids = sorted(list(node.pd_scope.modified_vars())) # NOTE: Python can create variable only in if body or only in else body, and use it out of if/else. # E.g. # @@ -501,31 +315,30 @@ def transform_if_else(node, root): # # Create static variable for those variables create_new_vars_in_parent_stmts = [] - for name in new_vars_to_create: - # NOTE: Consider variable like `self.a` modified in if/else node. - if "." not in name: - create_new_vars_in_parent_stmts.append(create_undefined_var(name)) - - parent_ids_set = set() - for k, ctxs in parent_name_ids.items(): - if any([not isinstance(ctx, gast.Load) for ctx in ctxs]): - parent_ids_set.add(k) - - trun_args = parse_cond_args(parent_ids_set, body_name_ids, - modified_name_ids_from_parent) - false_args = parse_cond_args(parent_ids_set, orelse_name_ids, - modified_name_ids_from_parent) - nonlocal_names = list(trun_args | false_args | new_vars_to_create) + + nonlocal_names = list(return_name_ids) nonlocal_names.sort() # NOTE: All var in return_name_ids should be in nonlocal_names. nonlocal_names = _valid_nonlocal_names(return_name_ids, nonlocal_names) # TODO(dev): Need a better way to deal this. - if ARGS_NAME in nonlocal_names: - nonlocal_names.remove(ARGS_NAME) + # LoopTransformer will create some special vars, which is not visiable by users. so we can sure it's safe to remove them. 
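Because the liveness analysis now reports every modified name, the list has to be cleaned of variables that LoopTransformer generates for its own bookkeeping, and the filter is a plain prefix check. A small stand-alone sketch of that filtering (the real prefix constants live in dygraph_to_static.utils; the strings below are only illustrative, taken from generated names such as __for_loop_var_index_0 mentioned earlier in this change):

# illustrative stand-ins for ARGS_NAME and the FOR_ITER_*_PREFIX constants
FILTER_PREFIXES = ("__args", "__for_loop_var_index", "__for_loop_var_len",
                   "__for_loop_iter_zip")

def keep(name):
    # keep user-visible names, drop transformer-generated ones
    return not any(name.startswith(prefix) for prefix in FILTER_PREFIXES)

names = ["x", "__for_loop_var_index_0", "y", "__args_1"]
assert list(filter(keep, names)) == ["x", "y"]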
+ filter_names = [ + ARGS_NAME, FOR_ITER_INDEX_PREFIX, FOR_ITER_TUPLE_PREFIX, + FOR_ITER_TARGET_PREFIX, FOR_ITER_ITERATOR_PREFIX, + FOR_ITER_TUPLE_INDEX_PREFIX, FOR_ITER_VAR_LEN_PREFIX, + FOR_ITER_VAR_NAME_PREFIX, FOR_ITER_ZIP_TO_LIST_PREFIX + ] + + def remove_if(x): + for name in filter_names: + if x.startswith(name): return False + return True - nonlocal_stmt_node = [create_nonlocal_stmt_node(nonlocal_names) - ] if nonlocal_names else [] + nonlocal_names = list(filter(remove_if, nonlocal_names)) + return_name_ids = nonlocal_names + + nonlocal_stmt_node = create_nonlocal_stmt_nodes(nonlocal_names) empty_arg_node = gast.arguments(args=[], posonlyargs=[], @@ -539,12 +352,12 @@ def transform_if_else(node, root): nonlocal_stmt_node + node.body, name=unique_name.generate(TRUE_FUNC_PREFIX), input_args=empty_arg_node, - return_name_ids=return_name_ids) + return_name_ids=[]) false_func_node = create_funcDef_node( nonlocal_stmt_node + node.orelse, name=unique_name.generate(FALSE_FUNC_PREFIX), input_args=empty_arg_node, - return_name_ids=return_name_ids) + return_name_ids=[]) get_args_node = create_get_args_node(nonlocal_names) set_args_node = create_set_args_node(nonlocal_names) @@ -552,70 +365,6 @@ def transform_if_else(node, root): return create_new_vars_in_parent_stmts, true_func_node, false_func_node, get_args_node, set_args_node, return_name_ids -def create_get_args_node(names): - """ - Create get_args function as follows: - - def get_args_0(): - nonlocal x, y - return x, y - """ - - def empty_node(): - func_def = """ - def {func_name}(): - return - """.format(func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX)) - return gast.parse(textwrap.dedent(func_def)).body[0] - - assert isinstance(names, (list, tuple)) - if not names: - return empty_node() - - template = """ - def {func_name}(): - nonlocal {vars} - return {vars} - """ - func_def = template.format( - func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX), - vars=",".join(names)) - return gast.parse(textwrap.dedent(func_def)).body[0] - - -def create_set_args_node(names): - """ - Create set_args function as follows: - - def set_args_0(__args): - nonlocal x, y - x, y = __args - """ - - def empty_node(): - func_def = """ - def {func_name}({args}): - pass - """.format(func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), - args=ARGS_NAME) - return gast.parse(textwrap.dedent(func_def)).body[0] - - assert isinstance(names, (list, tuple)) - if not names: - return empty_node() - - template = """ - def {func_name}({args}): - nonlocal {vars} - {vars} = {args} - """ - func_def = template.format( - func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), - args=ARGS_NAME, - vars=",".join(names)) - return gast.parse(textwrap.dedent(func_def)).body[0] - - def create_convert_ifelse_node(return_name_ids, pred, true_func, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py index 48fa9906828c0..29e3ed5296806 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py @@ -21,11 +21,11 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import slice_is_num from paddle.fluid.dygraph.dygraph_to_static.utils import is_control_flow_to_transform +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import 
SplitAssignTransformer -from paddle.fluid.dygraph.dygraph_to_static.utils import SplitAssignTransformer - -class ListTransformer(gast.NodeTransformer): +class ListTransformer(BaseTransformer): """ This class transforms python list used in control flow into Static Graph Ast. """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py index 80f5bffe46d1b..3e9a56b0e74dd 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py @@ -16,6 +16,7 @@ from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer cmpop_type_to_str = { gast.Eq: "==", @@ -35,7 +36,7 @@ def cmpop_node_to_str(node): return cmpop_type_to_str[type(node)] -class LogicalTransformer(gast.NodeTransformer): +class LogicalTransformer(BaseTransformer): """ Transform python boolean op into Paddle logical op. diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 832c502c0aa5c..099f669748035 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -25,10 +25,14 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import generate_name_node from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name -from paddle.fluid.dygraph.dygraph_to_static.utils import ForLoopTuplePreTransformer -from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor -from paddle.fluid.dygraph.dygraph_to_static.utils import RenameTransformer -from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node +from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var +from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_nodes, create_get_args_node, create_set_args_node +from paddle.fluid.dygraph.dygraph_to_static.utils import FunctionNameLivenessAnalysis +from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import ARGS_NAME +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import RenameTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import ForLoopTuplePreTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import ForNodeVisitor __all__ = ['LoopTransformer', 'NameVisitor'] @@ -37,12 +41,10 @@ FOR_CONDITION_PREFIX = 'for_loop_condition' FOR_BODY_PREFIX = 'for_loop_body' -GENERATE_VARIABLE_PREFIX = 'generate_variable' -ATTRIBUTE_VARIABLE_PREFIX = '__attribute_variable' - -def create_while_nodes(condition_name, body_name, loop_var_names): +def create_while_nodes(condition_name, body_name, loop_var_names, getter_name, + setter_name): """ Returns a list of gast.Node which represents the calling of Paddle controlflow while_loop. @@ -74,133 +76,23 @@ def create_while_nodes(condition_name, body_name, loop_var_names): # # For example: loop_var_names = [a, b, foo.x], the type of `a` or `b` is gast.Name, # but the type of `foo.x` gast.Attribute. 
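With the new signature, create_while_nodes no longer threads loop_var_names through the generated call; the emitted while call receives getter and setter names and the loop state travels through closures, matching the convert_while_loop(cond, body, getter, setter) convention introduced earlier in this change. A pure-Python sketch of that calling convention (convert_while below is a simplified stand-in for the non-tensor branch; the setter is shown for completeness even though this fallback path only needs the getter):

def convert_while(cond, body, getter, setter):
    # python fallback: keep running body() while cond() holds;
    # loop state lives in the enclosing scope and travels via nonlocal
    while cond():
        body()
    return getter()

def countdown_sum(n):
    i, total = n, 0

    def cond():
        return i > 0

    def body():
        nonlocal i, total
        total += i
        i -= 1

    def getter():
        return (i, total)

    def setter(values):
        nonlocal i, total
        i, total = values

    return convert_while(cond, body, getter, setter)

assert countdown_sum(3) == (0, 6)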
- - unique_name_to_origin = {} # We have to make loop_var_names and assign_loop_var_names with same order # set doesn't have order so we convert it to list loop_var_names = list(loop_var_names) assign_loop_var_names = [] for name in (loop_var_names): - if "." in name: - # name is an attribute variable such as foo.x - tmp_attr_name = unique_name.generate(ATTRIBUTE_VARIABLE_PREFIX) - unique_name_to_origin[tmp_attr_name] = name - assign_loop_var_names.append(tmp_attr_name) - else: - assign_loop_var_names.append(name) + assign_loop_var_names.append(name) while_func_name = "_jst.While" - while_node_str = "[{}] = {}({}, {}, [{}])".format( - ",".join(assign_loop_var_names), while_func_name, condition_name, - body_name, ",".join(loop_var_names)) + while_node_str = "{}({}, {}, {}, {})".format(while_func_name, + condition_name, body_name, + getter_name, setter_name) while_node = gast.parse(while_node_str).body[0] ret = [while_node] - for tmp_attr_name in unique_name_to_origin: - origin_attr_var = unique_name_to_origin[tmp_attr_name] - dot_pos = origin_attr_var.rindex(".") - obj_name = origin_attr_var[0:dot_pos] - attr_name = origin_attr_var[dot_pos + 1:] - assign_if_not_prop_str = "if not isinstance(getattr(type({}), '{}', None), property): {} = {}".format( - obj_name, attr_name, origin_attr_var, tmp_attr_name) - assign_if_not_prop_node = gast.parse(assign_if_not_prop_str).body[0] - ret.append(assign_if_not_prop_node) return ret -class NameScope: - - def __init__(self): - """ we don't analyze the read only variable - because they keep the same in control flow. - """ - self.globals = set() - self.nonlocals = set() - self.args = set() - self.w_vars = set() # all vars been stored, - # may be globals or non-locals - def created_vars(self): - return self.w_vars - self.globals - self.nonlocals - self.args - - def write_vars(self): - return self.w_vars - - def global_vars(self): - return self.globals - - -class FunctionNameLivenessAnalysis(gast.NodeVisitor): - """ analyze the liveness of a function. - - every variables stored in this scope will be collected, - in addition with global/nonlocal information. - - 1. global variable is stored in node.var_globals. - 2. nonlocal variable is stored in node.var_nonlocals. - 3. arguments is stored in node.var_args. - - For example: - - def func(*args, **kargs): - a = 12 - global i,j - nonlocal x,y - print(a) - i = k - for m in range(10): - q = 12 - - After this visitor we have: - # node is the FunctionDef node with name: "func" - node.pd_scope = NameScope( - globals = ['i', 'j'], - nonlocals = ['x', 'y'], - args = ['args', 'kargs'], - wr_vars = ['a', 'i', 'q', 'm'] - ) - """ - - def __init__(self, root_node): - self.funcdef_stack = [] - self.visit(root_node) - - def _current_funcdef_scope(self): - return self.funcdef_stack[-1].pd_scope - - def visit_Name(self, node): - self.generic_visit(node) - write_context = (gast.Store, gast.AugStore, gast.Del) - if isinstance(node.ctx, write_context): - self._current_funcdef_scope().w_vars.add(node.id) - - def visit_FunctionDef(self, node): - setattr(node, 'pd_scope', NameScope()) - self.funcdef_stack.append(node) - self._current_funcdef_scope().args |= set( - self._get_argument_names(node)) - self.generic_visit(node) - self.funcdef_stack.pop() - - def visit_Global(self, node): - self._current_funcdef_scope().globals |= set(node.names) - - def visit_Nonlocal(self, node): - self._current_funcdef_scope().nonlocals |= set(node.names) - - def _get_argument_names(self, node): - """ get all arguments name in the functiondef node. 
- this node is local to the function and shouldn't - be created. - """ - assert isinstance( - node, gast.FunctionDef), "Input node is not function define node" - names = [a for a in node.args.args] - names.append(node.args.vararg) - names.append(node.args.kwarg) - names = [i.id for i in names if i is not None] - return names - - class NameVisitor(gast.NodeVisitor): ''' Analysis name liveness for loop transformer @@ -282,9 +174,7 @@ def get_loop_var_names(self, node): # If this var is a basic variable and read-only and not # condition var, it may not be loop_var else it should # be in loop_var as input - if (not name in condition_names) and ( - not name in write_names - ) and self._node_var_type_is_basic(name_to_type[name]): + if (not name in condition_names) and (not name in write_names): continue loop_var_names.add(name) @@ -582,7 +472,7 @@ def filter_name_nodes_from(root_node, target_var_names): return loop_vars - removed_vars -class LoopTransformer(gast.NodeTransformer): +class LoopTransformer(BaseTransformer): """ This class transforms python while/for statement into Static Graph Ast """ @@ -593,20 +483,21 @@ def __init__(self, wrapper_root): ), "Input non-AstNodeWrapper node for the initialization of LoopTransformer." self.wrapper_root = wrapper_root self.root = wrapper_root.node + FunctionNameLivenessAnalysis(self.root) def transform(self): ForLoopTuplePreTransformer(self.wrapper_root).transform() - self.name_visitor = NameVisitor(self.root) self.visit(self.root) - def visit(self, node): + def visit_While(self, node): self.generic_visit(node) - # All parent nodes that may contain gast.While/gast.For - if hasattr(node, 'body'): - self.replace_stmt_list(node.body) - if hasattr(node, 'orelse'): - self.replace_stmt_list(node.orelse) - return node + new_stmts = self.get_while_stmt_nodes(node) + return new_stmts + + def visit_For(self, node): + self.generic_visit(node) + new_stmts = self.get_for_stmt_nodes(node) + return new_stmts def replace_stmt_list(self, body_list): if not isinstance(body_list, list): @@ -645,21 +536,20 @@ def get_for_stmt_nodes(self, node): if stmts_tuple is None: return [node] init_stmts, cond_stmt, body_stmts = stmts_tuple - # 2. get original loop vars - loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( - node) + loop_var_names, create_var_names = node.pd_scope.modified_vars( + ), node.pd_scope.created_vars() + # TODO: Remove the bunch of code? We have the unique format `for A in B:` # NOTE: in 'for x in var' or 'for i, x in enumerate(var)' cases, # we need append new loop var & remove useless loop var # 1. for x in var -> x is no need # 2. for i, x in enumerate(var) -> x is no need - if current_for_node_parser.is_for_iter( - ) or current_for_node_parser.is_for_enumerate_iter(): + if current_for_node_parser.is_for_iter(): iter_var_name = current_for_node_parser.iter_var_name iter_idx_name = current_for_node_parser.iter_idx_name loop_var_names.add(iter_idx_name) - if iter_var_name not in create_var_names: - loop_var_names.remove(iter_var_name) + if current_for_node_parser.enum_idx_name is not None: + loop_var_names.add(current_for_node_parser.enum_idx_name) # 3. prepare result statement list new_stmts = [] @@ -669,10 +559,17 @@ def get_for_stmt_nodes(self, node): # y += x # print(x) # x = 10 # - # We need to create static variable for those variables - for name in create_var_names: - if "." 
not in name: - new_stmts.append(create_fill_constant_node(name)) + # We don't need to create static variable for them, because + # we do this in CreateUndefinedVarTransformer + + # create non-local statement for body and cond. + nonlocal_names = list(loop_var_names | create_var_names) + nonlocal_names.sort() + # TODO(dev): Need a better way to deal this. + if ARGS_NAME in nonlocal_names: + nonlocal_names.remove(ARGS_NAME) + + nonlocal_stmt_node = create_nonlocal_stmt_nodes(nonlocal_names) # 4. append init statements new_stmts.extend(init_stmts) @@ -680,72 +577,63 @@ def get_for_stmt_nodes(self, node): # 5. create & append condition function node condition_func_node = gast.FunctionDef( name=unique_name.generate(FOR_CONDITION_PREFIX), - args=gast.arguments(args=[ - gast.Name(id=name, - ctx=gast.Param(), - annotation=None, - type_comment=None) for name in loop_var_names - ], + args=gast.arguments(args=[], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), - body=[gast.Return(value=cond_stmt)], + body=nonlocal_stmt_node + [gast.Return(value=cond_stmt)], decorator_list=[], returns=None, type_comment=None) - for name in loop_var_names: - if "." in name: - rename_transformer = RenameTransformer(condition_func_node) - rename_transformer.rename( - name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(condition_func_node) # 6. create & append loop body function node # append return values for loop body - body_stmts.append( - gast.Return(value=generate_name_node( - loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(FOR_BODY_PREFIX), - args=gast.arguments(args=[ - gast.Name(id=name, - ctx=gast.Param(), - annotation=None, - type_comment=None) for name in loop_var_names - ], + args=gast.arguments(args=[], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), - body=body_stmts, + body=nonlocal_stmt_node + body_stmts, decorator_list=[], returns=None, type_comment=None) - for name in loop_var_names: - if "." in name: - rename_transformer = RenameTransformer(body_func_node) - rename_transformer.rename( - name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(body_func_node) + get_args_node = create_get_args_node(nonlocal_names) + set_args_node = create_set_args_node(nonlocal_names) # 7. create & append while loop node while_loop_nodes = create_while_nodes(condition_func_node.name, body_func_node.name, - loop_var_names) + nonlocal_names, + get_args_node.name, + set_args_node.name) + new_stmts.extend([get_args_node, set_args_node]) new_stmts.extend(while_loop_nodes) return new_stmts def get_while_stmt_nodes(self, node): - loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( - node) + loop_var_names, create_var_names = node.pd_scope.modified_vars( + ), node.pd_scope.created_vars() new_stmts = [] + # create non-local statement for body and cond. + nonlocal_names = list(loop_var_names | create_var_names) + nonlocal_names.sort() + # TODO(dev): Need a better way to deal this. + if ARGS_NAME in nonlocal_names: + nonlocal_names.remove(ARGS_NAME) + + nonlocal_stmt_node = create_nonlocal_stmt_nodes(nonlocal_names) + # Python can create variable in loop and use it out of loop, E.g. # # while x < 10: @@ -753,68 +641,48 @@ def get_while_stmt_nodes(self, node): # y = x # z = y # - # We need to create static variable for those variables - for name in create_var_names: - if "." 
not in name: - new_stmts.append(create_fill_constant_node(name)) + # We don't need to create static variable for those variables, because + # we do this in CreateUndefinedVarTransformer condition_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_CONDITION_PREFIX), - args=gast.arguments(args=[ - gast.Name(id=name, - ctx=gast.Param(), - annotation=None, - type_comment=None) for name in loop_var_names - ], + args=gast.arguments(args=[], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), - body=[gast.Return(value=node.test)], + body=nonlocal_stmt_node + [gast.Return(value=node.test)], decorator_list=[], returns=None, type_comment=None) - for name in loop_var_names: - if "." in name: - rename_transformer = RenameTransformer(condition_func_node) - rename_transformer.rename( - name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(condition_func_node) new_body = node.body - new_body.append( - gast.Return(value=generate_name_node( - loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_BODY_PREFIX), - args=gast.arguments(args=[ - gast.Name(id=name, - ctx=gast.Param(), - annotation=None, - type_comment=None) for name in loop_var_names - ], + args=gast.arguments(args=[], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), - body=new_body, + body=nonlocal_stmt_node + new_body, decorator_list=[], returns=None, type_comment=None) - for name in loop_var_names: - if "." in name: - rename_transformer = RenameTransformer(body_func_node) - rename_transformer.rename( - name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(body_func_node) + get_args_node = create_get_args_node(nonlocal_names) + set_args_node = create_set_args_node(nonlocal_names) while_loop_nodes = create_while_nodes(condition_func_node.name, body_func_node.name, - loop_var_names) + nonlocal_names, + get_args_node.name, + set_args_node.name) + new_stmts.extend([get_args_node, set_args_node]) new_stmts.extend(while_loop_nodes) return new_stmts diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index de12677768332..93f089cf8dd9d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -20,16 +20,13 @@ from paddle.utils import gast from paddle.fluid import core from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap +from paddle.fluid.dygraph.dygraph_to_static.utils import ORIGI_INFO from paddle.fluid.framework import Program try: from collections.abc import Sequence except: from collections import Sequence -# NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node. -ORIGI_INFO = "Original information of source code for ast node." -ORIGI_INFO_MAP = "Original information map of source code." 
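# A minimal illustration of the NOTE above: ORIGI_INFO is a whole sentence, not a
# valid identifier, so the original-source information can only be attached and
# read with setattr/getattr, never with dot access. `FakeNode` and the stored
# string are stand-ins for a gast node and its real origin-info payload.

ORIGI_INFO = "Original information of source code for ast node."

class FakeNode:
    pass

node = FakeNode()
setattr(node, ORIGI_INFO, "stand-in origin info, e.g. foo.py:10")
assert getattr(node, ORIGI_INFO) == "stand-in origin info, e.g. foo.py:10"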
- class Location(object): """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 318585972f0e6..4faa4a098e016 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -441,11 +441,18 @@ def _prepare(self, inputs): continue input_vars.append(var) + # mapping from name(string) -> VarBase + out_varbase_map = {} + def create_out(var_id): var = self._outputs[var_id] assert isinstance(var, framework.Variable) var_desc = var.desc varbase = None + + if var_desc.name() in out_varbase_map: + return out_varbase_map[var_desc.name()] + if not framework._in_eager_mode_: var_base = core.VarBase(var_desc.dtype(), var_desc.shape(), var_desc.name(), var_desc.type(), False) @@ -453,6 +460,7 @@ def create_out(var_id): var_base = core.eager.Tensor(var_desc.dtype(), var_desc.shape(), var_desc.name(), var_desc.type(), False) + out_varbase_map[var_desc.name()] = var_base return var_base # Create VarBase to receive output data. diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index d7a889ad2fc9c..8615b3596e081 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -17,9 +17,10 @@ from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class PrintTransformer(gast.NodeTransformer): +class PrintTransformer(BaseTransformer): """ This class transforms python print function to fluid.layers.Print. """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 49a218412c92d..43ce1fae16fc2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -252,9 +252,11 @@ def __init__(self, function, input_spec=None, **kwargs): **kwargs(dict): other arguments like `build_strategy` et.al. """ # save the instance `self` while decorating a method of class. + if inspect.ismethod(function): self._dygraph_function = getattr(function, '__func__') self._class_instance = getattr(function, '__self__') + self._class_instance._original_funcs[ function.__name__] = self._dygraph_function else: @@ -272,6 +274,13 @@ def __init__(self, function, input_spec=None, **kwargs): self._cuda_graph_capture_mode = "" self._cuda_graph_pool_id = 0 + self._property = kwargs.get("property", False) + + @property + def is_property(self): + # whether is class proproty to be exported. + return self._property + def train(self): if isinstance(self._class_instance, layers.Layer) and self._class_instance.training == False: @@ -325,7 +334,8 @@ def forward(self, x, y): return self._descriptor_cache[instance] def _clone(self): - return self.__class__(self._dygraph_function, self._input_spec) + return self.__class__(self._dygraph_function, self._input_spec, + **self._kwargs) def __call__(self, *args, **kwargs): """ @@ -338,6 +348,8 @@ def __call__(self, *args, **kwargs): Return: Outputs of decorated function. """ + if self._property: + return self._call_dygraph_function(*args, **kwargs) # 1. 
call dygraph function directly if not enable `declarative` if not self._program_trans.enable_to_static: @@ -417,6 +429,15 @@ def _call_dygraph_function(self, *args, **kwargs): return dygraph_function(*args, **kwargs) + def _raise_when_property(self): + """raise RuntimeError when property=True + + Raises: + RuntimeError: can not call this func when property=True + """ + if self.is_property: + raise RuntimeError("Can not call the func when property=True.") + def get_concrete_program(self, *args, **kwargs): """ Returns traced concrete program and inner executable partial layer. @@ -428,6 +449,7 @@ def get_concrete_program(self, *args, **kwargs): Returns: Traced ConcreteProgram and executable translated Layer. """ + self._raise_when_property() with_hook = kwargs.get("with_hook", False) is_train = kwargs.get("is_train", True) @@ -518,6 +540,7 @@ def concrete_program_specify_input_spec(self, input_spec (list[InputSpec], optional): Describes the input of the translate function. """ + self._raise_when_property() # if specific the `input_spec`, the length of program_cache will always 1, # else, return the last one. cached_program_len = len(self._program_cache) @@ -670,6 +693,7 @@ def inputs(self): """ Returns input tensors of recent converted static program. """ + self._raise_when_property() concrete_program = self.concrete_program inputs = [ var for var in flatten(concrete_program.inputs) @@ -682,6 +706,7 @@ def outputs(self): """ Returns output tensors of recent converted static program. """ + self._raise_when_property() concrete_program = self.concrete_program outputs = [ var for var in flatten(concrete_program.outputs) @@ -695,6 +720,7 @@ def main_program(self): """ Returns recent converted static main program. """ + self._raise_when_property() concrete_program = self.concrete_program main_program = concrete_program.main_program return main_program diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 7e387b45c4020..3eadd455e1033 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -21,6 +21,7 @@ from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import ForToWhileTransformer from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer __all__ = [ 'RETURN_NO_VALUE_MAGIC_NUM', 'RETURN_NO_VALUE_VAR_NAME', 'ReturnTransformer' @@ -42,7 +43,9 @@ # solve it in dy2stat, we put float64 value with this magic number at Static # graph as a place holder to indicate the returning placeholder means no value # should return. -RETURN_NO_VALUE_MAGIC_NUM = 1.77113e+279 + +# Assign not support float64, use float32 value as magic number. +RETURN_NO_VALUE_MAGIC_NUM = 1.77113e+27 RETURN_NO_VALUE_VAR_NAME = "__no_value_return_var" @@ -57,7 +60,7 @@ def get_return_size(return_node): return return_length -class ReplaceReturnNoneTransformer(gast.NodeTransformer): +class ReplaceReturnNoneTransformer(BaseTransformer): """ Replace 'return None' to 'return' because 'None' cannot be a valid input in control flow. 
In ReturnTransformer single 'Return' will be appended no @@ -133,7 +136,7 @@ def get_func_max_return_length(self, func_node): return self.max_return_length[func_node] -class ReturnTransformer(gast.NodeTransformer): +class ReturnTransformer(BaseTransformer): """ Transforms return statements into equivalent python statements containing only one return statement at last. The basics idea is using a return value @@ -185,9 +188,7 @@ def visit(self, node): Self-defined visit for appending ancestor """ self.ancestor_nodes.append(node) - method = 'visit_' + node.__class__.__name__ - visitor = getattr(self, method, self.generic_visit) - ret = visitor(node) + ret = super(ReturnTransformer, self).visit(node) self.ancestor_nodes.pop() return ret @@ -215,44 +216,17 @@ def visit_FunctionDef(self, node): ctx=gast.Load(), annotation=None, type_comment=None))) - init_names = [ - unique_name.generate(RETURN_VALUE_INIT_NAME) - for i in range(max_return_length) - ] - assign_zero_nodes = [ - create_fill_constant_node(iname, 0.0) for iname in init_names - ] - if len(init_names) == 1: - return_value_nodes = gast.Name(id=init_names[0], - ctx=gast.Load(), - annotation=None, - type_comment=None) - else: - # We need to initialize return value as a tuple because control - # flow requires some inputs or outputs have same structure - return_value_nodes = gast.Tuple(elts=[ - gast.Name(id=iname, - ctx=gast.Load(), - annotation=None, - type_comment=None) for iname in init_names - ], - ctx=gast.Load()) assign_return_value_node = gast.Assign(targets=[ gast.Name(id=value_name, ctx=gast.Store(), annotation=None, type_comment=None) ], - value=return_value_nodes) + value=gast.Constant( + kind=None, value=None)) node.body.insert(0, assign_return_value_node) - node.body[:0] = assign_zero_nodes # Prepend no value placeholders - for name in self.return_no_value_name[node]: - assign_no_value_node = create_fill_constant_node( - name, RETURN_NO_VALUE_MAGIC_NUM) - node.body.insert(0, assign_no_value_node) - self.function_def.pop() return node @@ -339,74 +313,21 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name, cur_func_node = self.function_def[-1] return_length = get_return_size(return_node) - if return_length < max_return_length: - # In this case we should append RETURN_NO_VALUE placeholder - # - # max_return_length must be >= 1 here because return_length will be - # 0 at least. 
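# A quick numeric check of the comment next to RETURN_NO_VALUE_MAGIC_NUM above:
# the old sentinel 1.77113e+279 is far beyond float32 range (max ~3.4e38) and
# collapses to inf, while the new 1.77113e+27 is still representable, so an
# assign performed in float32 keeps the "no value returned" placeholder usable.
import numpy as np

assert np.isinf(np.float32(1.77113e279))    # old magic number overflows float32
assert np.isfinite(np.float32(1.77113e27))  # new magic number fits in float32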
+ # In this case we should NOT append RETURN_NO_VALUE placeholder + if return_node.value is not None: + cur_func_node = self.function_def[-1] if self.return_value_name[cur_func_node] is None: self.return_value_name[cur_func_node] = unique_name.generate( RETURN_VALUE_PREFIX) - no_value_names = [ - unique_name.generate(RETURN_NO_VALUE_VAR_NAME) - for j in range(max_return_length - return_length) - ] - self.return_no_value_name[cur_func_node].extend(no_value_names) - - # Handle tuple/non-tuple case - if max_return_length == 1: - assign_nodes.append( - gast.Assign(targets=[ - gast.Name(id=self.return_value_name[cur_func_node], - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=gast.Name(id=no_value_names[0], - ctx=gast.Load(), - annotation=None, - type_comment=None))) - else: - # max_return_length > 1 which means we should assign tuple - fill_tuple = [ - gast.Name(id=n, - ctx=gast.Load(), + assign_nodes.append( + gast.Assign(targets=[ + gast.Name(id=self.return_value_name[cur_func_node], + ctx=gast.Store(), annotation=None, - type_comment=None) for n in no_value_names - ] - if return_node.value is not None: - if isinstance(return_node.value, gast.Tuple): - fill_tuple[:0] = return_node.value.elts - else: - fill_tuple.insert(0, return_node.value) - - assign_nodes.append( - gast.Assign(targets=[ - gast.Name(id=self.return_value_name[cur_func_node], - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=gast.Tuple(elts=fill_tuple, - ctx=gast.Load()))) - else: - # In this case we should NOT append RETURN_NO_VALUE placeholder - if return_node.value is not None: - cur_func_node = self.function_def[-1] - if self.return_value_name[cur_func_node] is None: - self.return_value_name[ - cur_func_node] = unique_name.generate( - RETURN_VALUE_PREFIX) - - assign_nodes.append( - gast.Assign(targets=[ - gast.Name(id=self.return_value_name[cur_func_node], - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=return_node.value)) + type_comment=None) + ], + value=return_node.value)) stmt_list[i:] = assign_nodes return True diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index b7a2087d1f24d..88ece85cd139e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -18,9 +18,10 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class TensorShapeTransformer(gast.NodeTransformer): +class TensorShapeTransformer(BaseTransformer): """ This class transforms variable.shape into Static Graph Ast. All 'xxx.shape' will be converted int '_jst.Shape(x)'. 
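# A self-contained sketch of the rewrite described in the docstring above
# ("xxx.shape" becomes "_jst.Shape(xxx)"). It uses the stdlib ast module
# (Python 3.9+ for ast.unparse) instead of paddle.utils.gast purely so it runs
# on its own; like the patched visit_Attribute it calls generic_visit first, so
# nested attributes such as `self.weight.shape` are rewritten as well.
import ast

class ShapeToJstShape(ast.NodeTransformer):

    def visit_Attribute(self, node):
        self.generic_visit(node)  # rewrite inner expressions first
        if node.attr == 'shape':
            value_src = ast.unparse(node.value)
            return ast.parse("_jst.Shape({})".format(value_src), mode="eval").body
        return node

tree = ast.parse("n = x.shape[0] + self.weight.shape[1]")
tree = ast.fix_missing_locations(ShapeToJstShape().visit(tree))
print(ast.unparse(tree))  # n = _jst.Shape(x)[0] + _jst.Shape(self.weight)[1]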
@@ -37,6 +38,7 @@ def transform(self): self.visit(self.root) def visit_Attribute(self, node): + self.generic_visit(node) if node.attr == 'shape': args = ast_to_source_code(node.value).strip() # NOTE(dev): we can deal with paddle.shape in this case, but it's diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 8dd11c06e463f..ed7faf83cefe5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -30,12 +30,20 @@ import paddle from paddle.fluid import unique_name from paddle.fluid.data_feeder import convert_dtype +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layers import assign # Note(Aurelius): Do not forget the dot `.` to distinguish other # module such as paddlenlp. PADDLE_MODULE_PREFIX = 'paddle.' DYGRAPH_MODULE_PREFIX = 'paddle.fluid.dygraph' DYGRAPH_TO_STATIC_MODULE_PREFIX = 'paddle.fluid.dygraph.dygraph_to_static' +GET_ARGS_FUNC_PREFIX = 'get_args' +SET_ARGS_FUNC_PREFIX = 'set_args' +ARGS_NAME = '__args' +# NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node. +ORIGI_INFO = "Original information of source code for ast node." class BaseNodeVisitor(gast.NodeVisitor): @@ -74,6 +82,8 @@ def visit(self, node): FOR_ITER_INDEX_PREFIX = '__for_loop_var_index' FOR_ITER_TUPLE_PREFIX = '__for_loop_iter_tuple' +FOR_ITER_TARGET_PREFIX = '__for_loop_iter_target' +FOR_ITER_ITERATOR_PREFIX = '__for_loop_iter_iterator' FOR_ITER_TUPLE_INDEX_PREFIX = '__for_loop_iter_tuple_index' FOR_ITER_VAR_LEN_PREFIX = '__for_loop_var_len' FOR_ITER_VAR_NAME_PREFIX = '__for_loop_iter_var' @@ -87,6 +97,69 @@ def visit(self, node): ]) +def data_layer_not_check(name, shape, dtype='float32', lod_level=0): + """ + This function creates a Tensor on the global block. The created Tensor + doesn't check the dtype and the shape of feed data because dygraph input + data can be various-length. This API is used in translating dygraph into + static graph. + + Note: + The default :code:`stop_gradient` attribute of the Tensor created by + this API is true, which means the gradient won't be passed backward + through the data Tensor. Set :code:`var.stop_gradient = False` If + user would like to pass backward gradient. + + Args: + name (str): The name/alias of the Tensor, see :ref:`api_guide_Name` + for more details. + shape (list|tuple): List|Tuple of integers declaring the shape. You can + set "None" at a dimension to indicate the dimension can be of any + size. For example, it is useful to set changeable batch size as "None" + dtype (np.dtype|VarType|str, optional): The type of the data. Supported + dtype: bool, float16, float32, float64, int8, int16, int32, int64, + uint8. Default: float32 + lod_level (int, optional): The LoD level of the LoDTensor. Usually users + don't have to set this value. For more details about when and how to + use LoD level, see :ref:`user_guide_lod_tensor` . Default: 0 + + Returns: + Tensor: The global Tensor that gives access to the data. 
+ """ + helper = LayerHelper('data', **locals()) + shape = list(shape) + for i in six.moves.range(len(shape)): + if shape[i] is None: + shape[i] = -1 + + return helper.create_global_variable(name=name, + shape=shape, + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True, + lod_level=lod_level, + is_data=True, + need_check_feed=False) + + +def create_undefined_var_like(variable): + """ create a undefined var with the same shape and dtype like varaible. + """ + from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM + var = data_layer_not_check(unique_name.generate("undefined_var"), + variable.shape, variable.dtype) + assign(RETURN_NO_VALUE_MAGIC_NUM, var) + return var + + +def create_undefined_variable(): + from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM + var = data_layer_not_check(unique_name.generate("undefined_var"), [1], + "float64") + assign(RETURN_NO_VALUE_MAGIC_NUM, var) + return var + + class UndefinedVar: def __init__(self, name): @@ -97,6 +170,12 @@ def check(self): "local variable '{}' should be created before using it.") +class Dygraph2StaticException(Exception): + + def __init__(self, message): + super().__init__(message) + + def saw(x): if isinstance(x, UndefinedVar): return x.check() @@ -412,10 +491,16 @@ def generate_name_node(name_ids, ctx=gast.Load(), gen_tuple_if_single=False): raise TypeError( 'name_ids must be list or tuple or set, but received %s' % type(type(name_ids))) - gast_names = [ - gast.Name(id=name_id, ctx=ctx, annotation=None, type_comment=None) - for name_id in name_ids - ] + + def create_node_for_name(name): + if '.' not in name: + return gast.Name(id=name, + ctx=ctx, + annotation=None, + type_comment=None) + return gast.parse(name).body[0].value + + gast_names = [create_node_for_name(name_id) for name_id in name_ids] if len(gast_names) == 1 and not gen_tuple_if_single: name_node = gast_names[0] else: @@ -460,35 +545,6 @@ def create_assign_node(name, node): return targets, assign_node -class RenameTransformer(gast.NodeTransformer): - - def __init__(self, node): - assert isinstance( - node, gast.AST), "RenameTransformer only accepts gast.AST as input" - self.root = node - self.old_name = "" - self.new_name = "" - - def rename(self, old_name, new_name): - self.old_name = old_name - self.new_name = new_name - self.visit(self.root) - - def visit_Name(self, node): - self.generic_visit(node) - if node.id == self.old_name: - node.id = self.new_name - return node - - def visit_Attribute(self, node): - self.generic_visit(node) - attr_full_name = get_attribute_full_name(node) - if attr_full_name == self.old_name: - new_name_node = gast.parse(self.new_name).body[0].value - return new_name_node - return node - - def ast_to_func(ast_root, dyfunc, delete_on_exit=True): """ Transform modified AST of decorated function into python callable object. @@ -588,7 +644,12 @@ def ast_to_source_code(ast_node): type(ast_node)) if isinstance(ast_node, gast.AST): ast_node = gast.gast_to_ast(ast_node) - source_code = astor.to_source(ast_node) + + # Do not wrap lines even if they are too long + def pretty_source(source): + return ''.join(source) + + source_code = astor.to_source(ast_node, pretty_source=pretty_source) return source_code @@ -811,603 +872,6 @@ def get_compare_nodes_with_tensor(self): return self._compare_node_tenor_set -class NameNodeReplaceTransformer(gast.NodeTransformer): - """ - This class replaces specified gast.Name node by replace_node. 
- """ - - def __init__(self, root_node, target_name, replace_node): - assert isinstance(target_name, str) - - # NOTE(liym27): - # Use gast.Name to replace gast.Name, otherwise, errors may occur. - # - # For examples: - # If using a gast.Subscript to replace gast.Name, and the original gast.Name - # is in the arguments of FunctionDef, an exception will be raised. - # - # ``` - # def func(x[i])) # x[i] can not be a argument - # # ... - # ``` - - assert isinstance(replace_node, gast.Name) - self.target_name = target_name - self.replace_node = replace_node - - self.visit(root_node) - - def visit_Name(self, node): - if node.id == self.target_name: - return self.replace_node - return node - - -class ForLoopTuplePreTransformer(gast.NodeTransformer): - """ - ForNodeVisitor parses 3 type statements (Here var is VarBase(Tensor) or python variable): - 1). for x in range(var[*]|var.numpy()[*]) - 2). for x in var|var.numpy() - 3). for i, x in enumerate(var|var.numpy()) - - We chose these 3 types because they are easier (x can be variable name iterating in var). - However, users can write tuples in Python for loop, such as - 1). for var1, var2 in var|var.numpy() - 2). for t in enumerate(var|var.numpy()) - 2). for i, (var1, var2, va3) in enumerate(var|var.numpy()) - - To handle these case, this method will do the rewrite tuple pre-process: - 1). Non-enumerate case: for var1, var2 in var|var.numpy() will be re-written as: - for FOR_ITER_TUPLE_PREFIX_x in var | var.numpy(): - var1 = FOR_ITER_TUPLE_PREFIX_x[0] - var2 = FOR_ITER_TUPLE_PREFIX_x[1] - 2). Enumerate out tuple case: for t in enumerate(var|var.numpy) will be rewritten as: - for FOR_ITER_TUPLE_INDEX_PREFIX_x, FOR_ITER_TUPLE_PREFIX_x in enumerate(var|var.numpy): - t = (FOR_ITER_TUPLE_INDEX_PREFIX_x, FOR_ITER_TUPLE_PREFIX_x) - 3). 
Enumerate inner tuple case: for i, (var1, (var2, va3)) in enumerate(var|var.numpy()) will - be re-written as: - for i, FOR_ITER_TUPLE_PREFIX_x in var | var.numpy(): - var1 = FOR_ITER_TUPLE_PREFIX_x[0] - var2 = FOR_ITER_TUPLE_PREFIX_x[1][0] - var3 = FOR_ITER_TUPLE_PREFIX_x[1][1] - """ - - def __init__(self, wrapper_root): - self.wrapper_root = wrapper_root - self.root = wrapper_root.node - - def transform(self): - self.visit(self.root) - - def visit_For(self, node): - if self.is_for_enumerate_iter(node): - if isinstance(node.target, (gast.Name, gast.Attribute)): - # Out tuple case - out_tuple_name = ast_to_source_code(node.target).strip() - tuple_iter_name = unique_name.generate( - FOR_ITER_TUPLE_INDEX_PREFIX) - tuple_var_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - node.target = gast.Tuple(elts=[ - gast.Name(id=tuple_iter_name, - ctx=gast.Store(), - annotation=None, - type_comment=None), - gast.Name(id=tuple_var_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - ctx=gast.Store()) - node.body.insert( - 0, - gast.Assign(targets=[ - gast.Name(id=out_tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=gast.Tuple(elts=[ - gast.Name(id=tuple_iter_name, - ctx=gast.Load(), - annotation=None, - type_comment=None), - gast.Name(id=tuple_var_name, - ctx=gast.Load(), - annotation=None, - type_comment=None) - ], - ctx=gast.Load()))) - elif isinstance(node.target, (gast.List, gast.Tuple)) and len( - node.target.elts) >= 2 and isinstance( - node.target.elts[1], (gast.List, gast.Tuple)): - # Inner tuple case - inner_tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - origin_inner_tuple_node = node.target.elts[1] - node.target.elts[1] = gast.Name(id=inner_tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - node.body[0:0] = self.tuple_to_stmts(origin_inner_tuple_node, - inner_tuple_name) - elif self.is_for_iter(node) and isinstance(node.target, - (gast.List, gast.Tuple)): - # Non-enumrate case: - tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - origin_tuple_node = node.target - node.target = gast.Name(id=tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - node.body[0:0] = self.tuple_to_stmts(origin_tuple_node, tuple_name) - return node - - def tuple_to_stmts(self, node, tuple_name, idx=[]): - if not isinstance(node, (gast.Tuple, gast.List)): - value_node_str = tuple_name - for i in idx: - value_node_str = value_node_str + "[{}]".format(i) - - node_str = ast_to_source_code(node).strip() - assign_node_str = "{} = {}".format(node_str, value_node_str) - assign_node = gast.parse(assign_node_str).body[0] - return [assign_node] - - # isinstance(node, (gast.Tuple, gast.List)) - ret = [] - for i, element in enumerate(node.elts): - ret += self.tuple_to_stmts(node.elts[i], tuple_name, idx + [i]) - return ret - - def is_for_iter(self, for_node): - assert isinstance(for_node, - gast.For), "Input node is not gast.For node." - if isinstance(for_node.iter, (gast.Name, gast.Attribute)): - return True - elif isinstance(for_node.iter, gast.Call) and isinstance( - for_node.iter.func, - gast.Attribute) and for_node.iter.func.attr == 'numpy': - return True - elif isinstance(for_node.iter, gast.Subscript): - return True - else: - return False - - def is_for_enumerate_iter(self, for_node): - assert isinstance(for_node, - gast.For), "Input node is not gast.For node." 
- return isinstance(for_node.iter, gast.Call) and isinstance( - for_node.iter.func, - gast.Name) and for_node.iter.func.id == "enumerate" - - -class ForNodeVisitor(object): - """ - This class parses python for statement, get transformed 3 statement components of for node - three key statements: - 1). init_stmts: list[node], prepare nodes of for loop, may not only one - 2). cond_stmt: node, condition node to judge whether continue loop - 3). body_stmts: list[node], updated loop body, sometimes we should change - the original statement in body, not just append new statement - - In this process, the semantics of for does not change. - - Now only can parse 3 type statements (Here var is VarBase(Tensor) or python variable): - 1). for x in range(var[*]|var.numpy()[*]) - 2). for x in var|var.numpy() - 3). for i, x enumerate(var|var.numpy()) - """ - - def __init__(self, for_node): - assert isinstance( - for_node, gast.For - ), "Input node for the initialization of ForNodeVisitor is not gast.For node." - # 1. original for node - self.node = for_node - - # 2. gast.For node main parts - self.target = for_node.target - # NOTE: type may be Node or list[Node] - self.iter_args = for_node.iter if self.is_for_iter( - ) else for_node.iter.args - self.body = for_node.body - - # 3. key shared node or names - # - x: - # - for x in range(***) - # - for x in var|var.numpy() - # - for i, x enumerate(var|var.numpy()) - self.iter_var_name = self._get_iter_var_name() - - # - created index var to slice Variable: __for_loop_var_index_0 - # - for x in var|var.numpy() - # - for i, x enumerate(var|var.numpy()) - self.iter_idx_name = unique_name.generate(FOR_ITER_INDEX_PREFIX) - - # - created shape var to build loop condition: __for_loop_var_len_0 - # - for x in var|var.numpy() - # - for i, x enumerate(var|var.numpy()) - # - for x in var - self.iter_var_len_name = unique_name.generate(FOR_ITER_VAR_LEN_PREFIX) - # - created zip to list var : __for_loop_iter_zip_0 - self.iter_zip_to_list_name = unique_name.generate( - FOR_ITER_ZIP_TO_LIST_PREFIX) - - # - var.numpy()/var - # - for x in var|var.numpy() - # - for i, x enumerate(var|var.numpy()) - self.iter_node = self._get_iter_node() - - # - enumeate i: - # - for i, x enumerate(var|var.numpy()) - self.enum_idx_name = self._get_enum_idx_name() - - # - range/enumerate args length - self.args_length = None - - def parse(self): - self._args_check() - if self.is_for_range_iter(): - return self._parse_for_range_stmts() - elif self.is_for_iter(): - return self._parse_for_stmts() - elif self.is_for_enumerate_iter(): - return self._parse_for_enumerate_stmts() - else: - return None - - def is_for_range_iter(self): - return isinstance(self.node.iter, gast.Call) and isinstance( - self.node.iter.func, - gast.Name) and self.node.iter.func.id == "range" - - def is_for_iter(self): - if isinstance(self.node.iter, - (gast.Name, gast.Attribute, gast.List, gast.Tuple)): - return True - elif isinstance(self.node.iter, gast.Call) and isinstance( - self.node.iter.func, - gast.Attribute) and self.node.iter.func.attr == 'numpy': - return True - elif isinstance(self.node.iter, gast.Subscript): - return True - else: - return False - - def is_for_enumerate_iter(self): - return isinstance(self.node.iter, gast.Call) and isinstance( - self.node.iter.func, - gast.Name) and self.node.iter.func.id == "enumerate" - - def _args_check(self): - if self.is_for_range_iter(): - self.args_length = len(self.iter_args) - assert self.args_length >= 1 and self.args_length <= 3, "range() function takes 1 to 3 arguments" - 
elif self.is_for_enumerate_iter(): - self.args_length = len(self.iter_args) - assert self.args_length >= 1 and self.args_length <= 2, "enumerate() function takes 1 to 2 arguments" - else: - self.args_length = None - - def _parse_for_range_stmts(self): - init_stmts = [] - init_stmts.append(self._build_index_init_node()) - - compare_node = self._build_compare_node() - step_node = self._build_step_node() - cond_stmt = self._build_cond_stmt(step_node, compare_node) - - body_stmts = self.body - body_stmts.append(self._build_index_increase_node(step_node)) - - return init_stmts, cond_stmt, body_stmts - - def _parse_for_stmts(self): - init_stmts = [] - init_stmts.extend(self._build_iter_node()) - init_stmts.append(self._build_index_init_node()) - init_stmts.append(self._build_var_len_assign_node()) - - compare_node = self._build_compare_node() - step_node = self._build_step_node() - cond_stmt = self._build_cond_stmt(step_node, compare_node) - - body_stmts = self.body - - # NOTE(liym27): Here add a gast.Assign, and the target of it is gast.Name. - # In NameNodeReplaceTransformer, using gast.Name to replace gast.Name is safe. - target_node, assign_node = self._build_assign_var_slice_node() - body_stmts[0:0] = [assign_node] - for body_node in body_stmts: - NameNodeReplaceTransformer(body_node, self.iter_var_name, - target_node) - body_stmts.append(self._build_index_increase_node(step_node)) - - return init_stmts, cond_stmt, body_stmts - - def _parse_for_enumerate_stmts(self): - init_stmts = [] - init_stmts.extend(self._build_iter_node()) - init_stmts.append(self._build_index_init_node()) - init_stmts.append(self._build_var_len_assign_node()) - init_stmts.append(self._build_enum_init_node()) - - compare_node = self._build_compare_node() - step_node = self._build_step_node() - cond_stmt = self._build_cond_stmt(step_node, compare_node) - - body_stmts = self.body - - target_node, assign_node = self._build_assign_var_slice_node() - body_stmts[0:0] = [assign_node] - for body_node in body_stmts: - NameNodeReplaceTransformer(body_node, self.iter_var_name, - target_node) - - body_stmts.append(self._build_index_increase_node(step_node)) - body_stmts.append(self._build_enum_increase_node()) - - return init_stmts, cond_stmt, body_stmts - - def _build_index_init_node(self): - if self.is_for_range_iter(): - if self.args_length == 1: - index_init_value_str = '0' - else: - index_init_value_str = ast_to_source_code( - self.iter_args[0]).strip() - - index_init_var_name = self.iter_var_name - else: - index_init_value_str = '0' - index_init_var_name = self.iter_idx_name - - index_init_node_source_str = "{target} = {value}".format( - target=index_init_var_name, value=index_init_value_str) - - index_init_node = gast.parse(index_init_node_source_str).body[0] - - return index_init_node - - def _build_var_len_assign_node(self): - # get the length of iterable variable - if isinstance(self.iter_node, gast.Call) and isinstance( - self.iter_node.func, - gast.Attribute) and self.iter_node.func.attr == 'numpy': - iter_var_name = ast_to_source_code( - self.iter_node.func.value).strip() - else: - iter_var_name = ast_to_source_code(self.iter_node).strip() - - convert_len_node_source_str = '{} = _jst.Len({})'.format( - self.iter_var_len_name, iter_var_name) - - convert_len_node = gast.parse(convert_len_node_source_str).body[0] - - return convert_len_node - - def _build_iter_node(self): - """ - Process special cases for iter_node inclue: - - Case 1 (for zip): - - - for i, val in enumerate(zip(x, y)) # original code: - - - 
__for_loop_iter_zip_0 = list(zip(x, y)) - - for i, val in enumerate(__for_loop_iter_zip_0) - """ - new_nodes = [] - if isinstance(self.iter_node, gast.Call) and isinstance( - self.iter_node.func, gast.Name): - if self.iter_node.func.id == 'zip': - iter_var_name = ast_to_source_code(self.iter_node).strip() - zip_to_list_str = "{target} = list({value})".format( - target=self.iter_zip_to_list_name, value=iter_var_name) - zip_to_list_node = gast.parse(zip_to_list_str).body[0] - new_nodes.append(zip_to_list_node) - - self.iter_node = gast.Name(id=self.iter_zip_to_list_name, - ctx=gast.Load(), - annotation=None, - type_comment=None) - - return new_nodes - - def _build_enum_init_node(self): - if self.is_for_enumerate_iter() and self.args_length != 1: - init_value_str = ast_to_source_code(self.iter_args[1]).strip() - else: - init_value_str = '0' - - enum_init_node_source_str = "{} = {}".format(self.enum_idx_name, - init_value_str) - enum_init_node = gast.parse(enum_init_node_source_str).body[0] - return enum_init_node - - def _build_compare_node(self): - if self.is_for_range_iter(): - compare_node = self.iter_args[ - 0] if self.args_length == 1 else self.iter_args[1] - else: - compare_node = gast.Name(id=self.iter_var_len_name, - ctx=gast.Load(), - annotation=None, - type_comment=None) - return compare_node - - def _build_step_node(self): - if self.is_for_range_iter(): - step_node = self.iter_args[ - 2] if self.args_length == 3 else gast.Constant(value=1, - kind=None) - else: - step_node = gast.Constant(value=1, kind=None) - return step_node - - def _build_cond_stmt(self, step_node, compare_node): - if not isinstance(step_node, (gast.Constant, gast.UnaryOp)): - raise NotImplementedError( - "Dynamic-to-Static only supports the step value is a constant or negative constant in 'for-range' statements, " - "such as '2', '-3'. But received: '{}'. Please fix code to be compatible with Dynamic-to-Static." 
- .format(ast_to_source_code(step_node).strip())) - - if isinstance(step_node, gast.UnaryOp) or step_node.value < 0: - # eg: - # range(max, min, -2) - # -> - # i > min - return gast.Compare(left=gast.Name( - id=self.iter_var_name - if self.is_for_range_iter() else self.iter_idx_name, - ctx=gast.Load(), - annotation=None, - type_comment=None), - ops=[gast.Gt()], - comparators=[compare_node]) - else: - # eg: - # range(min, max, 2) - # -> - # i < max - return gast.Compare(left=gast.Name( - id=self.iter_var_name - if self.is_for_range_iter() else self.iter_idx_name, - ctx=gast.Load(), - annotation=None, - type_comment=None), - ops=[gast.Lt()], - comparators=[compare_node]) - - def _build_index_increase_node(self, step_node): - return gast.AugAssign(target=gast.Name( - id=self.iter_var_name - if self.is_for_range_iter() else self.iter_idx_name, - ctx=gast.Store(), - annotation=None, - type_comment=None), - op=gast.Add(), - value=step_node) - - def _build_assign_var_slice_node(self): - var_slice_str = "{}[{}]".format( - ast_to_source_code(self.iter_node).strip(), self.iter_idx_name) - var_slice_node = gast.parse(var_slice_str).body[0].value - new_iter_var_name = unique_name.generate(FOR_ITER_VAR_NAME_PREFIX) - target_node, assign_node = create_assign_node(new_iter_var_name, - var_slice_node) - return target_node, assign_node - - def _build_enum_increase_node(self): - return gast.AugAssign(target=gast.Name(id=self.enum_idx_name, - ctx=gast.Store(), - annotation=None, - type_comment=None), - op=gast.Add(), - value=gast.Constant(value=1, kind=None)) - - def _get_iter_var_name(self): - if self.is_for_range_iter(): - return self.target.id - elif self.is_for_iter(): - return self.target.id - elif self.is_for_enumerate_iter(): - return self.target.elts[1].id - return None - - def _get_iter_node(self): - if self.is_for_iter(): - return self.iter_args - elif self.is_for_enumerate_iter(): - return self.iter_args[0] - return None - - def _get_enum_idx_name(self): - if self.is_for_enumerate_iter(): - return self.target.elts[0].id - return None - - -class SplitAssignTransformer(gast.NodeTransformer): - """ - This class transforms sequence assignments and multi-target assignments to normal assignments. 
- """ - - def __init__(self, ast_node): - assert isinstance(ast_node, gast.AST) - self.ast_root = ast_node - - def transform(self): - self.visit(self.ast_root) - - def visit_Assign(self, node): - target_nodes = node.targets - if len(target_nodes) == 1: - node = self._parse_sequence_assign(node) - else: - node = self._parse_multi_target_assign(node) - return node - - def _parse_sequence_assign(self, node): - """ - a, b = c, d - -> - a = c - b = d - """ - assert isinstance(node, gast.Assign) - - target_nodes = node.targets - value_node = node.value - if not isinstance(target_nodes[0], (gast.List, gast.Tuple)): - return node - if not isinstance(value_node, (gast.List, gast.Tuple)): - return node - - targets = node.targets[0].elts - values = node.value.elts - if len(targets) != len(values): - return node - - new_nodes = [] - for target, value in zip(targets, values): - assign_node = gast.Assign(targets=[target], value=value) - new_nodes.append(assign_node) - - return new_nodes - - def _parse_multi_target_assign(self, node): - """ - Example 1: - a = b = c - -> - b = c - a = b - - Example 2: - a, b = c, d = x - -> - c,d = x - a = c - b = d - """ - assert isinstance(node, gast.Assign) - - target_nodes = node.targets - value_node = node.value - new_nodes = [] - for target in reversed(target_nodes): - assign_node = gast.Assign(targets=[target], value=value_node) - # NOTE: Because assign_node can be sequence assign statement like `a,b = c,d`, - # it's necessary to visit this new assign_node - parsed_node = self.visit_Assign(assign_node) - if not isinstance(parsed_node, list): - parsed_node = [parsed_node] - - new_nodes.extend(parsed_node) - value_node = target - - return new_nodes - - # NOTE: inspect.unwrap() exits in PY3 but not in PY2. def unwrap(func): """ @@ -1527,3 +991,316 @@ def slice_is_num(slice_node): return True return False + + +class NameScope: + + def __init__(self): + """ + A NameScope is a object which manager all the variable names. + only FunctionDef and Controlflow node will have a namescope property. + + type can be "function" and "controlflow" + + we don't analyze the read only variable because they don't affect the analysis. + """ + self.globals = set() + self.nonlocals = set() + self.args = set() + self.father = None # point to the nearest function name scope. + self.w_vars = set() # all qualified + normal names been stored + self.created = set( + ) # useful for control flow compatibility. may be remove later + + def set_father(self, father): + self.father = father + + def existed_vars(self): + """ vars existing in current scope. + they must not contain qualified names. + """ + local_vars = self.w_vars - self.globals - self.nonlocals - self.args + return set(filter(lambda x: '.' not in x, local_vars)) + + def created_vars(self): + return self.created + + def modified_vars(self): + # may be globals / non-locals / args / qualified names and created_vars + return self.w_vars + + def control_flow_vars(self): + valid_names = self.w_vars + tmp = self.father.global_vars & valid_names, + return {"global": tmp, "nonlocal": self.w_vars - tmp} + + def global_vars(self): + return self.globals + + def merge_from(self, name_scope): + self.globals |= name_scope.globals + self.nonlocals |= name_scope.nonlocals + self.args |= name_scope.args + self.w_vars |= name_scope.w_vars + + +class FunctionNameLivenessAnalysis(gast.NodeVisitor): + """ analyze the liveness of a function. + + every variables stored in this scope will be collected, + in addition with global/nonlocal information. + + 1. 
global variable is stored in node.var_globals. + 2. nonlocal variable is stored in node.var_nonlocals. + 3. arguments is stored in node.var_args. + + For example: + + def func(*args, **kargs): + a = 12 + global i,j + nonlocal x,y + print(a) + i = k + for m in range(10): + q = 12 + + After this visitor we have: + # node is the FunctionDef node with name: "func" + node.pd_scope = NameScope( + globals = ['i', 'j'], + nonlocals = ['x', 'y'], + args = ['args', 'kargs'], + wr_vars = ['a', 'i', 'q', 'm'] + ) + """ + + def __init__(self, root_node): + self.scope_node_stack = [] # controlflow, functiondef node + self.visit(root_node) + + def _reset_name_scope(self, node): + # always reset the node as empty namescope. + setattr(node, "pd_scope", NameScope()) + + def _get_name_scope(self, node): + if not hasattr(node, "pd_scope"): + setattr(node, "pd_scope", NameScope()) + return node.pd_scope + + def _current_name_scope(self): + return self._get_name_scope(self.scope_node_stack[-1]) + + def _father_name_scope(self): + if len(self.scope_node_stack) == 1: return None + return self._get_name_scope(self.scope_node_stack[-2]) + + def _nearest_function_scope(self): + if len(self.scope_node_stack) == 1: return None + for node in self.scope_node_stack[-2::-1]: + if isinstance(node, gast.FunctionDef): + return self._get_name_scope(node) + + def visit_ListComp(self, node): + """ [ i for i in range(10) ] + In this case, `i` will not created in FunctionScope. + We don't collect `i` by not calling generic_visit. + """ + pass + + def visit_DictComp(self, node): + """ the same as ListComp. + """ + pass + + def visit_Name(self, node): + self.generic_visit(node) + write_context = (gast.Store, gast.AugStore, gast.Del) + if isinstance(node.ctx, write_context): + self._current_name_scope().w_vars.add(node.id) + + def visit_FunctionDef(self, node): + + def pre_func(): + self._current_name_scope().args |= set( + self._get_argument_names(node)) + + def post_func(): + """ NOTE: why we need merge w_vars here ? + because we do ifelse_transformer after loop_transformer. Loops will changed into functioons. but we know this function will be called in if. so we add w_vars to father function scope. + """ + from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import WHILE_CONDITION_PREFIX, WHILE_BODY_PREFIX, FOR_CONDITION_PREFIX, FOR_BODY_PREFIX + from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import TRUE_FUNC_PREFIX, FALSE_FUNC_PREFIX + control_flow_function_def = [ + WHILE_BODY_PREFIX, WHILE_BODY_PREFIX, FOR_CONDITION_PREFIX, + FOR_BODY_PREFIX, TRUE_FUNC_PREFIX, FALSE_FUNC_PREFIX + ] + + def is_control_flow_def_node(): + for prefix in control_flow_function_def: + if node.name.startswith(prefix): return True + return False + + if self._father_name_scope() and is_control_flow_def_node(): + self._father_name_scope().w_vars |= self._current_name_scope( + ).w_vars + + self._visit_scope_node(node, pre_func, post_func) + + def _visit_scope_node(self, node, pre_func, post_func): + """ scope node main visit logic. 
+ pre_func and post_func is callbacks + """ + self._reset_name_scope(node) + self.scope_node_stack.append(node) + self._current_name_scope().father = self._nearest_function_scope() + if pre_func: pre_func() + self.generic_visit(node) + if post_func: post_func() + self.scope_node_stack.pop() + + def _visit_controlflow_node(self, node): + + def post_func(): + self._father_name_scope().merge_from(self._current_name_scope()) + self._nearest_function_scope().merge_from( + self._current_name_scope()) + self._current_name_scope().created = self._nearest_function_scope( + ).existed_vars() - node.before_created + # gather created vars into father and used in CreateUndefinedVarTransform + self._nearest_function_scope().created |= self._current_name_scope( + ).created + + def pre_func(): + setattr(node, "before_created", + self._nearest_function_scope().existed_vars()) + + self._visit_scope_node(node, pre_func, post_func) + + def visit_For(self, node): + self._visit_controlflow_node(node) + + def visit_While(self, node): + self._visit_controlflow_node(node) + + def visit_If(self, node): + self._visit_controlflow_node(node) + + def visit_Global(self, node): + self._current_name_scope().globals |= set(node.names) + + def visit_Nonlocal(self, node): + self._current_name_scope().nonlocals |= set(node.names) + + def visit_Attribute(self, node): + self.generic_visit(node) + write_context = (gast.Store, gast.AugStore, gast.Del) + if isinstance(node.ctx, write_context): + name = ast_to_source_code(node).strip() + self._current_name_scope().w_vars.add(name) + + def _get_argument_names(self, node): + """ get all arguments name in the functiondef node. + this node is local to the function and shouldn't + be created. + """ + assert isinstance( + node, gast.FunctionDef), "Input node is not function define node" + names = [a for a in node.args.args] + names.append(node.args.vararg) + names.append(node.args.kwarg) + names = [i.id for i in names if i is not None] + return names + + +def create_get_args_node(names): + """ + Create get_args function as follows: + + def get_args_0(): + nonlocal x, y + return x, y + """ + + def empty_node(): + func_def = """ + def {func_name}(): + return + """.format(func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX)) + return gast.parse(textwrap.dedent(func_def)).body[0] + + assert isinstance(names, (list, tuple)) + mapped = list(filter(lambda n: '.' not in n, names)) + nonlocal_names = sorted( + mapped, + key=mapped.index) # to keep the order, we can't use set() to unique + if not names: + return empty_node() + if not nonlocal_names: + nonlocal_vars = "\n" + else: + nonlocal_vars = "nonlocal " + ",".join(nonlocal_names) + template = """ + def {func_name}(): + {nonlocal_vars} + return {vars}, + """ + func_def = template.format( + func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX), + nonlocal_vars=nonlocal_vars, + vars=",".join(names)) + return gast.parse(textwrap.dedent(func_def)).body[0] + + +def create_set_args_node(names): + """ + Create set_args function as follows: + + def set_args_0(__args): + nonlocal x, y + x, y = __args + """ + + def empty_node(): + func_def = """ + def {func_name}({args}): + pass + """.format(func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), + args=ARGS_NAME) + return gast.parse(textwrap.dedent(func_def)).body[0] + + assert isinstance(names, (list, tuple)) + mapped = list(filter(lambda n: '.' 
not in n, names)) + nonlocal_names = sorted( + mapped, + key=mapped.index) # to keep the order, we can't use set() to deduplicate + if not names: + return empty_node() + if not nonlocal_names: + nonlocal_vars = "\n" + else: + nonlocal_vars = "nonlocal " + ",".join(nonlocal_names) + template = """ + def {func_name}({args}): + {nonlocal_vars} + {vars}, = {args} + """ + func_def = template.format( + func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), + args=ARGS_NAME, + nonlocal_vars=nonlocal_vars, + vars=",".join(names)) + return gast.parse(textwrap.dedent(func_def)).body[0] + + +def create_nonlocal_stmt_nodes(names): + assert isinstance(names, (list, tuple)) + + mapped = list(filter(lambda n: '.' not in n, names)) + names = sorted( + mapped, + key=mapped.index) # to keep the order, we can't use set() to deduplicate + if not names: + return [] + func_code = "nonlocal {}".format(','.join(names)) + return [gast.parse(func_code).body[0]] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 92ef7a3f13d9b..5593658ee6232 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -16,15 +16,17 @@ import six import paddle +import textwrap from paddle.utils import gast -from paddle.fluid import core from paddle.fluid import unique_name from paddle.fluid.framework import Variable -from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_variable __all__ = [ - 'create_bool_as_type', 'create_fill_constant_node', 'to_static_variable', - 'create_undefined_var' + 'create_bool_as_type', + 'create_fill_constant_node', + 'to_static_variable', + 'create_undefined_var', ] @@ -33,12 +35,6 @@ def create_undefined_var(name): return gast.parse(func_code).body[0] -def create_nonlocal_stmt_node(names): - assert isinstance(names, (list, tuple)) - func_code = "nonlocal {}".format(','.join(names)) - return gast.parse(func_code).body[0] - - def create_fill_constant_node(name, value=0): func_code = "{} = paddle.full(shape=[1], ".format(name) if isinstance(value, bool): @@ -66,7 +62,10 @@ def to_static_variable(x): return paddle.full(shape=[1], dtype='float64', fill_value=x) if isinstance(x, six.integer_types): return paddle.full(shape=[1], dtype='int64', fill_value=x) - + if isinstance(x, UndefinedVar) or x is None: + """ for the early return case, we need a variable to represent None; currently we use data_layer_not_check. + """ + return create_undefined_variable() return x @@ -78,3 +77,12 @@ def create_bool_as_type(x, value=True): return paddle.full(shape=[1], fill_value=value, dtype="bool") else: return value + + +def create_bool_node(name, value): + ''' + Create an assign stmt for name = value.
+ ''' + assert isinstance(value, bool) + node = "{} = {}".format(name, value) + return gast.parse(node).body[0] diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index b6847efab1d68..a55bcb9aaaba6 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -103,7 +103,7 @@ def _dygraph_to_static_func_(dygraph_func): @dygraph_to_static_func def func(x): - if fluid.layers.mean(x) < 0: + if paddle.mean(x) < 0: x_v = x - 1 else: x_v = x + 1 @@ -160,7 +160,10 @@ def copy_decorator_attrs(original_func, decorated_obj): return decorated_obj -def declarative(function=None, input_spec=None, build_strategy=None): +def declarative(function=None, + input_spec=None, + build_strategy=None, + property=False): """ Converts imperative dygraph APIs into declarative function APIs. Decorator @declarative handles the Program and Executor of static mode and returns @@ -178,6 +181,7 @@ def declarative(function=None, input_spec=None, build_strategy=None): in the computational graph and memory optimization during the execution of the computational graph. For more information about build_strategy, please refer to :code:`paddle.static.BuildStrategy`. The default is None. + property(bool, Optional): whether the fucntion is python property. The default is False. Returns: @@ -215,7 +219,8 @@ def decorated(python_func): decorated_obj=StaticFunction( function=python_func, input_spec=input_spec, - build_strategy=build_strategy)) + build_strategy=build_strategy, + property=property)) return static_layer @@ -304,6 +309,9 @@ def __init__(self): self._program_only = False self.with_hook = False + # if True, multi `StaticFunction` will share params in one file. + self.combine_params = False + @property def output_spec(self): return self._output_spec @@ -371,7 +379,9 @@ def keep_name_table(self, value): def _parse_save_configs(configs): - supported_configs = ['output_spec', "with_hook"] + supported_configs = [ + 'output_spec', "with_hook", "combine_params", "clip_extra" + ] # input check for key in configs: @@ -384,6 +394,8 @@ def _parse_save_configs(configs): inner_config = _SaveLoadConfig() inner_config.output_spec = configs.get('output_spec', None) inner_config.with_hook = configs.get('with_hook', False) + inner_config.combine_params = configs.get("combine_params", False) + inner_config.clip_extra = configs.get("clip_extra", False) return inner_config @@ -840,6 +852,9 @@ def fun(inputs): # whether outermost layer has pre/post hook, if does, we need also save # these operators in program. with_hook = configs.with_hook + combine_params = configs.combine_params + if combine_params: + configs._program_only = True scope = core.Scope() extra_var_info = dict() @@ -852,10 +867,21 @@ def fun(inputs): functions = [ layer, ] + + all_vars = set() + property_vals = [] # (value, key) for attr_func in functions: if isinstance(layer, Layer): static_func = getattr(inner_layer, attr_func, None) if isinstance(static_func, StaticFunction): + if static_func.is_property: + # property method to be exported + immediate_val = static_func() + property_vals.append( + (immediate_val, + layer.__class__.__name__ + '.' 
+ attr_func)) + continue + concrete_program = static_func.concrete_program_specify_input_spec( inner_input_spec, with_hook=with_hook) elif 'forward' == attr_func: @@ -875,10 +901,15 @@ def fun(inputs): inner_input_spec = None else: continue - else: # When layer is a function if isinstance(attr_func, StaticFunction): + if attr_func.is_property: + # property method to be exported + immediate_val = attr_func() + property_vals.append((immediate_val, attr_func)) + continue + concrete_program = attr_func.concrete_program_specify_input_spec( inner_input_spec) else: @@ -894,6 +925,7 @@ def fun(inputs): '`jit.save` will only save the `Program`, not the parameters. If you have to save the parameters, please make sure that {} is a member function of `paddle.nn.Layer` and the saved parameters are in `state_dict`' .format(layer)) + # when save multi `StaticFunction`, all `StaticFunction` share params. dygraph_state_dict = None if isinstance(inner_layer, Layer): dygraph_state_dict = inner_layer.to_static_state_dict() @@ -913,35 +945,32 @@ def fun(inputs): state_names_dict[var.name] = structured_name state_var_dict[var.name] = var - # 3. share parameters from Layer to scope & record var info - with dygraph.guard(): - for param_or_buffer in concrete_program.parameters: - # share to scope - if param_or_buffer.type == core.VarDesc.VarType.VOCAB: - scr_tensor = param_or_buffer.value().get_map_tensor() - tgt_var = scope.var(param_or_buffer.name) - tgt_var.set_vocab(scr_tensor) - else: - param_or_buffer_tensor = scope.var( - param_or_buffer.name).get_tensor() - #src_tensor = param_or_buffer.value().get_tensor() - src_tensor = state_var_dict[ - param_or_buffer.name].value().get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) - # record var info - if param_or_buffer.name not in extra_var_info: - extra_info_dict = dict() - if param_or_buffer.name in state_names_dict: - extra_info_dict[ - 'structured_name'] = state_names_dict[ - param_or_buffer.name] - extra_info_dict[ - 'stop_gradient'] = param_or_buffer.stop_gradient - if isinstance(param_or_buffer, - (ParamBase, EagerParamBase)): - extra_info_dict[ - 'trainable'] = param_or_buffer.trainable - extra_var_info[param_or_buffer.name] = extra_info_dict + # 3. share parameters from Layer to scope & record var info + with dygraph.guard(): + for param_or_buffer in concrete_program.parameters: + # share to scope + if param_or_buffer.type == core.VarDesc.VarType.VOCAB: + scr_tensor = param_or_buffer.value().get_map_tensor() + tgt_var = scope.var(param_or_buffer.name) + tgt_var.set_vocab(scr_tensor) + else: + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + #src_tensor = param_or_buffer.value().get_tensor() + src_tensor = state_var_dict[ + param_or_buffer.name].value().get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + # record var info + if param_or_buffer.name not in extra_var_info: + extra_info_dict = dict() + if param_or_buffer.name in state_names_dict: + extra_info_dict['structured_name'] = state_names_dict[ + param_or_buffer.name] + extra_info_dict[ + 'stop_gradient'] = param_or_buffer.stop_gradient + if isinstance(param_or_buffer, (ParamBase, EagerParamBase)): + extra_info_dict['trainable'] = param_or_buffer.trainable + extra_var_info[param_or_buffer.name] = extra_info_dict # 4. 
build input & output of save_infernece_model # NOTE(chenweihang): [ Get input variables name ] @@ -989,7 +1018,23 @@ def fun(inputs): params_filename=params_filename, export_for_deployment=configs._export_for_deployment, program_only=configs._program_only, - clip_extra=False) + clip_extra=configs.clip_extra) + + # collect all vars + for var in concrete_program.main_program.list_vars(): + all_vars.add(var) + + # save shared params + if combine_params: + params_filename = file_prefix + INFER_PARAMS_SUFFIX + with scope_guard(scope): + paddle.static.save_vars(Executor(_current_expected_place()), + dirname=model_path, + vars=list( + filter(paddle.fluid.io.is_persistable, + all_vars)), + filename=params_filename) + # TODO: save property # NOTE(chenweihang): [ Save extra variable info ] # save_inference_model will lose some important variable information, including: diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 4b9c50127f046..18950144bc4d4 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -897,7 +897,7 @@ def step(self, loss): check_type(loss, 'loss', Variable, 'ReduceLROnPlateau.step') assert len(loss.shape) == 1 and loss.shape[0] == 1, "the loss.shape " \ "should be (1L,), but the current loss.shape is {}. Maybe that " \ - "you should call fluid.layers.mean to process it first.".format(loss.shape) + "you should call paddle.mean to process it first.".format(loss.shape) self.epoch_num += 1 if self.cooldown_counter > 0: diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9eb044188f0d1..48497f4b9092f 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -1039,8 +1039,11 @@ def to_sparse_coo(self, sparse_dim): def dtype_str(dtype): if dtype in _PADDLE_DTYPE_2_NUMPY_DTYPE: + numpy_dtype = _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] + if numpy_dtype == 'uint16': + numpy_dtype = 'bfloat16' prefix = 'paddle.' - return prefix + _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] + return prefix + numpy_dtype else: # for example, paddle.fluid.core.VarDesc.VarType.LOD_TENSOR return origin(dtype) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 62578eef86cfc..c7bfd19e5a9d0 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -25,6 +25,7 @@ from .data_feeder import convert_dtype from .framework import Program, default_main_program, Variable, Operator from .framework import convert_np_dtype_to_dtype_ + from . import core from . import unique_name from . import compiler @@ -397,15 +398,12 @@ def _is_enable_standalone_executor(): Whether to use experimental executor `StandaloneExecutor`. 
""" flag = False - from ..distributed.fleet import fleet - if fleet._role_maker is not None: - warnings.warn("do not use standalone executor in fleet by default") - env_val = os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', None) - else: - env_val = os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', '1') + # use standalone_executor by default if not distributed + if fleet._role_maker is None and framework._enable_standalone_executor_ is None: + framework._enable_standalone_executor_ = 1 - if env_val in [1, '1', True, 'True', 'true']: + if framework._enable_standalone_executor_ in [1, '1', True, 'True', 'true']: flag = True return flag @@ -537,7 +535,7 @@ def __init__(self, place, main_program, scope): self._scope = scope self._new_exe = self._create_new_executor() - def run(self, feed_names, fetch_list, return_numpy=True): + def run(self, scope, feed_names, fetch_list, return_numpy=True): """ Args: feed_names(list): This parameter represents the input names of the model. @@ -549,17 +547,15 @@ def run(self, feed_names, fetch_list, return_numpy=True): """ fetch_list = self._check_fetch(fetch_list) - tensors = self._new_exe.run(feed_names, fetch_list)._move_to_list() + tensors = self._new_exe.run(scope, feed_names, + fetch_list)._move_to_list() if return_numpy: return as_numpy(tensors, copy=True) else: return tensors def _create_new_executor(self): - # NOTE: It's a trick to set empty start_up program. - startup_program = Program() - new_exe = core.StandaloneExecutor(self._place, startup_program.desc, - self._main_program.desc, self._scope) + new_exe = core.StandaloneExecutor(self._place, self._main_program.desc) return new_exe @@ -1392,21 +1388,31 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, program = pruned_program def _can_use_interpreter_core(program, place): - if core.is_compiled_with_npu() or core.is_compiled_with_mlu( - ) or core.is_compiled_with_ipu() or isinstance( + if core.is_compiled_with_mlu() or isinstance( place, core.CustomPlace): return False compiled = isinstance(program, compiler.CompiledProgram) - # print("compiled is : {}".format(compiled)) - # NOTE(zhiqiu): do not support compiled program now if compiled: - return False - # if program._is_data_parallel and len( - # program._get_places(place, program._places)) == 1: - # return True - # else: - # return False + # Unsupported case 1 : the CompiledProgram is constructed by Graph + if program._program is None: + return False + + # Unsupported case 2 : disabled by FLAGS_CONVERT_GRAPH_TO_PROGRAM + if os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM', + None) not in [1, '1', True, 'True', 'true']: + return False + + # Unsupported case 3: data parallel + if program._is_data_parallel and len( + program._get_places(place, program._places)) != 1: + return False + + # Unsupported case 4: inference + if program._is_inference: + return False + + return True else: if isinstance(program._graph, compiler.CompiledProgram): return False @@ -1437,6 +1443,16 @@ def _can_use_interpreter_core(program, place): # a little bit tricy here, use inner_program before _add_feed_fetch_ops to get key # while use program to geet _StandaloneExecutor if key not in self._executor_cache._cached_executors: + # To apply IR pass, compile the Program to IrGraph and convert it back to Program + if isinstance(program, compiler.CompiledProgram): + program._compile(scope, self.place) + ir_graph = framework.IrGraph(program._graph) + inner_program = ir_graph.to_program() + else: + from paddle.incubate.autograd import prim_enabled, prim2orig + if 
prim_enabled() and program == default_main_program(): + prim2orig() + program = self._add_feed_fetch_ops( program=inner_program, feed=feed, @@ -1470,7 +1486,8 @@ def _can_use_interpreter_core(program, place): cpu_tensor = _as_lodtensor(data, core.CPUPlace()) tensor._copy_from(cpu_tensor, self.place) - return new_exe.run(list(feed.keys()), fetch_list, return_numpy) + return new_exe.run(scope, list(feed.keys()), fetch_list, + return_numpy) compiled = isinstance(program, compiler.CompiledProgram) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3d88a1377a056..d6e4af586699b 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -75,7 +75,7 @@ CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() _dygraph_tracer_ = None -_in_eager_mode_ = (os.environ.get('FLAGS_enable_eager_mode') == '1') +_in_eager_mode_ = (os.environ.get('FLAGS_enable_eager_mode', '1') == '1') _global_expected_place_ = None _current_device = None global_prog_seed = 0 @@ -84,6 +84,8 @@ _already_patch_varbase = False _current_cuda_graph_mode = None _global_flags_ = core.globals() +_enable_standalone_executor_ = (os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', + None)) # Some explanation of our execution system 2022.03 # For now we have 3 kinds of execution system, since we refactored dygraph mode to @@ -259,6 +261,17 @@ def _test_eager_guard(place=None): ipu_stage_attr_name = 'ipu_stage' +@signature_safe_contextmanager +def _enable_standalone_executor(enable=True): + global _enable_standalone_executor_ + original_ = _enable_standalone_executor_ + _enable_standalone_executor_ = enable + try: + yield + finally: + _enable_standalone_executor_ = original_ + + @signature_safe_contextmanager def ipu_shard_guard(index=-1, stage=-1): """ diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py index 806de1e6da900..1b763c6ed5952 100644 --- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py +++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py @@ -16,6 +16,7 @@ import logging import time +import paddle import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet @@ -123,7 +124,7 @@ def model(): auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return datas, avg_cost, predict, train_file_path diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 0c621766b3794..98d7fa6a037a6 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -102,7 +102,7 @@ def test_parallerl_exe(): exe = executor.Executor( core.CUDAPlace(0) if core.is_compiled_with_cuda() and (core.get_cuda_device_count() > 0) else core.CPUPlace()) - loss = layers.mean(out) + loss = paddle.mean(out) loss.persistable = True optimizer.SGD(learning_rate=0.01).minimize(loss) startup_prog.random_seed = 1 diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 3d071fce6c77e..db88331040fa7 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -376,7 +376,7 @@ def name_has_fc(var): vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: - params_var_name = unique_name.generate("saved_params") + params_var_name 
= "saved_params" # give warning when there is no var in model if len(list(vars)) == 0: warnings.warn( @@ -493,7 +493,7 @@ def save_params(executor, dirname, main_program=None, filename=None): predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) @@ -719,7 +719,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None): predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) fluid.io.save_persistables(executor=exe, dirname=dir_path, filename=file_name) @@ -1315,7 +1315,7 @@ def save_inference_model(dirname, predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 4c3a4e5e8fcb1..d7b859612473f 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -21,7 +21,7 @@ from ..framework import Program, Variable, Operator, _non_static_mode, static_only, _in_legacy_dygraph, in_dygraph_mode from ..layer_helper import LayerHelper, unique_name from .nn import logical_and, logical_not, logical_or -from .utils import assert_same_structure, map_structure, hold_mutable_vars, copy_mutable_vars +from .utils import assert_same_structure, map_structure, hold_mutable_vars, copy_mutable_vars, padding_to_same_structure, is_sequence, pack_sequence_as, flatten, to_sequence import numpy import warnings import six @@ -107,9 +107,15 @@ def select_input(inputs, mask): def select_input_with_buildin_type(inputs, mask): from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable - support_ret_buildin_type = (bool, float, six.integer_types) + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_var_like false_var, true_var = inputs + if isinstance(false_var, UndefinedVar) and isinstance( + true_var, UndefinedVar): + """ None -> UndefinedVar, so the real value is a [None, UndefinedVar] or [None, None], we just return None. + """ + return None + if isinstance(false_var, Variable) and isinstance(true_var, Variable): return select_input(inputs, mask) @@ -132,6 +138,27 @@ def select_input_with_buildin_type(inputs, mask): "Return results from different branches in cond are not same type: " "false_var returned by fasle_fn is '{}' and true_var of true_fn is " "'{}'".format(type(false_var), type(true_var))) + elif ((isinstance(false_var, UndefinedVar) + and isinstance(true_var, (Variable, ) + support_ret_buildin_type)) + or (isinstance(true_var, UndefinedVar) + and isinstance(false_var, + (Variable, ) + support_ret_buildin_type))): + + def create_var_if_not_undefined_var(a): + if isinstance(a, UndefinedVar): return a + return to_static_variable(a) + + def create_like_if_undefined_var(a, b): + if isinstance(a, UndefinedVar): return create_undefined_var_like(b) + return a + + # TODO(xiongkun): add warning here. 
+ true_var, false_var = create_var_if_not_undefined_var( + true_var), create_var_if_not_undefined_var(false_var) + inputs = [ + create_like_if_undefined_var(false_var, true_var), + create_like_if_undefined_var(true_var, false_var) + ] else: raise TypeError( "Unsupported return type of true_fn and false_fn in cond: false_var " @@ -1154,12 +1181,19 @@ def _complete(self): }) +support_ret_buildin_type = (bool, float, six.integer_types) + + def assign_skip_lod_tensor_array(input, output): """ Assign input to output, but skip the process of copying LoDTensorArray unless it's created in while_block. """ - if not isinstance(input, Variable) and not isinstance(input, core.VarBase): - output = input + if not isinstance(input, (Variable, core.VarBase)): + if isinstance(output, Variable) and isinstance( + input, support_ret_buildin_type): + assign(input, output) + else: + output = input return if input.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: @@ -1266,6 +1300,7 @@ def body(i, ten): if not isinstance(output_vars, (list, tuple)): output_vars = [output_vars] try: + loop_vars = _deal_with_undefined_var(output_vars, loop_vars) assert_same_structure(output_vars, loop_vars, check_types=False) except ValueError as e: raise ValueError( @@ -1277,6 +1312,36 @@ def body(i, ten): return loop_vars +def _deal_with_undefined_var(output_vars, loop_vars): + """ Deal with undefined var cases: we create undefined variables based on the results of body(). + In Dy2Static, we use undefined vars to represent the vars created in control flow. This function + expands the loop_vars and replaces the original loop_vars. + 1. UndefinedVar = Variable # create a variable + 2. UndefinedVar = None # create an undefined var with RETURN_NO_VALUE_MAGIC_NUM + 3. UndefinedVar = List(int) # create a list of variables + 4. UndefinedVar = value # create a variable + """ + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_variable + + def create_var_like(o_var): + if isinstance(o_var, + (Variable, ) + support_ret_buildin_type) or o_var is None: + return create_undefined_variable() + if isinstance(o_var, (tuple, list)): + return [create_undefined_variable() for i in range(len(o_var))] + + if len(output_vars) != len(loop_vars): + raise ValueError("output_vars and loop_vars should have the same length.") + + results = [] + for o_var, l_var in zip(output_vars, loop_vars): + if isinstance(l_var, UndefinedVar) or l_var is None: + results.append(create_var_like(o_var)) + else: + results.append(l_var) + return results + + def lod_rank_table(x, level=0): """ LoD Rank Table Operator. Given an input variable **x** and a level number @@ -2377,7 +2442,7 @@ def copy_var_to_parent_block(var, layer_helper): return parent_block_var -def cond(pred, true_fn=None, false_fn=None, name=None): +def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): """ This API returns ``true_fn()`` if the predicate ``pred`` is true else ``false_fn()`` . Users could also set ``true_fn`` or ``false_fn`` to @@ -2426,6 +2491,10 @@ def cond(pred, true_fn=None, false_fn=None, name=None): name(str, optional): The default value is ``None`` . Normally users don't have to set this parameter. For more information, please refer to :ref:`api_guide_Name` . + return_names(sequence of string, optional): The default value is ``None`` . + Normally users don't have to set this parameter. A sequence of strings + representing the names of the returned vars. The structure of the sequence must + be the same as the return values of true_fn and false_fn.
Returns: Tensor|list(Tensor)|tuple(Tensor): returns ``true_fn()`` if the @@ -2536,12 +2605,30 @@ def false_func(): "true_fn returns non-None while false_fn returns None") # Merge true and false output if they are not None - try: - assert_same_structure(true_output, false_output, check_types=False) - except ValueError as e: + if return_names is None: + return_names = ["no name"] * len(to_sequence(true_output)) + else: + """ + dy2static will set the return_names and expand the return values to UndefinedVar. + """ + true_output, false_output = expand_undefined_var( + true_output, false_output, return_names) + true_output, false_output = change_none_to_undefinedvar( + true_output, false_output) + if len(to_sequence(true_output)) != len(to_sequence(false_output)): raise ValueError( - "Incompatible return values of true_fn and false_fn in cond: {}". - format(e)) + "true_fn returns {} vars, but false_fn returns {} vars, which is not equal" + .format(len(to_sequence(true_output)), + len(to_sequence(false_output)))) + for true_out, false_out, return_name in zip(to_sequence(true_output), + to_sequence(false_output), + to_sequence(return_names)): + try: + assert_same_structure(true_out, false_out, check_types=False) + except ValueError as e: + raise ValueError( + "Incompatible return values of `{}` in true_fn and false_fn in cond: {}" + .format(return_name, e)) mask = cast(pred, dtype='int32') merge_func = lambda false_var, true_var: select_input_with_buildin_type( @@ -2550,6 +2637,46 @@ def false_func(): return merged_output + +def change_none_to_undefinedvar(nest1, nest2): + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar + + def map_fn(x): + if x is None: return UndefinedVar("padding") + return x + + nest1_out = pack_sequence_as(nest1, list(map(map_fn, flatten(nest1)))) + nest2_out = pack_sequence_as(nest2, list(map(map_fn, flatten(nest2)))) + return nest1_out, nest2_out + + +def expand_undefined_var(nest1, nest2, names): + """ TODO: make this function recursive. + nest1: Var1, (UndefinedVar, [1,2,3]) + nest2: Var2, ([1,2,3,4], UndefinedVar) + In this case, we should not expand recursively.
+ """ + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar + from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_VALUE_PREFIX + + def pack_undefined_var_as(seq): + return pack_sequence_as(seq, + [UndefinedVar("padding") for i in flatten(seq)]) + + def map_fn(n1, n2, name): + if not name.startswith(RETURN_VALUE_PREFIX) and (isinstance( + n1, UndefinedVar) or n1 is None): + return pack_undefined_var_as(n2) + return n1 + + nest1_out = list( + map(map_fn, to_sequence(nest1), to_sequence(nest2), to_sequence(names))) + nest2_out = list( + map(map_fn, to_sequence(nest2), to_sequence(nest1), to_sequence(names))) + if not is_sequence(nest1): nest1_out = nest1_out[0] + if not is_sequence(nest2): nest2_out = nest2_out[0] + return nest1_out, nest2_out + + def _error_message(what, arg_name, op_name, right_value, error_value): error_message = "{what} of '{arg_name}' in {op_name} must be " \ "{right_value}, but received: {error_value}.".format( diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 9a7ab0ebbb5aa..ddcc1db84b752 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -3731,52 +3731,13 @@ def distribute_fpn_proposals(fpn_rois, refer_level=4, refer_scale=224) """ - num_lvl = max_level - min_level + 1 - - if _non_static_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', - refer_level, 'refer_scale', refer_scale) - multi_rois, restore_ind, rois_num_per_level = _C_ops.distribute_fpn_proposals( - fpn_rois, rois_num, num_lvl, num_lvl, *attrs) - return multi_rois, restore_ind, rois_num_per_level - - check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], - 'distribute_fpn_proposals') - helper = LayerHelper('distribute_fpn_proposals', **locals()) - dtype = helper.input_dtype('fpn_rois') - multi_rois = [ - helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) - ] - - restore_ind = helper.create_variable_for_type_inference(dtype='int32') - - inputs = {'FpnRois': fpn_rois} - outputs = { - 'MultiFpnRois': multi_rois, - 'RestoreIndex': restore_ind, - } - - if rois_num is not None: - inputs['RoisNum'] = rois_num - rois_num_per_level = [ - helper.create_variable_for_type_inference(dtype='int32') - for i in range(num_lvl) - ] - outputs['MultiLevelRoIsNum'] = rois_num_per_level - - helper.append_op(type='distribute_fpn_proposals', - inputs=inputs, - outputs=outputs, - attrs={ - 'min_level': min_level, - 'max_level': max_level, - 'refer_level': refer_level, - 'refer_scale': refer_scale - }) - if rois_num is not None: - return multi_rois, restore_ind, rois_num_per_level - return multi_rois, restore_ind + return paddle.vision.ops.distribute_fpn_proposals(fpn_rois=fpn_rois, + min_level=min_level, + max_level=max_level, + refer_level=refer_level, + refer_scale=refer_scale, + rois_num=rois_num, + name=name) @templatedoc() diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 1ad4e3c4298c2..00c2aa56fa3e0 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -1230,6 +1230,63 @@ def softmax_with_cross_entropy(logits, return_softmax, axis) +def identity_loss(x, reduction="none"): + r"""Marks a tensor as being part of the loss calculation for IPU. + + This operator is used to handle on the (final) loss of a model so that + it is used as the start of backpropagation. 
+ + When `reduction` is `none`, return raw `Out`. + + When `reduction` is `mean`, return + + .. math:: + Out = MEAN(Out) + + When `reduction` is `sum`, return + + .. math:: + Out = SUM(Out) + + Parameters: + x (Variable): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of + additional dimensions. It's data type should be float32, float64 on CPU and float16, float32 on IPU. + reduction(str|int, optional): Reduce the loss output. Supported string values are: 'sum', 'mean', 'none' + the corresponding int values are 0, 1, 2 respectively. The default value is "none". + + Returns: + Variable: The loss ``Tensor`` with the specified reduction applied. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import paddle + paddle.enable_static() + loss = fluid.data(name="loss", shape=[-1, 1], dtype="float32") + out = paddle.incubate.identity_loss(loss, reduction=1) + """ + if isinstance(reduction, str): + reduction = {"sum": 0, "mean": 1, "none": 2}.get(reduction.lower()) + if reduction is None: + raise Exception("Unsupported reduction type.") + + if _non_static_mode(): + return _C_ops.identity_loss(x, "reduction", reduction) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], "identity_loss") + attrs = {'reduction': reduction} + helper = LayerHelper('identity_loss', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type="identity_loss", + inputs={"X": x}, + outputs={"Out": out}, + attrs=attrs) + return out + + def rank_loss(label, left, right, name=None): r""" diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d7f0feb103c5f..050d6bfcb6bbb 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13089,7 +13089,7 @@ def mean(x, name=None): input = fluid.layers.data( name='data', shape=[2, 3], dtype='float32') - mean = fluid.layers.mean(input) + mean = paddle.mean(input) """ if _in_legacy_dygraph(): diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index ca11727221f23..be8045b7bb8a5 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -125,6 +125,13 @@ def _yield_flat_nest(nest): yield n +def to_sequence(nest): + if is_sequence(nest): + return nest + else: + return [nest] + + def flatten(nest): """ :alias_main: paddle.flatten @@ -260,6 +267,26 @@ def _recursive_assert_same_structure(nest1, nest2, check_types): _recursive_assert_same_structure(n1, n2, check_types) +def padding_to_same_structure(nest1, nest2, obj=None): + + def _padding_to_same_structure_single(value, obj): + + def change_none_to_obj(x): + if x is None: return obj + return x + + if is_sequence(value): + value = pack_sequence_as( + value, [change_none_to_obj(item) for item in flatten(value)]) + else: + value = change_none_to_obj(value) + return value + + nest1 = _padding_to_same_structure_single(nest1, obj) + nest2 = _padding_to_same_structure_single(nest2, obj) + return nest1, nest2 + + def assert_same_structure(nest1, nest2, check_types=True): """ Confirm two nested structures with the same structure. 
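The reworked `cond` above zips `to_sequence(true_output)`, `to_sequence(false_output)` and `to_sequence(return_names)` so it can report exactly which named return value has an incompatible structure between the two branches. A minimal sketch of the new `to_sequence` helper's behaviour, assuming this patch is applied and paddle is importable:

    from paddle.fluid.layers.utils import to_sequence

    # A single value is wrapped into a one-element list; sequences pass through
    # unchanged, so branch outputs can be compared element by element.
    assert to_sequence(3) == [3]
    assert to_sequence([3, 4]) == [3, 4]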
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 315382262a0f3..c97809a069d5c 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -913,7 +913,7 @@ def backward(self, program = loss.block.program assert len(loss.shape) == 1 and loss.shape[0] == 1, \ "The loss.shape should be (1L,), but the current loss.shape is {}. " \ - "Maybe that you should call fluid.layers.mean to process the current loss.".format( + "Maybe that you should call paddle.mean to process the current loss.".format( loss.shape) parameter_list = parameter_list if parameter_list \ else self._parameter_list @@ -6834,7 +6834,7 @@ class LookaheadOptimizer(object): label = fluid.layers.data(name="label", shape=[1], dtype="int64") y = fluid.layers.fc(input=[x], size=2, act="softmax") loss = fluid.layers.cross_entropy(input=y, label=label) - loss = fluid.layers.mean(x=loss) + loss = paddle.mean(x=loss) sgd = fluid.optimizer.SGD(learning_rate=0.01) optimizer = fluid.optimizer.LookaheadOptimizer(sgd, alpha=0.5, diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 6acee6dc11c89..92e29202b28b8 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -12,5 +12,6 @@ add_subdirectory(unittests) add_subdirectory(book) add_subdirectory(custom_op) add_subdirectory(custom_kernel) +add_subdirectory(custom_runtime) set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index d96e640f77a96..941ff43ab7d69 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -48,7 +48,7 @@ def convolution_net(data, size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, accuracy, prediction @@ -93,7 +93,7 @@ def gate_common(ipt, hidden, size): last = fluid.layers.sequence_last_step(rnn()) prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, accuracy, prediction @@ -132,7 +132,7 @@ def stacked_lstm_net(data, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, accuracy, prediction diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 71ba7f0c79ec9..62aaefedde780 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -56,16 +56,16 @@ def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16): with amp.bf16.bf16_guard(): y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) else: y_predict = fluid.layers.fc(input=x, size=1, act=None) with amp.bf16.bf16_guard(): cost = 
fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) else: y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) lr = 5e-3 if use_bf16 else 1e-3 sgd_optimizer = fluid.optimizer.SGD(learning_rate=lr) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index e2f78a0f36f7b..0b31a62e8e8f7 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -126,7 +126,7 @@ def train(net_type, use_cuda, save_dirname, is_local): predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) acc = fluid.layers.accuracy(input=predict, label=label) # Test program diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index cb962493e7ac8..c8ffe0cb49cb6 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -160,7 +160,7 @@ def train(use_cuda, save_dirname=None, is_local=True): param_attr=fluid.ParamAttr( name='crfw', learning_rate=mix_hidden_lr)) - avg_cost = fluid.layers.mean(crf_cost) + avg_cost = paddle.mean(crf_cost) # TODO(qiao) # check other optimizers and check why out will be NAN diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 5301f9aa7607c..e81061f665477 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -34,7 +34,7 @@ def loss_net(hidden, label): prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) acc = fluid.layers.accuracy(input=prediction, label=label) return prediction, avg_loss, acc diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 0a26a03eb878b..048bfac344e79 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -153,7 +153,7 @@ def model(): label = layers.data(name='score', shape=[1], dtype='float32') square_cost = layers.square_error_cost(input=scale_infer, label=label) - avg_cost = layers.mean(square_cost) + avg_cost = paddle.mean(square_cost) return scale_infer, avg_cost diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index 9499583c07bae..694ed70c04dea 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -158,7 +158,7 @@ def seq_to_seq_net(): dtype='int64', lod_level=1) cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost, prediction diff --git a/python/paddle/fluid/tests/book/test_word2vec_book.py b/python/paddle/fluid/tests/book/test_word2vec_book.py index 9e79fd3f523f8..b1325abea01b6 
100644 --- a/python/paddle/fluid/tests/book/test_word2vec_book.py +++ b/python/paddle/fluid/tests/book/test_word2vec_book.py @@ -85,7 +85,7 @@ def __network__(words): size=dict_size, act='softmax') cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost, predict_word word_dict = paddle.dataset.imikolov.build_dict() diff --git a/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt b/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt index b2bdfac908069..af700c22038e3 100644 --- a/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt @@ -1,2 +1,15 @@ -py_test(test_custom_kernel_dot SRCS test_custom_kernel_dot.py) -py_test(test_custom_kernel_load SRCS test_custom_kernel_load.py) +file( + GLOB TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +set(CUSTOM_ENVS + PADDLE_SOURCE_DIR=${PADDLE_SOURCE_DIR} + PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} + CUSTOM_DEVICE_ROOT=${CMAKE_BINARY_DIR}/python/paddle/fluid/tests/custom_kernel +) + +foreach(TEST_OP ${TEST_OPS}) + py_test(${TEST_OP} SRCS ${TEST_OP}.py ENVS ${CUSTOM_ENVS}) +endforeach() diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py index a94307161d431..e162daf2b87e1 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py @@ -48,10 +48,9 @@ def build_extensions(self): os.path.join(site_packages_path, 'paddle', 'include'), ] # include path third_party -compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], - 'build/third_party') +compile_third_party_path = os.path.join(os.environ['PADDLE_BINARY_DIR'], + 'third_party') paddle_custom_kernel_include += [ - os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags os.path.join(compile_third_party_path, 'install/glog/include'), # glog ] diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index 94de1a39ccfbb..efe5368cdca56 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -50,10 +50,9 @@ def build_extensions(self): site_packages_path)) # include path third_party -compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], - 'build/third_party') +compile_third_party_path = os.path.join(os.environ['PADDLE_BINARY_DIR'], + 'third_party') paddle_custom_kernel_include += [ - os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags os.path.join(compile_third_party_path, 'install/glog/include'), # glog ] diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py index e28bfe00e7c4f..130f74c06d554 100644 --- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py +++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py @@ -31,10 +31,6 @@ def setUp(self): cur_dir, sys.executable) os.system(cmd) - # set environment for loading and registering compiled custom kernels - # only valid 
in current process - os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir - def test_custom_kernel_dot_run(self): # test dot run x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8) @@ -52,9 +48,6 @@ def test_custom_kernel_dot_run(self): "custom kernel dot out: {},\n numpy dot out: {}".format( out.numpy(), result)) - def tearDown(self): - del os.environ['CUSTOM_DEVICE_ROOT'] - class TestCustomKernelDotC(unittest.TestCase): @@ -67,10 +60,6 @@ def setUp(self): cur_dir, sys.executable) os.system(cmd) - # set environment for loading and registering compiled custom kernels - # only valid in current process - os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir - def test_custom_kernel_dot_run(self): # test dot run x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8) @@ -88,9 +77,6 @@ def test_custom_kernel_dot_run(self): "custom kernel dot out: {},\n numpy dot out: {}".format( out.numpy(), result)) - def tearDown(self): - del os.environ['CUSTOM_DEVICE_ROOT'] - if __name__ == '__main__': if os.name == 'nt' or sys.platform.startswith('darwin'): diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h index 9cec48f9c99b5..ffe89fde0470e 100644 --- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h +++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h @@ -66,7 +66,7 @@ struct ReluFunctor { return; } #endif - LAUNCH_RELU_KERNEL(paddle::platform::CPUDeviceContext); + LAUNCH_RELU_KERNEL(phi::CPUContext); #undef LAUNCH_RELU_KERNEL } diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt new file mode 100644 index 0000000000000..482dc9cb1f3f6 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -0,0 +1,11 @@ +if(WITH_CUSTOM_DEVICE) + file( + GLOB TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") + string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + + foreach(TEST_OP ${TEST_OPS}) + py_test(${TEST_OP} SRCS ${TEST_OP}.py) + endforeach() +endif() diff --git a/python/paddle/autograd/utils.py b/python/paddle/fluid/tests/custom_runtime/__init__.py similarity index 72% rename from python/paddle/autograd/utils.py rename to python/paddle/fluid/tests/custom_runtime/__init__.py index 6b8865f4d7df0..97043fd7ba688 100644 --- a/python/paddle/autograd/utils.py +++ b/python/paddle/fluid/tests/custom_runtime/__init__.py @@ -11,16 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import typing - -from paddle.fluid import framework - - -def as_tensors(xs): - if isinstance(xs, framework.Variable): - return (xs, ) - elif isinstance(xs, typing.Sequence): - return tuple(xs) - else: - return xs diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py new file mode 100644 index 0000000000000..00d7255a83f21 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import site +import unittest +import numpy as np + + +class TestCustomCPUPlugin(unittest.TestCase): + + def setUp(self): + # compile so and set to current path + cur_dir = os.path.dirname(os.path.abspath(__file__)) + cmd = 'rm -rf PaddleCustomDevice && git clone https://github.com/PaddlePaddle/PaddleCustomDevice.git && cd PaddleCustomDevice/backends/custom_cpu && mkdir build && cd build && cmake .. && make -j8' + os.system(cmd) + + # set environment for loading and registering compiled custom kernels + # only valid in current process + os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( + cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build') + + def test_custom_device(self): + import paddle + + with paddle.fluid.framework._test_eager_guard(): + self._test_custom_device_dataloader() + self._test_custom_device_mnist() + self._test_eager_backward_api() + self._test_custom_device_dataloader() + self._test_custom_device_mnist() + + def _test_custom_device_dataloader(self): + import paddle + + paddle.set_device('custom_cpu') + dataset = paddle.vision.datasets.MNIST( + mode='test', + transform=paddle.vision.transforms.Compose([ + paddle.vision.transforms.CenterCrop(20), + paddle.vision.transforms.RandomResizedCrop(14), + paddle.vision.transforms.Normalize(), + paddle.vision.transforms.ToTensor() + ])) + loader = paddle.io.DataLoader(dataset, + batch_size=32, + num_workers=1, + shuffle=True) + for image, label in loader: + self.assertTrue(image.place.is_custom_place()) + self.assertTrue(label.place.is_custom_place()) + break + + def _test_custom_device_mnist(self): + import paddle + + class MNIST(paddle.nn.Layer): + + def __init__(self): + super(MNIST, self).__init__() + self.shape = 1 * 28 * 28 + self.size = 10 + self.output_weight = self.create_parameter( + [self.shape, self.size]) + self.accuracy = paddle.metric.Accuracy() + + def forward(self, inputs, label=None): + x = paddle.reshape(inputs, shape=[-1, self.shape]) + x = paddle.matmul(x, self.output_weight) + x = paddle.nn.functional.softmax(x) + if label is not None: + self.accuracy.reset() + correct = self.accuracy.compute(x, label) + self.accuracy.update(correct) + acc = self.accuracy.accumulate() + return x, acc + else: + return x + + paddle.set_device('custom_cpu') + dataset = paddle.vision.datasets.MNIST( + mode='train', + transform=paddle.vision.transforms.Compose( + [paddle.vision.transforms.ToTensor()])) + loader = paddle.io.DataLoader(dataset, + batch_size=64, + num_workers=1, + shuffle=True) + + mnist = MNIST() + sgd = paddle.optimizer.SGD(learning_rate=0.01, + parameters=mnist.parameters()) + + data = next(loader()) + img = data[0] + label = data[1] + label_int32 = paddle.cast(label, 'int32') + + pred, acc = mnist(img, label_int32) + avg_loss = paddle.nn.functional.cross_entropy(pred, label_int32) + avg_loss.backward() + sgd.step() + sgd.clear_grad() + + self.assertTrue(pred.place.is_custom_place()) + + def _test_eager_backward_api(self): + x = np.random.random([2, 2]).astype("float32") + y = np.random.random([2, 2]).astype("float32") + grad = np.ones([2, 2]).astype("float32") + + import paddle + 
paddle.set_device('custom_cpu') + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z1_tensor = paddle.matmul(x_tensor, y_tensor) + z2_tensor = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward([z1_tensor, z2_tensor], [grad_tensor, None]) + + self.assertTrue(x_tensor.grad.place.is_custom_place()) + + def tearDown(self): + del os.environ['CUSTOM_DEVICE_ROOT'] + + +if __name__ == '__main__': + if os.name == 'nt' or sys.platform.startswith('darwin'): + # only support Linux now + exit() + unittest.main() diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py index f37090f67e257..887180d3f01fd 100644 --- a/python/paddle/fluid/tests/test_beam_search_decoder.py +++ b/python/paddle/fluid/tests/test_beam_search_decoder.py @@ -145,7 +145,7 @@ def train_main(use_cuda): dtype='int64', lod_level=1) cost = layers.cross_entropy(input=rnn_out, label=label) - avg_cost = layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index e3b20c323929a..68380f8187083 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -35,7 +35,7 @@ label = fluid.layers.data(name='y', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) prog_clip = prog.clone() prog_clip.block(0).var(hidden1.name)._set_error_clip( diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py index 12d33d1c724df..4e140032dd8dc 100644 --- a/python/paddle/fluid/tests/test_if_else_op.py +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -66,7 +66,7 @@ def not_test_raw_api(self): mask=cond, x=image) loss = layers.cross_entropy(input=prob, label=label) - avg_loss = layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer.minimize(avg_loss, startup_prog) @@ -124,7 +124,7 @@ def not_test_ifelse(self): prob = ie() loss = layers.cross_entropy(input=prob[0], label=label) - avg_loss = layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer.minimize(avg_loss, startup_prog) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6df9c8c4269ca..28bd796efdcee 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -72,7 +72,10 @@ list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard) list(APPEND DIST_TEST_OPS test_auto_parallel_save_load) list(APPEND DIST_TEST_OPS test_auto_parallel_autoconvert) list(APPEND DIST_TEST_OPS test_collective_process_group) +list(APPEND DIST_TEST_OPS test_collective_alltoall_single) list(APPEND DIST_TEST_OPS test_eager_dist_api) +list(APPEND DIST_TEST_OPS test_collective_batch_isend_irecv) +list(APPEND DIST_TEST_OPS test_collective_reduce_scatter) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -202,8 +205,6 @@ endif() # Temporally disable test_deprecated_decorator list(REMOVE_ITEM TEST_OPS test_deprecated_decorator) -list(REMOVE_ITEM TEST_OPS test_tensordot) - if(WIN32) list(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception) list(REMOVE_ITEM TEST_OPS test_trainer_desc) @@ -334,7 +335,11 @@ if((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load) list(REMOVE_ITEM TEST_OPS test_auto_parallel_autoconvert) list(REMOVE_ITEM TEST_OPS test_collective_process_group) + list(REMOVE_ITEM TEST_OPS test_collective_alltoall_single) list(REMOVE_ITEM TEST_OPS test_eager_dist_api) + list(REMOVE_ITEM TEST_OPS test_collective_batch_isend_irecv) + list(REMOVE_ITEM TEST_OPS test_collective_reduce_scatter) + elseif(WITH_GPU) if(${CUDNN_VERSION} VERSION_LESS 7100) list(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -1412,7 +1417,7 @@ set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) -#set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) +set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) @@ -1569,8 +1574,10 @@ if(WITH_DISTRIBUTE set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_alltoall_single PROPERTIES TIMEOUT 60) set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 100) - + set_tests_properties(test_collective_batch_isend_irecv PROPERTIES TIMEOUT 100) + set_tests_properties(test_collective_reduce_scatter PROPERTIES TIMEOUT 100) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 200) @@ -1687,3 +1694,22 @@ if($ENV{USE_STANDALONE_EXECUTOR}) set_tests_properties(test_imperative_mnist_sorted_gradient PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) endif() + +if(WITH_CINN AND WITH_TESTING) + set_tests_properties( + test_resnet50_with_cinn + PROPERTIES + LABELS + "RUN_TYPE=CINN" + ENVIRONMENT + FLAGS_allow_cinn_ops="conv2d;conv2d_grad;elementwise_add;elementwise_add_grad;relu;relu_grad;sum" + ) + set_tests_properties( + test_parallel_executor_run_cinn + PROPERTIES + LABELS + "RUN_TYPE=CINN" + ENVIRONMENT + FLAGS_allow_cinn_ops="conv2d;conv2d_grad;elementwise_add;elementwise_add_grad;relu;relu_grad;sum" + ) +endif() diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py index 1b387c081208d..0e24e94d456b0 100644 --- a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py +++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py @@ -61,7 +61,7 @@ def run_inference_pruning_test(self, get_mask_gen_func, def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): with fluid.program_guard(self.main_program, self.startup_program): - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=self.predict, label=self.label)) optimizer = paddle.incubate.asp.decorate( diff --git 
a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py index 27b4361852f6a..e35430b046ac5 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -242,7 +242,7 @@ def test_inference_pruning(self): def test_training_pruning(self): with fluid.program_guard(self.main_program, self.startup_program): - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=self.predict, label=self.label)) optimizer = sparsity.decorate( diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py index c6b3d00cac249..8770d4cb3b575 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py @@ -48,7 +48,7 @@ def build_model(): with fluid.program_guard(self.main_program, self.startup_program): self.img, self.label, predict = build_model() - self.loss = fluid.layers.mean( + self.loss = paddle.mean( fluid.layers.cross_entropy(input=predict, label=self.label)) self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py index 1bb5c1477b29b..4796bf364a207 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py @@ -66,7 +66,7 @@ def test_inference_pruning(self): def test_training_pruning(self): with fluid.program_guard(self.main_program, self.startup_program): - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=self.predict, label=self.label)) optimizer = paddle.incubate.asp.decorate( diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py index dc5316d254fd5..66543514e5369 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py @@ -141,7 +141,7 @@ def build_model(): with fluid.program_guard(self.main_program, self.startup_program): self.img, self.label, predict = build_model() - self.loss = fluid.layers.mean( + self.loss = paddle.mean( fluid.layers.cross_entropy(input=predict, label=self.label)) self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) self.optimizer = paddle.incubate.asp.decorate(self.optimizer) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index e0eb04e2535c5..6c51ce1fffae3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -51,6 +51,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) + py_test_modules(test_base_cost MODULES test_base_cost ENVS ${dist_ENVS}) py_test_modules(test_dist_context MODULES test_dist_context ENVS ${dist_ENVS}) py_test_modules(test_prim_dist_op MODULES test_prim_dist_op ENVS ${dist_ENVS}) + py_test_modules(test_to_static MODULES test_to_static 
ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py new file mode 100644 index 0000000000000..0fbe4f5bd3d09 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py @@ -0,0 +1,234 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import os +import json +import tempfile + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.reshard import Resharder +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost import CommContext +from paddle.distributed.auto_parallel.cost.base_cost import build_comp_desc_from_dist_op +from paddle.distributed.auto_parallel.cost.base_cost import build_comm_desc_from_dist_op +from paddle.distributed.auto_parallel.cost.base_cost import build_comm_costs_from_descs +from paddle.distributed.auto_parallel.cost.base_cost import build_comp_costs_from_descs +from paddle.distributed.auto_parallel.cost.base_cost import build_dp_costs +from paddle.distributed.auto_parallel.cost import AllreduceSumOpCost +from paddle.distributed.auto_parallel.cost import _g_op_cost_factory +from test_cluster import cluster_json + +paddle.enable_static() +_global_parallel_strategy = "dp_mp_pp" +_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]]) + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear(d_model, + dim_feedforward, + weight_attr, + bias_attr=bias_attr) + self.linear1 = nn.Linear(dim_feedforward, + d_model, + weight_attr, + bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + auto.shard_tensor(self.linear0.weight, + dist_attr={ + "process_mesh": PP_MESH_0, + "dims_mapping": [-1, 1] + }) + auto.shard_tensor(self.linear1.weight, + dist_attr={ + "process_mesh": PP_MESH_1, + "dims_mapping": [1, -1] + }) + + 
out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data(name="input", + shape=[batch_size, hidden_size], + dtype='float32') + label = static.data(name="label", + shape=[batch_size, 1], + dtype='float32') + + fill_constant_out = paddle.fluid.layers.fill_constant_batch_size_like( + input=input, shape=[batch_size], value=1, dtype="int32") + embedding = paddle.nn.Embedding(10, hidden_size, sparse=True) + embedding_out = embedding(fill_constant_out) + + auto.shard_tensor(input, + dist_attr={ + "process_mesh": PP_MESH_0, + "dims_mapping": [0, -1] + }) + auto.shard_tensor(label, + dist_attr={ + "process_mesh": PP_MESH_1, + "dims_mapping": [0, -1] + }) + + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(embedding_out) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.process_mesh = _global_process_mesh + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + fleet._user_defined_strategy = fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context + + # serial forward & backward completion + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) + dist_context.block_state.parse_forward_blocks(complete_train_program) + params_grads = parallelizer._generate_backward(complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + return train_program, startup_program, params_grads + + +class TestBaseCost(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + + def test_base_cost(self): + # Build cluster + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + train_program, startup_program, params_grads = get_prog( + train_program, startup_program, dist_context, rank_id) + + for op in train_program.global_block().ops: + dist_op = dist_context.get_dist_op_for_program(op) + if dist_op: + processes = dist_op.dist_attr.process_mesh.processes + comp_descs = build_comp_desc_from_dist_op(dist_op, dist_context) + self.assertTrue(isinstance(comp_descs, dict) and comp_descs) + var_names = None + if op.input_arg_names: + var_names = op.input_arg_names[0] + comm_descs = build_comm_desc_from_dist_op("c_allreduce_sum", + dist_op, + dist_context, + var_names, + attrs=None, + parallel_axis=0, + group_ranks=None) + self.assertTrue(isinstance(comm_descs, dict) and comm_descs) + comm_descs = 
build_comm_desc_from_dist_op( + "c_allreduce_sum", + dist_op, + dist_context, + var_names, + attrs=None, + parallel_axis=None, + group_ranks=processes) + self.assertTrue(isinstance(comm_descs, dict) and comm_descs) + + comm_costs = build_comm_costs_from_descs( + AllreduceSumOpCost, dist_context, processes, comm_descs, + cluster) + self.assertTrue(comm_costs) + + comp_costs = build_comp_costs_from_descs( + _g_op_cost_factory[op.type], dist_context, processes, + comp_descs, cluster) + self.assertTrue(comp_costs) + + result = [] + build_dp_costs(result, dist_op, dist_context, var_names[0], + None, 0, cluster) + self.assertTrue(result) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py index dd9b0110dbebd..2fa01bdfa6a59 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py @@ -19,6 +19,7 @@ import paddle from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cluster import get_default_cluster cluster_json = """ { @@ -1997,6 +1998,10 @@ def test_single_machine(self): self.assertTrue(devices == [0, 1, 2, 3]) self.assertTrue(involved_machine_count == 1) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + def test_multi_machine(self): # Build cluster cluster_json_path = os.path.join(self.temp_dir.name, @@ -2018,6 +2023,21 @@ def test_multi_machine(self): self.assertTrue(devices == [5, 6, 7, 10]) self.assertTrue(involved_machine_count == 2) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + def test_default_config_cluster(self): + cluster = Cluster() + cluster.gen_default_config_cluster(device_count=8) + # check machines and devices + self.assertTrue(cluster.get_num_machines() == 1) + self.assertTrue(cluster.get_num_devices_per_machine() == 8) + + def test_default_cluster(self): + cluster = get_default_cluster() + self.assertTrue(isinstance(cluster, Cluster)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py index 215385787880c..5744cf6d39206 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py @@ -154,6 +154,10 @@ def test_cross_machine_comm_cost(self): comm_context=comm_context) self.assertTrue(recv_op_cost.time > 0) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py index fe46131225759..6b0db61b984c5 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py @@ -19,8 +19,8 @@ import paddle import paddle.distributed.auto_parallel.cost as cost_model -from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc -from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str +from 
paddle.distributed.auto_parallel.cost.base_cost import build_comp_desc_from_op +from paddle.distributed.auto_parallel.cost.base_cost import build_comp_desc_str_for_predict from paddle.distributed.auto_parallel.cost.base_cost import calc_time_by_modeling from paddle.distributed.auto_parallel.cluster import Cluster from paddle.distributed.auto_parallel.cost import CommContext @@ -60,8 +60,8 @@ def test_comp_cost(self): break matmul_v2_cost = cost_model._g_op_cost_factory["matmul_v2"]( op=matmul_v2_op) - desc = parse_to_desc(op=matmul_v2_op) - desc_str = parse_desc_to_str(desc) + desc = build_comp_desc_from_op(op=matmul_v2_op) + desc_str = build_comp_desc_str_for_predict(desc) self.assertIsNotNone(desc_str) self.assertTrue(check_cost(matmul_v2_cost.cost)) time = calc_time_by_modeling(op=matmul_v2_op) @@ -92,11 +92,29 @@ def test_comm_cost(self): op_desc=desc, comm_context=CommContext(cluster)) self.assertTrue(check_cost(allreduce_cost.cost)) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + def test_cost_estimator(self): + # Build cluster + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + train_program = paddle.static.Program() - cost_estimator = cost_model.CostEstimator(train_program) + cost_estimator = cost_model.CostEstimator(train_program, + cluster=cluster) self.assertIsNotNone(cost_estimator) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py new file mode 100644 index 0000000000000..4e4fb9b5825ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py @@ -0,0 +1,122 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
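
The new test_to_static.py added here (its body follows) drives the auto-parallel Engine end to end and checks that execution leaves dynamic mode once Engine.prepare has converted the model. A condensed sketch of that flow, mirroring the constructor and prepare/fit arguments used in the test; the random dataset and the single-layer model are illustrative stand-ins, and like the test itself this is intended to run under the unit-test environment rather than as a standalone tutorial:

import numpy as np
import paddle
from paddle.io import Dataset
from paddle.static import InputSpec
from paddle.distributed.auto_parallel.engine import Engine

class RandomDataset(Dataset):
    # stand-in for the MyDataset defined in the test below
    def __init__(self, num_samples):
        super(RandomDataset, self).__init__()
        self.num_samples = num_samples

    def __getitem__(self, idx):
        return (np.random.uniform(size=1024).astype('float32'),
                np.random.randint(0, 9, dtype='int64'))

    def __len__(self):
        return self.num_samples

model = paddle.nn.Sequential(paddle.nn.Linear(1024, 10))
engine = Engine(model=model,
                inputs_spec=InputSpec([4, 1024], 'float32', 'x'),
                labels_spec=InputSpec([4], 'int64', 'label'),
                strategy=None)
engine.prepare(optimizer=paddle.optimizer.SGD(learning_rate=1e-5,
                                              parameters=model.parameters()),
               loss=paddle.nn.CrossEntropyLoss(),
               metrics=paddle.metric.Accuracy())
engine.fit(RandomDataset(120), batch_size=4)      # training over the static programs
engine.evaluate(RandomDataset(12), batch_size=4)  # same programs reused for eval
engine.predict(RandomDataset(12), batch_size=4)
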
+ +import unittest + +import os +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed.auto_parallel as auto +import paddle.distributed.fleet as fleet + +from paddle.io import Dataset +from paddle.static import InputSpec +from paddle.fluid.framework import _non_static_mode +from paddle.distributed.auto_parallel.engine import Engine + +batch_size = 4 +batch_num = 30 +hidden_size = 1024 +class_num = 10 + + +class MyDataset(Dataset): + + def __init__(self, num_samples): + super(MyDataset, self).__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=hidden_size).astype("float32") + label = np.random.randint(0, class_num - 1, dtype="int64") + return input, label + + def __len__(self): + return self.num_samples + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range)) + + self.linear0 = nn.Linear(d_model, + dim_feedforward, + weight_attr, + bias_attr=None) + self.linear1 = nn.Linear(dim_feedforward, + d_model, + weight_attr, + bias_attr=None) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=None) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + out = self.dropout(out) + out = self.linear2(out) + + return out + + +class TestToStatic(unittest.TestCase): + + def test_to_static(self): + + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.SGD(learning_rate=0.00001, + parameters=mlp.parameters()) + + dataset = MyDataset(batch_num * batch_size) + + inputs = InputSpec([batch_size, hidden_size], 'float32', 'x') + labels = InputSpec([batch_size], 'int64', 'label') + + engine = Engine(model=mlp, + inputs_spec=inputs, + labels_spec=labels, + strategy=None) + assert _non_static_mode() == True + + engine.prepare(optimizer=optimizer, + loss=loss, + metrics=paddle.metric.Accuracy()) + + assert _non_static_mode() == False + engine.fit(dataset, batch_size=batch_size) + engine.evaluate(dataset, batch_size=batch_size) + engine.predict(dataset, batch_size=batch_size) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py index 7d738d3678926..688a31b78de00 100755 --- a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py @@ -96,7 +96,7 @@ def mlp_pretrain_forward(train_program, start_program): predict = mlp(input) cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost, train_program, start_program diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 832ecc61ee190..45c0a08efe828 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt 
+++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -17,7 +17,7 @@ endforeach() set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 200) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) -set_tests_properties(test_gradients_and_minimize PROPERTIES TIMEOUT 60) +set_tests_properties(test_minimize PROPERTIES TIMEOUT 60) if(NOT WIN32) set_tests_properties(test_autograd_functional_prim PROPERTIES TIMEOUT 60) endif() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index a98b509f963c7..5eda21eb4c14b 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid import paddle.compat as cpt import paddle.nn.functional as F -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, _in_eager_without_dygraph_check import config @@ -78,9 +78,9 @@ def vjp_test(): xs = self.gen_inputs(inputs) if v is not None: v = self.gen_inputs(v) - outputs, inputs_grad = paddle.autograd.vjp(func, xs, v) + outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs, v) else: - outputs, inputs_grad = paddle.autograd.vjp(func, xs) + outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs) return outputs, inputs_grad def grad_test(): @@ -116,14 +116,14 @@ def jvp_test(): xs = self.gen_inputs(inputs) if v is not None: v = self.gen_inputs(v) - outputs, outputs_grad = paddle.autograd.jvp( + outputs, outputs_grad = paddle.incubate.autograd.jvp( func, xs, v, create_graph=create_graph, allow_unused=allow_unused) else: - outputs, outputs_grad = paddle.autograd.jvp( + outputs, outputs_grad = paddle.incubate.autograd.jvp( func, xs, create_graph=create_graph, @@ -223,6 +223,11 @@ def test_all_cases(self): self.func_vjp_nested() self.func_vjp_aliased_input() + def test_input_single_tensor(self): + self.assertIsInstance( + paddle.incubate.autograd.vjp(paddle.tanh, paddle.rand((3, 4)))[1], + paddle.fluid.framework.Variable) + @utils.place(config.DEVICES) @utils.parameterize( @@ -233,8 +238,8 @@ class TestVJPException(unittest.TestCase): def func_vjp(self): with self.assertRaises(self.expected_exception): - paddle.autograd.vjp(self.fun, paddle.to_tensor(self.xs), - paddle.to_tensor(self.v)) + paddle.incubate.autograd.vjp(self.fun, paddle.to_tensor(self.xs), + paddle.to_tensor(self.v)) def test_all_cases(self): with _test_eager_guard(): @@ -243,8 +248,10 @@ def test_all_cases(self): def jac(grad_fn, f, inputs): - assert grad_fn in [paddle.autograd.vjp, paddle.autograd.jvp] - if grad_fn is paddle.autograd.jvp: + assert grad_fn in [ + paddle.incubate.autograd.vjp, paddle.incubate.autograd.jvp + ] + if grad_fn is paddle.incubate.autograd.jvp: vs = [paddle.zeros_like(x) for x in inputs] else: outputs = f(*inputs) @@ -265,7 +272,7 @@ def jac(grad_fn, f, inputs): JJ_cols.append(d_outs) # JJ is the fully unrolled jacobian JJ = paddle.stack(JJ_cols) - if grad_fn is paddle.autograd.vjp: + if grad_fn is paddle.incubate.autograd.vjp: JJ = JJ.t() return JJ @@ -279,8 +286,8 @@ def func_jvp_i1o1(self): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + 
forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o1(self): @@ -289,8 +296,8 @@ def func_jvp_i2o1(self): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2(self): @@ -299,8 +306,8 @@ def func_jvp_i2o2(self): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2_omitting_v(self): @@ -309,9 +316,9 @@ def func_jvp_i2o2_omitting_v(self): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - results_omitting_v = paddle.autograd.jvp(f, inputs) + results_omitting_v = paddle.incubate.autograd.jvp(f, inputs) v = [paddle.ones_like(x) for x in inputs] - results_with_v = paddle.autograd.jvp(f, inputs, v) + results_with_v = paddle.incubate.autograd.jvp(f, inputs, v) self.check_results(results_omitting_v, results_with_v) def test_all_cases(self): @@ -334,7 +341,7 @@ def test_all_cases(self): ('multi_in_single_out', paddle.matmul, (np.random.rand(2, 2), np.random.rand(2, 2))), )) -class TestJacobianClassNoBatch(unittest.TestCase): +class TestJacobianNoBatch(unittest.TestCase): def setUp(self): self._dtype = self.xs[0].dtype if isinstance( @@ -349,7 +356,7 @@ def setUp(self): def func_jacobian(self): xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, xs, False) + self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, False) self._expected = self._get_expected() Index = collections.namedtuple('Index', ('type', 'value')) @@ -387,7 +394,7 @@ def test_all_cases(self): ('3d_in_3d_out', utils.square, np.random.rand(2, 3, 4)), ('multi_in_single_out', utils.square, np.random.rand(2, 3)), )) -class TestJacobianClassBatchFirst(unittest.TestCase): +class TestJacobianBatchFirst(unittest.TestCase): def setUp(self): self._dtype = self.xs[0].dtype if isinstance( @@ -402,7 +409,7 @@ def setUp(self): def func_jacobian(self): xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, xs, True) + self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, True) self._expected = self._get_expected() Index = collections.namedtuple('Index', ('type', 'value')) @@ -444,7 +451,7 @@ def test_all_cases(self): self.func_jacobian() -class TestHessianClassNoBatch(unittest.TestCase): +class TestHessianNoBatch(unittest.TestCase): @classmethod def setUpClass(self): @@ -470,7 +477,7 @@ def func(x): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False - hessian = paddle.autograd.Hessian(func, self.x) + hessian = paddle.incubate.autograd.Hessian(func, self.x) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -484,7 +491,7 @@ def func(x, y): numerical_hessian = 
utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False self.y.stop_gradient = False - hessian = paddle.autograd.Hessian(func, [self.x, self.y]) + hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y]) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, rtol=self.rtol, @@ -500,7 +507,7 @@ def func(x, y): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False self.y.stop_gradient = False - hessian = paddle.autograd.Hessian(func, [self.x, self.y]) + hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y]) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -514,7 +521,7 @@ def func(x): func, self.x, self.numerical_delta, self.np_dtype) numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False - hessian = paddle.autograd.Hessian(func, self.x) + hessian = paddle.incubate.autograd.Hessian(func, self.x) assert hessian[:].stop_gradient == False np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -526,7 +533,7 @@ def func(x): return x * x with self.assertRaises(RuntimeError): - paddle.autograd.Hessian(func, paddle.ones([3])) + paddle.incubate.autograd.Hessian(func, paddle.ones([3])) def test_all_cases(self): with _test_eager_guard(): @@ -544,7 +551,7 @@ def test_all_cases(self): self.func_out_not_single() -class TestHessianClassBatchFirst(unittest.TestCase): +class TestHessianBatchFirst(unittest.TestCase): @classmethod def setUpClass(self): @@ -572,7 +579,7 @@ def func(x): expected = utils._compute_numerical_batch_hessian( func, self.x, self.numerical_delta, self.np_dtype) - H = paddle.autograd.Hessian(func, self.x, is_batched=True) + H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True) actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -596,7 +603,8 @@ def func(x, y): self.x.stop_gradient = False self.y.stop_gradient = False - H = paddle.autograd.Hessian(func, [self.x, self.y], is_batched=True) + H = paddle.incubate.autograd.Hessian(func, [self.x, self.y], + is_batched=True) actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -620,8 +628,8 @@ def func(x, y): utils.MatrixFormat.NBM, utils.MatrixFormat.BNM) - actual = paddle.autograd.Hessian(func, [self.x, self.y], - is_batched=True)[:] + actual = paddle.incubate.autograd.Hessian(func, [self.x, self.y], + is_batched=True)[:] np.testing.assert_allclose(actual, expected, @@ -638,7 +646,7 @@ def func(x): x = self.x.clone() x.stop_gradient = True - H = paddle.autograd.Hessian(func, self.x, is_batched=True)[:] + H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True)[:] actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -652,7 +660,9 @@ def func(x): return (x * x) with self.assertRaises(RuntimeError): - paddle.autograd.Hessian(func, paddle.ones((3, 3)), is_batched=True) + paddle.incubate.autograd.Hessian(func, + paddle.ones((3, 3)), + is_batched=True) def test_all_cases(self): with _test_eager_guard(): @@ -670,829 +680,6 @@ def test_all_cases(self): self.func_out_not_single() -class TestHessian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol 
= config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x) - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y]) - for i in range(len(hessian)): - for j in range(len(hessian[0])): - np.testing.assert_allclose(hessian[i][j].numpy(), - numerical_hessian[i][j], self.rtol, - self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y], - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y], - allow_unused=True) - for i in range(len(hessian)): - for j in range(len(hessian[0])): - if i == j == 0: - np.testing.assert_allclose(hessian[i][j].numpy(), - numerical_hessian[i][j], - self.rtol, self.atol) - else: - assert hessian[i][j] is None - - def func_create_graph_false(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x) - assert hessian.stop_gradient == True - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - try: - paddle.grad(hessian, self.x) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x): - return paddle.sum(F.sigmoid(x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x, create_graph=True) - assert hessian.stop_gradient == False - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - triple_grad = paddle.grad(hessian, self.x) - assert triple_grad is not None - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - 
self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestHessianFloat64(TestHessian): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - -class TestBatchHessian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.x_shape = (5, 2) - self.weight_shape = (2, 4) - self.y_shape = (5, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True) - np.testing.assert_allclose(hessian, numerical_hessian, self.rtol, - self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.matmul(x * x * y * y, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y]) - - shape_tensor = paddle.to_tensor(numerical_hessian).astype("float64") - hessian_reshape = np.reshape(hessian, (shape_tensor.shape)) - np.testing.assert_allclose(hessian_reshape, numerical_hessian, - self.rtol, self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y], - allow_unused=True) - - for i in range(len(hessian)): - for j in range(len(hessian[0])): - if i == j == 0: - numerical_hessian = np.stack( - (numerical_hessian[i][j], numerical_hessian[i][j + 1]), - axis=0) - np.testing.assert_allclose(hessian[i][j], numerical_hessian, - self.rtol, self.atol) - else: - assert hessian[i][j] is None - - def 
func_create_graph_false(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x) - assert hessian.stop_gradient == True - np.testing.assert_allclose(hessian.numpy(), numerical_hessian, - self.rtol, self.atol) - try: - paddle.grad(hessian, self.x) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True) - assert hessian.stop_gradient == False - np.testing.assert_allclose(hessian.numpy(), numerical_hessian, - self.rtol, self.atol) - triple_grad = paddle.grad(hessian, self.x) - assert triple_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestBatchHessianFloat64(TestBatchHessian): - - @classmethod - def setUpClass(self): - self.x_shape = (5, 2) - self.weight_shape = (2, 4) - self.y_shape = (5, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - -class TestVHP(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - self.vx = paddle.rand(shape=self.shape, dtype=self.dtype) - self.vy = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_func_output = func(self.x).numpy() - numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - - def func_multi_input(self): - - def func(x, y): - 
return paddle.sum(paddle.matmul(x, y)) - - numerical_func_output = func(self.x, self.y).numpy() - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], - [self.vx, self.vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], - [self.vx, self.vy]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - for i in range(len(vhp)): - np.testing.assert_allclose(vhp[i].numpy(), numerical_vhp[i], - self.rtol, self.atol) - - def func_v_default(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_func_output = func(self.x, self.y).numpy() - vx = paddle.ones(self.vx.shape, dtype=self.vx.dtype) - vy = paddle.ones(self.vy.shape, dtype=self.vy.dtype) - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], [vx, vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - for i in range(len(vhp)): - np.testing.assert_allclose(vhp[i].numpy(), numerical_vhp[i], - self.rtol, self.atol) - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - numerical_func_output = func(self.x, self.y).numpy() - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], - [self.vx, self.vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], - [self.vx, self.vy]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - - def func_create_graph_true(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x): - return paddle.sum(F.sigmoid(x)) - - numerical_func_output = func(self.x).numpy() - numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - assert vhp[0].stop_gradient == False - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - triple_grad = paddle.grad(vhp, self.x) - assert triple_grad is not None - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_v_default() - self.func_multi_input() - self.func_single_input() - self.func_allow_unused_true() - self.func_create_graph_true() - self.setUpClass() - self.func_v_default() - self.func_multi_input() - self.func_single_input() - self.func_allow_unused_true() - self.func_create_graph_true() - - -class TestJacobian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (4, 4) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = 1e-4 - self.rtol = 1e-3 - self.atol = 1e-3 - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input_and_single_output(self): - - def func(x): - return paddle.matmul(x, x) - - numerical_jacobian = 
_compute_numerical_jacobian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, self.x) - np.testing.assert_allclose(jacobian.numpy(), numerical_jacobian[0][0], - self.rtol, self.atol) - - def func_single_input_and_multi_output(self): - - def func(x): - return paddle.matmul(x, x), x * x - - numerical_jacobian = _compute_numerical_jacobian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, self.x) - for i in range(len(jacobian)): - np.testing.assert_allclose(jacobian[i].numpy(), - numerical_jacobian[i][0], self.rtol, - self.atol) - - def func_multi_input_and_single_output(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - - def func_multi_input_and_multi_output(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x, y): - return paddle.matmul(x, y), x * y - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for i in range(len(jacobian)): - for j in range(len(jacobian[0])): - np.testing.assert_allclose(jacobian[i][j].numpy(), - numerical_jacobian[i][j], self.rtol, - self.atol) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.matmul(x, x) - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.matmul(x, x) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y], - allow_unused=True) - np.testing.assert_allclose(jacobian[0].numpy(), - numerical_jacobian[0][0], self.rtol, - self.atol) - assert jacobian[1] is None - - def func_create_graph_false(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == True - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - try: - paddle.grad(jacobian[0], [self.x, self.y]) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - 
self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y], - create_graph=True) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == False - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - double_grad = paddle.grad(jacobian[0], [self.x, self.y]) - assert double_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_multi_input_and_multi_output() - self.func_multi_input_and_single_output() - self.func_single_input_and_multi_output() - self.func_single_input_and_single_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_multi_input_and_multi_output() - self.func_multi_input_and_single_output() - self.func_single_input_and_multi_output() - self.func_single_input_and_single_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestJacobianFloat64(TestJacobian): - - @classmethod - def setUpClass(self): - self.shape = (4, 4) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = 1e-7 - self.rtol = 1e-7 - self.atol = 1e-7 - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - -class TestJacobianBatch(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.x_shape = (4, 2) - self.weight_shape = (2, 4) - self.y_shape = (4, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = 1e-4 - self.rtol = 1e-3 - self.atol = 1e-3 - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - def func_batch_single_input_and_batch_single_output(self): - - def func(x): - return paddle.matmul(paddle.matmul(x, self.weight), self.y) - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian( - func, - self.x, - ) - - self.assertTrue( - np.allclose(batch_jacobian.numpy().all(), - numerical_jacobian[0][0].all())) - - def func_batch_single_input_and_batch_multi_output(self): - - def func(x): - return paddle.matmul(paddle.matmul(x, self.weight), self.y), x * x - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian( - func, - self.x, - ) - - for i in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[i].numpy(), - numerical_jacobian[i][0], self.rtol, - self.atol) - - def func_batch_multi_input_and_batch_single_output(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - - for j in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - - def func_batch_multi_input_and_batch_multi_output(self): - - def func(x, y): - return x * y, x * y - - 
numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - - for i in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[i], numerical_jacobian[i], - self.rtol, self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return x * x - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return x * x - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y], - allow_unused=True) - - np.testing.assert_allclose(jacobian[0].numpy(), - numerical_jacobian[0][0], self.rtol, - self.atol) - assert jacobian[1] is None - - def func_create_graph_false(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == True - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - try: - paddle.grad(jacobian[0], [self.x, self.y]) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y], - create_graph=True) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == False - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - double_grad = paddle.grad(jacobian[0], [self.x, self.y]) - assert double_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_batch_single_input_and_batch_single_output() - self.func_batch_single_input_and_batch_multi_output() - self.func_batch_multi_input_and_batch_single_output() - self.func_batch_multi_input_and_batch_multi_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_batch_single_input_and_batch_single_output() - self.func_batch_single_input_and_batch_multi_output() - self.func_batch_multi_input_and_batch_single_output() - self.func_batch_multi_input_and_batch_multi_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestJacobianBatchFloat64(TestJacobianBatch): - - @classmethod - def setUpClass(self): - self.x_shape = (12, 2) - self.weight_shape = (2, 12) - self.y_shape = (12, 2) - self.dtype = 'float64' - 
self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('eps') - self.rtol = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('rtol') - self.atol = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('atol') - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - if __name__ == "__main__": + np.random.seed(2022) unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py index f75460df6b52d..d17420c904546 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py @@ -145,5 +145,130 @@ def wrapper(fun, args): atol=self._atol) +@utils.place(config.DEVICES) +@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), ( + ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'), + ('binary_float32', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'), + ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'), + ('binary_float64', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'), +)) +class TestJvpPrim(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.args = [arg.astype(cls.dtype) for arg in cls.args] + cls._rtol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('rtol') + cls._atol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('atol') + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_jacobian_prim(self): + + def wrapper(fun, args): + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + static_args = [ + paddle.static.data(f'arg{i}', arg.shape, self.dtype) + for i, arg in enumerate(args) + ] + for arg in static_args: + arg.stop_gradient = False + _, jvp_res = paddle.incubate.autograd.jvp(fun, static_args) + if paddle.incubate.autograd.prim_enabled(): + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(sp) + jvp_res = exe.run( + mp, + feed={f'arg{i}': arg + for i, arg in enumerate(args)}, + fetch_list=[jvp_res]) + return jvp_res + + paddle.incubate.autograd.enable_prim() + prim_jvp = wrapper(self.fun, self.args) + paddle.incubate.autograd.disable_prim() + orig_jvp = wrapper(self.fun, self.args) + + np.testing.assert_allclose(orig_jvp, + prim_jvp, + rtol=self._rtol, + atol=self._atol) + + +@utils.place(config.DEVICES) +@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), ( + ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'), + ('binary_float32', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'), + ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'), + ('binary_float64', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'), +)) +class TestVjpPrim(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.args = [arg.astype(cls.dtype) for arg in cls.args] + cls._rtol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('rtol') + cls._atol = config.TOLERANCE.get( + 
cls.dtype).get('first_order_grad').get('atol') + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_jacobian_prim(self): + + def wrapper(fun, args): + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + static_args = [ + paddle.static.data(f'arg{i}', arg.shape, self.dtype) + for i, arg in enumerate(args) + ] + for arg in static_args: + arg.stop_gradient = False + _, vjp_res = paddle.incubate.autograd.vjp(fun, static_args) + if paddle.incubate.autograd.prim_enabled(): + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(sp) + vjp_res = exe.run( + mp, + feed={f'arg{i}': arg + for i, arg in enumerate(args)}, + fetch_list=[vjp_res]) + return vjp_res + + paddle.incubate.autograd.enable_prim() + prim_vjp = wrapper(self.fun, self.args) + paddle.incubate.autograd.disable_prim() + orig_vjp = wrapper(self.fun, self.args) + + for orig, prim in zip(orig_vjp, prim_vjp): + np.testing.assert_allclose(orig, + prim, + rtol=self._rtol, + atol=self._atol) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py index 4e01ad5382c91..9b2098d37b882 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py @@ -59,7 +59,8 @@ def _vjp(self): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v, stop_gradient=self.stop_gradient) - ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v) + ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs, + static_v) exe.run(sp) return exe.run(mp, feed=feed, fetch_list=[ys, xs_grads]) @@ -103,7 +104,8 @@ def _vjp(self): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v) - ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v) + ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs, + static_v) self.exe.run(sp) return self.exe.run(mp, feed, fetch_list=[ys, xs_grads]) @@ -214,7 +216,7 @@ def run_test_by_fullmatrix(self, pd_f, np_f, inps, batch=False): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: _, nrow, ncol = JJ.shape else: @@ -244,7 +246,7 @@ def run_test_by_rows(self, pd_f, np_f, inps, batch=False): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: nbatch, nrow, ncol = JJ.shape rows = [JJ[:, i, :] for i in range(nrow)] @@ -269,7 +271,7 @@ def run_test_by_entries(self, pd_f, np_f, inps, batch=False): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: nbatch, nrow, ncol = JJ.shape entries = [ @@ -390,7 +392,7 @@ def run_test_by_fullmatrix(self, 
pd_f, inps, np_hess, batch=False): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - HH = paddle.autograd.functional.Hessian(pd_f, xs, is_batched=batch) + HH = paddle.incubate.autograd.Hessian(pd_f, xs, is_batched=batch) nrow, ncol = HH.shape full_hessian = HH[:] exe = fluid.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py b/python/paddle/fluid/tests/unittests/autograd/test_minimize.py similarity index 56% rename from python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py rename to python/paddle/fluid/tests/unittests/autograd/test_minimize.py index 67ebe01d9f027..10259802c6933 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_minimize.py @@ -13,82 +13,16 @@ # limitations under the License. import unittest -import numpy as np +import numpy as np import paddle from paddle.incubate.autograd.primx import prim2orig -from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled +from paddle.incubate.autograd.utils import (disable_prim, enable_prim, + prim_enabled) paddle.enable_static() -class TestGradients(unittest.TestCase): - - def test_third_order(self): - enable_prim() - main = paddle.static.Program() - startup = paddle.static.Program() - with paddle.static.program_guard(main, startup): - x = paddle.static.data(name='x', shape=[1], dtype='float32') - x2 = paddle.multiply(x, x) - x3 = paddle.multiply(x2, x) - x4 = paddle.multiply(x3, x) - - grad1, = paddle.static.gradients([x4], [x]) - grad2, = paddle.static.gradients([grad1], [x]) - grad3, = paddle.static.gradients([grad2], [x]) - - prim2orig(main.block(0)) - - feed = {x.name: np.array([2.]).astype('float32')} - fetch_list = [grad3.name] - result = [np.array([48.])] - - place = paddle.CPUPlace() - if paddle.device.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(startup) - outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) - disable_prim() - - def test_fourth_order(self): - enable_prim() - main = paddle.static.Program() - startup = paddle.static.Program() - with paddle.static.program_guard(main, startup): - x = paddle.static.data(name='x', shape=[1], dtype='float32') - x2 = paddle.multiply(x, x) - x3 = paddle.multiply(x2, x) - x4 = paddle.multiply(x3, x) - x5 = paddle.multiply(x4, x) - out = paddle.sqrt(x5 + x4) - - grad1, = paddle.static.gradients([out], [x]) - grad2, = paddle.static.gradients([grad1], [x]) - grad3, = paddle.static.gradients([grad2], [x]) - grad4, = paddle.static.gradients([grad3], [x]) - - prim2orig(main.block(0)) - - feed = { - x.name: np.array([2.]).astype('float32'), - } - fetch_list = [grad4.name] - # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) - result = [np.array([-0.27263762711])] - - place = paddle.CPUPlace() - if paddle.device.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(startup) - outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) - disable_prim() - - class TestMinimize(unittest.TestCase): def model(self, x, w, bias, opt): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index 0137f4103fbb3..09bd64ee67834 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ 
b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -23,6 +23,117 @@ import utils +@utils.place(config.DEVICES) +@utils.parameterize( + (utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'dtype'), + (('matmul', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), None, 'float32'), )) +class TestWithoutProgramGuard(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.xs = tuple(x.astype(cls.dtype) for x in cls.xs) + cls._rtol = config.TOLERANCE.get(str( + cls.dtype)).get("first_order_grad").get("rtol") + cls._atol = config.TOLERANCE.get(str( + cls.dtype)).get("first_order_grad").get("atol") + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_forward_grad_without_program_guard(self): + + def with_program_guard(): + paddle.incubate.autograd.enable_prim() + sp = paddle.static.Program() + mp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + feed, static_xs, static_v = utils.gen_static_data_and_feed( + self.xs, self.v, stop_gradient=False) + ys = self.fun(*static_xs) if isinstance( + static_xs, typing.Sequence) else self.fun(static_xs) + ys_grad = paddle.incubate.autograd.forward_grad( + ys, static_xs, static_v) + paddle.incubate.autograd.prim2orig(mp.block(0)) + exe = paddle.static.Executor() + exe.run(sp) + out = exe.run(mp, feed=feed, fetch_list=ys_grad) + paddle.incubate.autograd.disable_prim() + return out + + def without_program_guard(): + paddle.incubate.autograd.enable_prim() + feed, static_xs, static_v = utils.gen_static_data_and_feed( + self.xs, self.v, stop_gradient=False) + ys = self.fun(*static_xs) if isinstance( + static_xs, typing.Sequence) else self.fun(static_xs) + ys_grad = paddle.incubate.autograd.forward_grad( + ys, static_xs, static_v) + sp = paddle.fluid.framework.default_startup_program() + mp = paddle.fluid.framework.default_main_program() + exe = paddle.static.Executor() + exe.run(sp) + out = exe.run(mp, feed=feed, fetch_list=ys_grad) + paddle.incubate.autograd.disable_prim() + return out + + expected = with_program_guard() + actual = without_program_guard() + self.assertEqual(type(actual), type(expected)) + np.testing.assert_allclose(np.concatenate(actual), + np.concatenate(expected), + rtol=self._rtol, + atol=self._atol) + + def test_grad_without_program_guard(self): + + def with_program_guard(): + paddle.incubate.autograd.enable_prim() + sp = paddle.static.Program() + mp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + feed, static_xs, static_v = utils.gen_static_data_and_feed( + self.xs, self.v, stop_gradient=False) + ys = self.fun(*static_xs) if isinstance( + static_xs, typing.Sequence) else self.fun(static_xs) + xs_grad = paddle.incubate.autograd.grad(ys, static_xs, static_v) + paddle.incubate.autograd.prim2orig(mp.block(0)) + exe = paddle.static.Executor() + exe.run(sp) + out = exe.run(mp, feed=feed, fetch_list=xs_grad) + paddle.incubate.autograd.disable_prim() + return out + + def without_program_guard(): + paddle.incubate.autograd.enable_prim() + feed, static_xs, static_v = utils.gen_static_data_and_feed( + self.xs, self.v, stop_gradient=False) + ys = self.fun(*static_xs) if isinstance( + static_xs, typing.Sequence) else self.fun(static_xs) + xs_grad = paddle.incubate.autograd.grad(ys, static_xs, static_v) + sp = paddle.fluid.framework.default_startup_program() + mp = paddle.fluid.framework.default_main_program() + exe = paddle.static.Executor() + 
exe.run(sp) + out = exe.run(mp, feed=feed, fetch_list=xs_grad) + paddle.incubate.autograd.disable_prim() + return out + + expected = with_program_guard() + actual = without_program_guard() + for i, j in zip(actual, expected): + self.assertEqual(type(i), type(j)) + np.testing.assert_allclose(np.concatenate(i), + np.concatenate(j), + rtol=self._rtol, + atol=self._atol) + + @utils.place(config.DEVICES) @utils.parameterize( (utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'dtype'), @@ -37,7 +148,7 @@ ('input_gradients_not_none', paddle.matmul, (np.random.rand(3, 3), np.random.rand(3, 3)), (np.random.rand(3, 3), np.random.rand(3, 3)), 'float64'))) -class TestForwardGradients(unittest.TestCase): +class TestForwardGrad(unittest.TestCase): @classmethod def setUpClass(cls): @@ -55,7 +166,7 @@ def tearDown(self): paddle.incubate.autograd.disable_prim() paddle.disable_static() - def test_forward_gradients(self): + def test_forward_grad(self): def expected(): paddle.incubate.autograd.disable_prim() @@ -64,7 +175,8 @@ def expected(): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v, stop_gradient=False) - _, ys_grad = paddle.autograd.jvp(self.fun, static_xs, static_v) + _, ys_grad = paddle.incubate.autograd.jvp( + self.fun, static_xs, static_v) exe = paddle.static.Executor() exe.run(sp) out = exe.run(mp, feed=feed, fetch_list=ys_grad) @@ -80,7 +192,8 @@ def actual(): self.xs, self.v, stop_gradient=False) ys = self.fun(*static_xs) if isinstance( static_xs, typing.Sequence) else self.fun(static_xs) - ys_grad = primapi.forward_gradients(ys, static_xs, static_v) + ys_grad = paddle.incubate.autograd.forward_grad( + ys, static_xs, static_v) paddle.incubate.autograd.prim2orig(mp.block(0)) exe = paddle.static.Executor() exe.run(sp) @@ -106,7 +219,7 @@ def test_prim_disabled(self): self.xs, self.v, stop_gradient=False) ys = self.fun(*static_xs) if isinstance( static_xs, typing.Sequence) else self.fun(static_xs) - ys_grad = primapi.forward_gradients(ys, static_xs, static_v) + ys_grad = primapi.forward_grad(ys, static_xs, static_v) paddle.incubate.autograd.prim2orig(mp.block(0)) exe = paddle.static.Executor() exe.run(sp) @@ -116,14 +229,161 @@ def test_prim_disabled(self): def test_illegal_param(self): paddle.incubate.autograd.enable_prim() with self.assertRaises(TypeError): - primapi.forward_gradients(1, paddle.static.data('inputs', - shape=[1])) + primapi.forward_grad(1, paddle.static.data('inputs', shape=[1])) with self.assertRaises(TypeError): - primapi.forward_gradients(paddle.static.data('targets', shape=[1]), - 1) + primapi.forward_grad(paddle.static.data('targets', shape=[1]), 1) + paddle.incubate.autograd.disable_prim() + + +class TestGrad(unittest.TestCase): + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_third_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + + grad1, = paddle.incubate.autograd.grad([x4], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + + paddle.incubate.autograd.prim2orig(main.block(0)) + + feed = {x.name: np.array([2.]).astype('float32')} + 
fetch_list = [grad3.name] + result = [np.array([48.])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-5) + paddle.incubate.autograd.disable_prim() + + def test_fourth_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + x5 = paddle.multiply(x4, x) + out = paddle.sqrt(x5 + x4) + + grad1, = paddle.incubate.autograd.grad([out], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + grad4, = paddle.incubate.autograd.grad([grad3], [x]) + + paddle.incubate.autograd.prim2orig(main.block(0)) + + feed = { + x.name: np.array([2.]).astype('float32'), + } + fetch_list = [grad4.name] + # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) + result = [np.array([-0.27263762711])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-5) + paddle.incubate.autograd.disable_prim() + + def test_fifth_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + x5 = paddle.multiply(x4, x) + x6 = paddle.multiply(x5, x) + out = x6 + x5 + + grad1, = paddle.incubate.autograd.grad([out], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + grad4, = paddle.incubate.autograd.grad([grad3], [x]) + grad5, = paddle.incubate.autograd.grad([grad4], [x]) + + paddle.incubate.autograd.prim2orig() + + feed = { + x.name: np.array([2.]).astype('float32'), + } + fetch_list = [grad5.name] + result = [np.array([1560.0])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-5) paddle.incubate.autograd.disable_prim() + def test_disable_prim(self): + + def actual(x: np.array): + paddle.incubate.autograd.disable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + var_x = paddle.static.data('x', shape=x.shape, dtype=x.dtype) + var_x.stop_gradient = False + y = paddle.tanh(var_x) + y_grad = paddle.incubate.autograd.grad(y, var_x) + y_second_grad = paddle.incubate.autograd.grad(y_grad, var_x) + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, + feed={'x': x}, + fetch_list=[y_grad, y_second_grad]) + + def expect(x: np.array): + paddle.incubate.autograd.disable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + var_x = paddle.static.data('x', 
shape=x.shape, dtype=x.dtype) + var_x.stop_gradient = False + y = paddle.tanh(var_x) + y_grad = paddle.static.gradients(y, var_x) + y_second_grad = paddle.static.gradients(y_grad, var_x) + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, + feed={'x': x}, + fetch_list=[y_grad, y_second_grad]) + + x = np.random.randn(100, 200) + for i, j in zip(actual(x), expect(x)): + np.testing.assert_allclose(i, j) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py index ccbd630bfd084..f14664237f36f 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primops.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primops.py @@ -21,7 +21,7 @@ concat, reduce, matmul, slice_select, slice_assign, gather, scatter_add, fill_const) -from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig, _gradients +from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled diff --git a/python/paddle/fluid/tests/unittests/autograd/test_transform.py b/python/paddle/fluid/tests/unittests/autograd/test_transform.py index 08626593e2904..f976ef729cc7a 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_transform.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_transform.py @@ -88,6 +88,12 @@ def init_data(self): 'mul_p', 'mul_p' ] + self.prim2orig_ops_with_blacklist = [ + 'tanh', 'tanh', 'add_p', 'fill_constant', 'fill_constant', + 'fill_constant', 'elementwise_mul', 'sub_p', 'fill_constant', + 'elementwise_mul', 'sub_p', 'fill_constant', 'elementwise_mul', + 'elementwise_mul' + ] self.prim2orig_ops = [ 'tanh', 'tanh', 'elementwise_add', 'fill_constant', 'fill_constant', 'fill_constant', 'elementwise_mul', 'elementwise_sub', @@ -132,6 +138,13 @@ def test_run(self): for k, v in self.ys_shape_map.items(): self.assertEqual(flatten_ys_bar[k].shape, v) + # Test prim2orig with blacklist + prim2orig(block=self.main_program.block(0), + blacklist=['add_p', 'sub_p']) + prim2orig_ops = [op.type for op in self.main_program.block(0).ops] + self.assertEqual(sorted(prim2orig_ops), + sorted(self.prim2orig_ops_with_blacklist)) + # Test prim2orig prim2orig(block=self.main_program.block(0)) prim2orig_ops = [op.type for op in self.main_program.block(0).ops] @@ -198,6 +211,26 @@ def init_data(self): 'reshape_p', ] + self.prim2orig_ops_with_blacklist = [ + 'reshape2', + 'fill_constant', + 'fill_constant', + 'fill_constant', + 'elementwise_mul', + 'add_p', + 'matmul_v2', + 'fill_constant', + 'fill_constant', + 'fill_constant', + 'elementwise_mul', + 'transpose2', + 'matmul_v2', + 'transpose2', + 'matmul_v2', + # 'elementwise_mul', + 'reshape2', + ] + self.prim2orig_ops = [ 'reshape2', 'fill_constant', @@ -312,6 +345,17 @@ def init_data(self): 'add_p', ] + self.prim2orig_ops_with_blacklist = [ + 'expand_v2', 'add_p', 'reshape2', 'elementwise_mul', 'reduce_sum', + 'sqrt', 'expand_v2', 'sub_p', 'concat', 'gather', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'elementwise_mul', 'reduce_sum', 'reshape2', + 'reshape2', 'elementwise_mul', 'elementwise_mul', 'reshape2', + 'expand_v2', 'elementwise_div', 'reduce_sum', 'reshape2', + 'fill_constant', 'sub_p', 'split', 'fill_constant', 'fill_any_like', + 'add_p', 'scatter', 'elementwise_add', 'add_p' + ] + self.prim2orig_ops = [ 
'expand_v2', 'elementwise_add', 'reshape2', 'elementwise_mul', 'reduce_sum', 'sqrt', 'expand_v2', 'elementwise_sub', 'concat', diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py index 8a0e51f60f47b..6afd0ff392288 100644 --- a/python/paddle/fluid/tests/unittests/autograd/utils.py +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -22,7 +22,7 @@ import collections import numpy as np import paddle -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors ########################################################## diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py index d188ae6654509..69cd4a1b55411 100644 --- a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py @@ -70,7 +70,7 @@ def net(): cost, y_predict = fluid.layers.softmax_with_cross_entropy( hidden, y, return_softmax=True) acc_top1 = fluid.layers.accuracy(input=y_predict, label=y, k=1) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.05) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/collective_alltoall_single.py new file mode 100644 index 0000000000000..cb6777d20bc25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_alltoall_single.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +class TestCollectiveAllToAllSingle(unittest.TestCase): + + def setUp(self): + assert not paddle.distributed.is_initialized(), \ + "The distributed environment has not been initialized." + dist.init_parallel_env() + assert paddle.distributed.is_initialized(), \ + "The distributed environment has been initialized." 
+ + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + + def test_collective_alltoall_single(self): + rank = dist.get_rank() + size = dist.get_world_size() + + # case 1 + input = paddle.ones([size, size], dtype='int64') * rank + output = paddle.empty([size, size], dtype='int64') + expected_output = paddle.concat( + [paddle.ones([1, size], dtype='int64') * i for i in range(size)]) + + group = dist.new_group([0, 1]) + dist.alltoall_single(input, output, group=group) + + np.testing.assert_allclose(output.numpy(), expected_output.numpy()) + dist.destroy_process_group(group) + + # case 2 + in_split_sizes = [i + 1 for i in range(size)] + out_split_sizes = [rank + 1 for i in range(size)] + + input = paddle.ones([sum(in_split_sizes), size], dtype='float32') * rank + output = paddle.empty([(rank + 1) * size, size], dtype='float32') + expected_output = paddle.concat([ + paddle.ones([rank + 1, size], dtype='float32') * i + for i in range(size) + ]) + + group = dist.new_group([0, 1]) + task = dist.alltoall_single(input, + output, + in_split_sizes, + out_split_sizes, + use_calc_stream=False, + group=group) + task.wait() + + np.testing.assert_allclose(output.numpy(), expected_output.numpy()) + dist.destroy_process_group(group) + + def tearDown(self): + dist.destroy_process_group() + assert not paddle.distributed.is_initialized(), \ + "The distributed environment has been deinitialized." + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py b/python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py new file mode 100644 index 0000000000000..5aa309a2bbe5d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +class TestCollectiveBatchIsendIrecv(unittest.TestCase): + + def setUp(self): + dist.init_parallel_env() + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + + def test_collective_batch_isend_irecv(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + send_t = paddle.arange(2) + rank + # paddle.tensor([0, 1]) # Rank-0 + # paddle.tensor([1, 2]) # Rank-1 + recv_t = paddle.empty(shape=[2], dtype=send_t.dtype) + send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size) + recv_op = dist.P2POp(dist.irecv, recv_t, + (rank - 1 + world_size) % world_size) + tasks = dist.batch_isend_irecv([send_op, recv_op]) + + for task in tasks: + task.wait() + + if rank == 0: + np.testing.assert_allclose(recv_t.numpy(), [1, 2]) + elif rank == 1: + np.testing.assert_allclose(recv_t.numpy(), [0, 1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/collective_reduce_scatter.py new file mode 100644 index 0000000000000..0e36296e4089c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_reduce_scatter.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +class TestCollectiveReduceScatter(unittest.TestCase): + + def setUp(self): + dist.init_parallel_env() + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + + def test_collective_reduce_scatter_sum(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + t1 = paddle.to_tensor([0, 1]) + t2 = paddle.to_tensor([2, 3]) + else: + t1 = paddle.to_tensor([4, 5]) + t2 = paddle.to_tensor([6, 7]) + + input_list = [t1, t2] + + output = paddle.empty(shape=[2], dtype=input_list[0].dtype) + dist.reduce_scatter(output, input_list) + + if rank == 0: + np.testing.assert_allclose(output.numpy(), [4, 6]) + elif rank == 1: + np.testing.assert_allclose(output.numpy(), [8, 10]) + + def test_collective_reduce_scatter_max(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + t1 = paddle.to_tensor([0, 1], dtype="float16") + t2 = paddle.to_tensor([2, 3], dtype="float16") + else: + t1 = paddle.to_tensor([4, 5], dtype="float16") + t2 = paddle.to_tensor([6, 7], dtype="float16") + + input_list = [t1, t2] + + output = paddle.empty(shape=[2], dtype=input_list[0].dtype) + dist.reduce_scatter(output, input_list, op=dist.ReduceOp.MAX) + + if rank == 0: + np.testing.assert_allclose(output.numpy(), [4, 5]) + elif rank == 1: + np.testing.assert_allclose(output.numpy(), [6, 7]) + + def test_collective_reduce_scatter_base(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + input = paddle.arange(4) + rank + # [0, 1, 2, 3] # Rank-0 + # [1, 2, 3, 4] # Rank-1 + + output = paddle.empty(shape=[2], dtype=input.dtype) + task = paddle.distributed.collective._reduce_scatter_base( + output, input, use_calc_stream=False) + + task.wait() + + if rank == 0: + np.testing.assert_allclose(output.numpy(), [1, 3]) + elif rank == 1: + np.testing.assert_allclose(output.numpy(), [5, 7]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py index 1360d975603b2..9f7f411be5b21 100644 --- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py +++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py @@ -84,7 +84,7 @@ def get_model(self, batch_size=2, single_device=False): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py index 6cd452ed1952a..0e811cb050bcc 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -92,7 +92,7 @@ def get_model(self, batch_size=2): auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) inference_program = paddle.fluid.default_main_program().clone() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 
9508dc6c26292..a33624ee5eedf 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -143,7 +143,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01): label=label) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) self.feeds = datas self.train_file_path = ["fake1", "fake2"] diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py index f714526286c92..dc0feb35ae8d1 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py @@ -116,7 +116,7 @@ def net(self, args, batch_size=4, lr=0.01): predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) fluid.layers.Print(avg_cost, message="avg_cost") self.feeds = datas diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py index 19e278b4f4620..50a2089cd177e 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py @@ -76,7 +76,7 @@ def get_model(self, batch_size=2, single_device=False): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py index cab4484d3e49c..003de5458786e 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py @@ -76,7 +76,7 @@ def get_model(self, batch_size=2, single_device=False): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index 4a43fb44f46f7..66647b52bb500 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -91,7 +91,7 @@ def get_loss(cos_q_pt, cos_q_nt): shape=[-1, 1], value=0.0, dtype='float32'), loss_op2) - avg_cost = fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py index 60b8a7bb6fdff..17589f7f93bb1 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py @@ -133,7 +133,7 @@ def net(self, args, batch_size=4, lr=0.01): acc = fluid.layers.accuracy(input=predict, label=label) auc_var, _, _ = 
fluid.layers.auc(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) self.feeds = datas self.train_file_path = ["fake1", "fake2"] diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index cdfec08f9fe7a..0d1e826c1f559 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -85,7 +85,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py index ca59e33ec9e12..c3d7a4f9a56f4 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py @@ -58,7 +58,7 @@ def get_model(self, batch_size=2): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py index b78dd744a9ae1..034bcbdb9a04a 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py @@ -38,7 +38,7 @@ def get_model(self, batch_size=2): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py index 50a053f57b801..75d9bd806c921 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py @@ -37,7 +37,7 @@ def get_model(self, batch_size=2): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py index 31362565c8981..9de09d0ff6ce4 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py @@ -49,7 +49,7 @@ def get_model(self, batch_size=2): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index ad5d632637ebb..eb4b41aff91ec 100644 --- 
a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -229,7 +229,7 @@ def get_model(self, batch_size=2, use_dgc=False): out = model.net(input=image, class_dim=102) cost = fluid.layers.cross_entropy(input=out, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) diff --git a/python/paddle/fluid/tests/unittests/dist_sharding_save.py b/python/paddle/fluid/tests/unittests/dist_sharding_save.py index e31901c8c85b9..e989374e2af46 100755 --- a/python/paddle/fluid/tests/unittests/dist_sharding_save.py +++ b/python/paddle/fluid/tests/unittests/dist_sharding_save.py @@ -56,7 +56,7 @@ def runtime_main(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.sharding = True diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py index ede62e643d2e6..08a96575617cf 100644 --- a/python/paddle/fluid/tests/unittests/dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -137,7 +137,7 @@ def get_model(self, batch_size=2): # Train program predict = conv_net(data, dict_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) acc = fluid.layers.accuracy(input=predict, label=label) inference_program = fluid.default_main_program().clone() diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index 744a6d6729a71..06bd017612204 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -94,7 +94,7 @@ def __network__(words): initializer=fluid.initializer.Constant(value=0.1))) cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost, predict_word word_dict = paddle.dataset.imikolov.build_dict() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index 7ee6203fb9433..68b3962dde9a6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -13,7 +13,7 @@ # limitations under the License. 
from __future__ import absolute_import, division, print_function - +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Layer, Linear from paddle.fluid.dygraph.jit import declarative @@ -357,7 +357,7 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask, mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) + mean_mask_lm_loss = paddle.mean(mask_lm_loss) next_sent_fc_out = self.next_sent_fc(next_sent_feat) @@ -367,7 +367,7 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask, next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) + mean_next_sent_loss = paddle.mean(next_sent_loss) loss = mean_next_sent_loss + mean_mask_lm_loss return next_sent_acc, mean_mask_lm_loss, loss diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index 34264cac8a1be..b37accce9d1b8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -41,7 +41,7 @@ def dyfunc_empty_nonlocal(x): def dyfunc_with_if_else(x_v, label=None): - if fluid.layers.mean(x_v).numpy()[0] > 5: + if paddle.mean(x_v).numpy()[0] > 5: x_v = x_v - 1 else: x_v = x_v + 1 @@ -87,14 +87,14 @@ def false_fn_0(q, x, y): m = x + 2 n = x + 3 return q, x, y, z - q, x, y, z = fluid.layers.cond(fluid.layers.mean(x)[0] < 5, lambda : + q, x, y, z = fluid.layers.cond(paddle.mean(x)[0] < 5, lambda : paddle.jit.dy2static.convert_call(true_fn_0)(q, x, y), lambda : paddle.jit.dy2static.convert_call(false_fn_0)(q, x, y)) """ y = x + 1 # NOTE: x_v[0] < 5 is True - if fluid.layers.mean(x).numpy()[0] < 5: + if paddle.mean(x).numpy()[0] < 5: x = x + 1 z = x + 2 q = x + 3 @@ -117,7 +117,7 @@ def dyfunc_with_if_else_early_return1(): b = paddle.zeros([3, 3]) return a, b a = paddle.zeros([2, 2]) + 1 - return a + return a, None def dyfunc_with_if_else_early_return2(): @@ -131,7 +131,7 @@ def dyfunc_with_if_else_early_return2(): d = paddle.zeros([3, 3]) + 1 return c, d e = paddle.zeros([2, 2]) + 3 - return e + return e, None def dyfunc_with_if_else_with_list_geneator(x): @@ -155,13 +155,13 @@ def nested_if_else(x_v): batch_size = fluid.layers.shape(x_v)[0] # if tensor.shape is [1], now support to compare with numpy. 
- if fluid.layers.mean(x_v).numpy() < 0: + if paddle.mean(x_v).numpy() < 0: y = x_v + bias w = fluid.layers.fill_constant([feat_size], dtype='float32', value=10) if y.numpy()[0] < 10: tmp = y * w y = fluid.layers.relu(tmp) - if fluid.layers.mean(y).numpy()[0] < batch_size: + if paddle.mean(y).numpy()[0] < batch_size: y = fluid.layers.abs(y) else: tmp = fluid.layers.fill_constant([feat_size], @@ -257,7 +257,7 @@ def forward(self, input): value=1) # Control flow `if` statement fc_out = self.fc(input) - if fluid.layers.mean(fc_out).numpy()[0] < 0: + if paddle.mean(fc_out).numpy()[0] < 0: y = fc_out + self.constant_vars['bias'] self.constant_vars['w'] = fluid.layers.fill_constant( [5], dtype='float32', value=10) @@ -280,13 +280,13 @@ def forward(self, input): else: y = fc_out - self.constant_vars['bias'] - loss = fluid.layers.mean(y) + loss = paddle.mean(y) return loss def if_with_and_or(x_v, label=None): batch_size = fluid.layers.shape(x_v) - if x_v is not None and (fluid.layers.mean(x_v).numpy()[0] > 0 or label + if x_v is not None and (paddle.mean(x_v).numpy()[0] > 0 or label is not None) and batch_size[0] > 1 and True: x_v = x_v - 1 else: @@ -318,7 +318,7 @@ def if_with_and_or_2(x, y=None): def if_with_and_or_3(x, y=None): batch_size = fluid.layers.shape(x) - mean_res = fluid.layers.mean(x) + mean_res = paddle.mean(x) if x is not None and batch_size[0] > 1 and y is not None and mean_res.numpy( )[0] > 0: x = x + 1 @@ -329,7 +329,7 @@ def if_with_and_or_3(x, y=None): def if_with_and_or_4(x, y=None): batch_size = fluid.layers.shape(x) - mean_res = fluid.layers.mean(x) + mean_res = paddle.mean(x) if (x is not None and batch_size[0] > 1) or (y is not None and mean_res.numpy()[0] > 0): x = x + 1 @@ -349,7 +349,7 @@ def __init__(self): foo = Foo() batch_size = fluid.layers.shape(x) - mean_res = fluid.layers.mean(x) + mean_res = paddle.mean(x) if batch_size[0] > foo.a: x = x + foo.b @@ -361,7 +361,7 @@ def __init__(self): def if_tensor_case(x): x = fluid.dygraph.to_variable(x) - mean = fluid.layers.mean(x) + mean = paddle.mean(x) # It is equivalent to `if mean != 0` if mean: for i in range(0, 10): @@ -376,7 +376,7 @@ def if_tensor_case(x): x += i # join `and`/`or` - if fluid.layers.mean(x) + 1 and mean > 1 and x is not None or 2 > 1: + if paddle.mean(x) + 1 and mean > 1 and x is not None or 2 > 1: x -= 1 # `not` statement diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 8c7f301e9ed55..ce322db06cf8c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -18,6 +18,7 @@ from __future__ import print_function import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid import ParamAttr from paddle.fluid import layers @@ -360,7 +361,7 @@ def beam_search(self, inputs): predicted_ids = [] parent_ids = [] - for step_idx in range(self.beam_max_step_num): + for step_idx in range(paddle.to_tensor(self.beam_max_step_num)): if fluid.layers.reduce_sum(1 - beam_finished).numpy()[0] == 0: break step_input = self._merge_batch_beams(step_input) @@ -384,6 +385,7 @@ def beam_search(self, inputs): dropout_implementation='upscale_in_train') else: step_input = new_hidden + cell_outputs = self._split_batch_beams(step_input) cell_outputs = self.fc(cell_outputs) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py index 00eb25792b2d2..75374cc4db797 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py @@ -19,6 +19,7 @@ from paddle.utils import gast import inspect import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_func @@ -59,7 +60,7 @@ def test_ast2func_static(self): def func(x): y = fluid.layers.relu(x) - loss = fluid.layers.mean(y) + loss = paddle.mean(y) return loss x_data = np.random.random([10, 16]).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index 14683b33feb37..f240fb9e5c112 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -590,7 +590,7 @@ def val_bmn(model, args): loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func( pred_bm, pred_start, pred_end, gt_iou_map, gt_start, gt_end, args) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) loss_data += [ avg_loss.numpy()[0], @@ -665,7 +665,7 @@ def train_bmn(self, args, place, to_static): loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func( pred_bm, pred_start, pred_end, gt_iou_map, gt_start, gt_end, args) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() adam.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py index 79b6880b0d871..9edff1859e41a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py @@ -19,11 +19,29 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.jit import declarative +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.utils import Dygraph2StaticException SEED = 2020 np.random.seed(SEED) +class TestDy2staticException(unittest.TestCase): + + def setUp(self): + self.x = np.random.random([10, 16]).astype('float32') + self.dyfunc = None + self.error = "Your if/else have different number of return value." + + def test_error(self): + if self.dyfunc: + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + ProgramTranslator().enable(True) + self.assertTrue(declarative(self.dyfunc)(self.x)) + paddle.fluid.dygraph.base._in_declarative_mode_ = False + ProgramTranslator().enable(False) + + def test_continue_in_for(x): x = fluid.dygraph.to_variable(x) for i in range(10): @@ -101,7 +119,11 @@ def test_break_continue_in_for(x): x += 10086 a = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0) - for i in range(1, 10, 1): + b = fluid.layers.fill_constant(shape=[1], dtype='int32', value=3) + # b = 10 + # TODO: add Raise Error and suggestion for usage: + # Py for contains break/continue depends on control-flow. 
+ for i in range(b): if a <= 4: x += 1 a += 1 @@ -261,10 +283,12 @@ def init_dygraph_func(self): self.dygraph_func = while_loop_class_var -class TestOptimBreakInFor(TestContinueInWhile): +class TestOptimBreakInFor(TestDy2staticException): - def init_dygraph_func(self): - self.dygraph_func = test_optim_break_in_for + def setUp(self): + self.x = np.random.random([10, 16]).astype('float32') + self.dyfunc = test_optim_break_in_for + self.error = "python while pred change from bool to variable." class TestOptimBreakInWhile(TestContinueInWhile): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py index 3d2339f58f387..68e725d7fc5f8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py @@ -17,7 +17,7 @@ import unittest import numpy as np from collections import Counter - +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.jit import declarative @@ -113,7 +113,7 @@ def test_with_optimizer(self): def simple_func(x): inputs = fluid.dygraph.to_variable(x) - mean = fluid.layers.mean(inputs) + mean = paddle.mean(inputs) return mean diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py index 7986fb1cbae48..f588008b4625f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py @@ -17,20 +17,25 @@ import unittest import paddle -from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import FunctionNameLivenessAnalysis +from paddle.fluid.dygraph.dygraph_to_static.utils import FunctionNameLivenessAnalysis from paddle.utils import gast import inspect class JudgeVisitor(gast.NodeVisitor): - def __init__(self, ans): + def __init__(self, ans, mod): self.ans = ans + self.mod = mod def visit_FunctionDef(self, node): scope = node.pd_scope expected = self.ans.get(node.name, set()) - assert scope.created_vars() == expected, "Not Equals." + exp_mod = self.mod.get(node.name, set()) + assert scope.existed_vars() == expected, "Not Equals." + assert scope.modified_vars( + ) == exp_mod, "Not Equals in function:{} . 
expect {} , but get {}".format( + node.name, exp_mod, scope.modified_vars()) self.generic_visit(node) @@ -108,12 +113,31 @@ def init_dygraph_func(self): }, ] + self.modified_var = [ + { + 'func': set('ki'), + 'test_nonlocal': set('i') + }, + { + 'func': set({'i'}), + 'test_global': set({"t"}) + }, + { + 'func': set('i'), + }, + { + 'func': set('i'), + 'test_normal_argument': set('x') + }, + ] + def test_main(self): - for ans, func in zip(self.answer, self.all_dygraph_funcs): + for mod, ans, func in zip(self.modified_var, self.answer, + self.all_dygraph_funcs): test_func = inspect.getsource(func) gast_root = gast.parse(test_func) name_visitor = FunctionNameLivenessAnalysis(gast_root) - JudgeVisitor(ans).visit(gast_root) + JudgeVisitor(ans, mod).visit(gast_root) def TestClosureAnalysis_Attribute_func(): @@ -128,6 +152,10 @@ def init_dygraph_func(self): self.all_dygraph_funcs = [TestClosureAnalysis_Attribute_func] self.answer = [{"TestClosureAnalysis_Attribute_func": set({'i'})}] + self.modified_var = [{ + "TestClosureAnalysis_Attribute_func": + set({'i', 'self.current.function'}) + }] if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py index 136d2d37db800..3c1f31d0638b9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py @@ -37,7 +37,7 @@ # Use a decorator to test exception @paddle.jit.to_static def dyfunc_with_if(x_v): - if fluid.layers.mean(x_v).numpy()[0] > 5: + if paddle.mean(x_v).numpy()[0] > 5: x_v = x_v - 1 else: x_v = x_v + 1 @@ -58,7 +58,7 @@ def fn1(): @paddle.jit.to_static def dyfunc_with_third_library_logging(x_v): logging.info('test dyfunc_with_third_library_logging') - if fluid.layers.mean(x_v).numpy()[0] > 5: + if paddle.mean(x_v).numpy()[0] > 5: x_v = x_v - 1 else: x_v = x_v + 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_duplicate_output.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_duplicate_output.py new file mode 100644 index 0000000000000..aea7a1910b0b6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_duplicate_output.py @@ -0,0 +1,65 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +import paddle + +np.random.seed(1) + +if paddle.fluid.is_compiled_with_cuda(): + place = paddle.fluid.CUDAPlace(0) +else: + place = paddle.fluid.CPUPlace() + + +class SimpleNet(paddle.nn.Layer): + + def __init__(self): + super().__init__() + self._linear = paddle.nn.Linear(1, 1) + + def forward(self, x): + """ forward with duplicate outputs. 
+ """ + x = self._linear(x) + return x, x + + +class TestDuplicateOutput(unittest.TestCase): + """ + TestCase for the transformation from control flow `if/else` + dependent on tensor in Dygraph into Static `fluid.layers.cond`. + """ + + def setUp(self): + self.net = paddle.jit.to_static(SimpleNet()) + self.x = paddle.to_tensor([1.0]) + + def _run_static(self): + loss0, loss1 = self.net(self.x) + loss0.backward() + param = self.net.parameters() + self.assertEqual(param[0].grad.numpy(), 1.0) + + def test_ast_to_func(self): + self._run_static() + + +if __name__ == '__main__': + with paddle.fluid.framework._test_eager_guard(): + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py index 8058234cb5f96..7d980b5f75a62 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -33,7 +33,7 @@ def inner_func(): def func_error_in_compile_time(x): x = fluid.dygraph.to_variable(x) inner_func() - if fluid.layers.mean(x) < 0: + if paddle.mean(x) < 0: x_v = x - 1 else: x_v = x + 1 @@ -78,7 +78,7 @@ def __init__(self, fc_size=20): def forward(self, x): y = self._linear(x) z = fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int") - out = fluid.layers.mean(y[z]) + out = paddle.mean(y[z]) return out @@ -386,7 +386,7 @@ def set_message(self): 'y = self._linear(x)', 'z = fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")', '<--- HERE', - 'out = fluid.layers.mean(y[z])', + 'out = paddle.mean(y[z])', 'return out' ] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py index 555e71ce9a0ca..d3654260d8d77 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py @@ -16,7 +16,7 @@ import numpy as np import unittest - +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator @@ -59,7 +59,7 @@ def __init__(self, input_dim=10, output_dim=5): @declarative def forward(self, x): pre = self.fc(x) - loss = fluid.layers.mean(pre) + loss = paddle.mean(pre) return pre, loss diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py index 33b50af7c6dcf..108c6228499e0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py @@ -15,6 +15,7 @@ from __future__ import print_function import numpy as np +import paddle import paddle.fluid as fluid import unittest from paddle.fluid.dygraph import declarative @@ -23,7 +24,7 @@ @fluid.dygraph.declarative def dygraph_decorated_func(x): x = fluid.dygraph.to_variable(x) - if fluid.layers.mean(x) > 0: + if paddle.mean(x) > 0: x_v = x - 1 else: x_v = x + 1 @@ -33,7 +34,7 @@ def dygraph_decorated_func(x): @fluid.dygraph.declarative def jit_decorated_func(x): x = fluid.dygraph.to_variable(x) - if fluid.layers.mean(x) > 0: + if paddle.mean(x) > 0: x_v = x - 1 else: x_v = x + 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py 
index 822835a8c7cd1..acfd29102691a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -20,6 +20,7 @@ import paddle from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.utils import Dygraph2StaticException import paddle.fluid.core as core from ifelse_simple_func import * @@ -32,6 +33,22 @@ place = fluid.CPUPlace() +class TestDy2staticException(unittest.TestCase): + + def setUp(self): + self.x = np.random.random([10, 16]).astype('float32') + self.dyfunc = None + self.error = "Your if/else have different number of return value." + + def test_error(self): + if self.dyfunc: + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + ProgramTranslator().enable(True) + self.assertTrue(declarative(self.dyfunc)(self.x)) + paddle.fluid.dygraph.base._in_declarative_mode_ = False + ProgramTranslator().enable(False) + + class TestDygraphIfElse(unittest.TestCase): """ TestCase for the transformation from control flow `if/else` @@ -251,7 +268,7 @@ def relu(x): def call_external_func(x, label=None): - if fluid.layers.mean(x) < 0: + if paddle.mean(x) < 0: x_v = x - 1 else: x_v = add_fn(x) @@ -274,7 +291,7 @@ class NetWithExternalFunc(fluid.dygraph.Layer): @declarative def forward(self, x, label=None): - if fluid.layers.mean(x) < 0: + if paddle.mean(x) < 0: x_v = x - 1 else: x_v = add_fn(x) @@ -417,16 +434,12 @@ def test_ast_to_func(self): self.assertIsInstance(self.out[1], int) -class TestDy2StIfElseRetInt2(TestDy2StIfElseRetInt1): +class TestDy2StIfElseRetInt2(TestDy2staticException): def setUp(self): self.x = np.random.random([5]).astype('float32') + self.error = "Your if/else have different number of return value." self.dyfunc = dyfunc_ifelse_ret_int2 - self.out = self.get_dy2stat_out() - - def test_ast_to_func(self): - self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor)) - self.assertIsInstance(self.out[1], (paddle.Tensor, core.eager.Tensor)) class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1): @@ -448,7 +461,7 @@ def setUp(self): def test_ast_to_func(self): ProgramTranslator().enable(True) - with self.assertRaises(TypeError): + with self.assertRaises(Dygraph2StaticException): static_func = paddle.jit.to_static(self.dyfunc) out = static_func(self.x) # Why need set `_in_declarative_mode_` here? diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py index 826063cf67392..97043fd7ba688 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,264 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from __future__ import print_function - -import unittest -import textwrap -from paddle.utils import gast -from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import get_name_ids -from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor -from paddle.fluid.dygraph.dygraph_to_static.static_analysis import NodeVarType -from paddle.fluid.dygraph.dygraph_to_static.utils import is_control_flow_to_transform - - -class TestGetNameIds(unittest.TestCase): - """ - Test for parsing the ast.Name list from the ast.Nodes - """ - - def setUp(self): - self.source = """ - def test_fn(x): - return x+1 - """ - self.all_name_ids = {'x': [gast.Param(), gast.Load()]} - - def test_get_name_ids(self): - source = textwrap.dedent(self.source) - root = gast.parse(source) - all_name_ids = get_name_ids([root]) - self.assertDictEqual(self.transfer_dict(self.all_name_ids), - self.transfer_dict(all_name_ids)) - - def transfer_dict(self, name_ids_dict): - new_dict = {} - for name, ctxs in name_ids_dict.items(): - new_dict[name] = [type(ctx) for ctx in ctxs] - return new_dict - - -class TestGetNameIds2(TestGetNameIds): - - def setUp(self): - self.source = """ - def test_fn(x, y): - a = 1 - x = y + a - if x > y: - z = x * x - z = z + a - else: - z = y * y - return z - """ - self.all_name_ids = { - 'x': - [gast.Param(), - gast.Store(), - gast.Load(), - gast.Load(), - gast.Load()], - 'a': [gast.Store(), gast.Load(), - gast.Load()], - 'y': [ - gast.Param(), - gast.Load(), - gast.Load(), - gast.Load(), - gast.Load(), - ], - 'z': [ - gast.Store(), - gast.Load(), - gast.Store(), - gast.Store(), - gast.Load(), - ] - } - - -class TestGetNameIds3(TestGetNameIds): - - def setUp(self): - self.source = """ - def test_fn(x, y): - z = 1 - if x > y: - z = x * x - z = z + y - return z - """ - self.all_name_ids = { - 'x': [ - gast.Param(), - gast.Load(), - gast.Load(), - gast.Load(), - ], - 'y': [ - gast.Param(), - gast.Load(), - gast.Load(), - ], - 'z': [ - gast.Store(), - gast.Store(), - gast.Load(), - gast.Store(), - gast.Load(), - ] - } - - -class TestIsControlFlowIf(unittest.TestCase): - - def check_false_case(self, code): - code = textwrap.dedent(code) - node = gast.parse(code) - node_test = node.body[0].value - - self.assertFalse(is_control_flow_to_transform(node_test)) - - def test_expr(self): - # node is not ast.Compare - self.check_false_case("a+b") - - def test_expr2(self): - # x is a Tensor. 
- node = gast.parse("a + x.numpy()") - node_test = node.body[0].value - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_is_None(self): - self.check_false_case("x is None") - - def test_is_None2(self): - self.check_false_case("fluid.layers.sum(x) is None") - - def test_is_None3(self): - self.check_false_case("fluid.layers.sum(x).numpy() != None") - - def test_is_None4(self): - node = gast.parse("fluid.layers.sum(x) and 2>1") - node_test = node.body[0].value - - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_if(self): - node = gast.parse("x.numpy()[1] > 1") - node_test = node.body[0].value - - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_if_with_and(self): - node = gast.parse("x and 1 < x.numpy()[1]") - node_test = node.body[0].value - - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_if_with_or(self): - node = gast.parse("1 < fluid.layers.sum(x).numpy()[2] or x+y < 0") - node_test = node.body[0].value - - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_shape(self): - code = """ - def foo(x): - batch_size = fluid.layers.shape(x) - if batch_size[0] > 16: - x = x + 1 - return x - """ - code = textwrap.dedent(code) - node = gast.parse(code) - static_analysis_visitor = StaticAnalysisVisitor(node) - test_node = node.body[0].body[1].test - - self.assertTrue( - is_control_flow_to_transform(test_node, static_analysis_visitor)) - - def test_shape_with_andOr(self): - code = """ - def foo(x): - batch_size = fluid.layers.shape(x) - if x is not None and batch_size[0] > 16 or 2 > 1: - x = x + 1 - return x - """ - code = textwrap.dedent(code) - node = gast.parse(code) - static_analysis_visitor = StaticAnalysisVisitor(node) - test_node = node.body[0].body[1].test - - self.assertTrue( - is_control_flow_to_transform(test_node, static_analysis_visitor)) - - def test_paddle_api(self): - code = """ - def foo(x): - if fluid.layers.shape(x)[0] > 16: - x = x + 1 - return x - """ - code = textwrap.dedent(code) - node = gast.parse(code) - static_analysis_visitor = StaticAnalysisVisitor(node) - test_node = node.body[0].body[0].test - - self.assertTrue( - is_control_flow_to_transform(test_node, static_analysis_visitor)) - - def test_paddle_api_with_andOr(self): - code_or = """ - def foo(x): - if 2 > 1 and fluid.layers.shape(x)[0] > 16 or x is not None : - x = x + 1 - return x - """ - - code_and = """ - def foo(x): - if 2 > 1 and fluid.layers.shape(x)[0] > 16 and x is not None : - x = x + 1 - return x - """ - for code in [code_or, code_and]: - code = textwrap.dedent(code) - node = gast.parse(code) - static_analysis_visitor = StaticAnalysisVisitor(node) - test_node = node.body[0].body[0].test - - self.assertTrue( - is_control_flow_to_transform(test_node, - static_analysis_visitor)) - - def test_with_node_var_type_map(self): - node = gast.parse("x > 1") - node_test = node.body[0].value - - # if x is a Tensor - var_name_to_type = {"x": {NodeVarType.TENSOR}} - - self.assertTrue( - is_control_flow_to_transform(node_test, - var_name_to_type=var_name_to_type)) - - # if x is not a Tensor - var_name_to_type = {"x": {NodeVarType.NUMPY_NDARRAY}} - self.assertFalse( - is_control_flow_to_transform(node_test, - var_name_to_type=var_name_to_type)) - - def test_raise_error(self): - node = "a + b" - with self.assertRaises(Exception) as e: - self.assertRaises(TypeError, is_control_flow_to_transform(node)) - self.assertTrue( - "The type of input node must be gast.AST" in str(e.exception)) - - -if __name__ == '__main__': - 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index ddda462525f31..0c41621f6e719 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -403,7 +403,7 @@ def forward(self, word, target, length=None): crf_cost = self.linear_chain_crf(input=emission, label=target, length=length) - avg_cost = fluid.layers.mean(x=crf_cost) + avg_cost = paddle.mean(x=crf_cost) crf_decode = self.crf_decoding(input=emission, length=length) return avg_cost, crf_decode diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py index 7eccbedf4d219..8254a6d24b534 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py @@ -16,6 +16,7 @@ import numpy as np import unittest +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import declarative @@ -25,7 +26,7 @@ def call_lambda_as_func(x): x = fluid.dygraph.to_variable(x) add_func = lambda x, y: x + y - mean_func = lambda x: fluid.layers.mean(x) + mean_func = lambda x: paddle.mean(x) y = add_func(x, 1) y = add_func(y, add_func(y, -1)) @@ -38,7 +39,7 @@ def call_lambda_directly(x): x = fluid.dygraph.to_variable(x) y = (lambda x, y: x + y)(x, x) - out = (lambda x: fluid.layers.mean(x))(y) + out = (lambda x: paddle.mean(x))(y) return out @@ -48,7 +49,7 @@ def call_lambda_in_func(x): add_func = lambda x: x + 1 - y = fluid.layers.mean((lambda x: fluid.layers.relu(x))(x)) + y = paddle.mean((lambda x: fluid.layers.relu(x))(x)) out = add_func(y) if y > 1 and y < 2 else (lambda x: x**2)(y) return out @@ -59,7 +60,7 @@ def call_lambda_with_ifExpr(x): add_func = lambda x: x + 1 - y = fluid.layers.mean(x) + y = paddle.mean(x) out = add_func(y) if y or y < 2 else (lambda x: x**2)(y) return out @@ -70,7 +71,7 @@ def call_lambda_with_ifExpr2(x): add_func = lambda x: x + 1 - y = fluid.layers.mean(x) + y = paddle.mean(x) # NOTE: y is Variable, but z<2 is python bool value z = 0 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index f573960b5dba0..1d64e7b81849f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -177,7 +177,6 @@ def test_list_pop_in_for_loop(x, iter_num): one = fluid.layers.ones(shape=[1], dtype="int32") for i in range(one.numpy()[0]): item = a.pop() - return a[0], item, b[1] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index 78d97a3884aed..ff3e0da6fea17 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -270,7 +270,7 @@ def test_nested_loop_vars(self): self.loop_var_names = [ set(["j", "two"]), set(["i", "three", "b"]), - set(["i", "j"]) + set(["i"]) ] self.create_var_names = [set(), set(["b"]), set()] @@ -442,13 +442,6 @@ class TestErrorInForLoop(TestTransformForLoop): def _init_dyfunc(self): self.dyfunc = for_loop_dyfunc_not_support - def test_ast_to_func(self): - with self.assertRaisesRegexp( - NotImplementedError, - "Dynamic-to-Static only 
supports the step value is a constant or negative constant " - ): - self._run_static() - if __name__ == '__main__': with fluid.framework._test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 35c8b4d952295..6396c093ba137 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -119,7 +119,7 @@ def forward(self, inputs, label=None): if label is not None: acc = fluid.layers.accuracy(input=x, label=label) loss = fluid.layers.cross_entropy(x, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return x, acc, avg_loss else: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 18694f6cdec58..29bdddf73cbdc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -479,7 +479,7 @@ def train_mobilenet(args, to_static): softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) t_start_back = time.time() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index 8549d03f7e27b..8ecae3c6b8d3a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -33,7 +33,7 @@ def nested_input(x, y): sub_res = z_elem[0] - z_elem[1] mul_res = y[-1]['d']['da'] * y[-1]['d']['dc'] - mean_func = fluid.layers.mean + mean_func = paddle.mean out = mean_func(sub_res) + mean_func(sum_res) + mean_func(mul_res) return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index 41968278f7bc0..27debe00af10a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -42,7 +42,7 @@ def simple_func(x, weight_numpy): x = fluid.dygraph.to_variable(x) w = fluid.dygraph.to_variable(weight_numpy) y = fluid.layers.matmul(x, w) - z = fluid.layers.mean(y) + z = paddle.mean(y) return z @@ -51,7 +51,7 @@ def decorated_simple_func(x, weight_numpy): x = fluid.dygraph.to_variable(x) w = fluid.dygraph.to_variable(weight_numpy) y = fluid.layers.matmul(x, w) - z = fluid.layers.mean(y) + z = paddle.mean(y) return z @@ -66,114 +66,114 @@ def get_source_code(func): class StaticCode1(): def dyfunc_with_if_else(x_v, label=None): - __return_value_init_0 = paddle.full(shape=[1], - dtype='float64', - fill_value=0.0, - name='__return_value_init_0') - __return_value_0 = __return_value_init_0 + loss = _jst.UndefinedVar('loss') + __return_1 = _jst.UndefinedVar('__return_1') + __return_0 = _jst.UndefinedVar('__return_0') + __return_value_0 = None def get_args_0(): nonlocal x_v - return x_v + return x_v, def set_args_0(__args): nonlocal x_v - x_v = __args + x_v, 
= __args def true_fn_0(): nonlocal x_v x_v = x_v - 1 - return x_v + return def false_fn_0(): nonlocal x_v x_v = x_v + 1 - return x_v + return _jst.IfElse( - fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, get_args_0, + paddle.mean(x_v)[0] > 5, true_fn_0, false_fn_0, get_args_0, set_args_0, ('x_v', )) def get_args_1(): - nonlocal __return_value_0, label, x_v - return __return_value_0, label, x_v + nonlocal __return_0, __return_1, __return_value_0, loss + return __return_0, __return_1, __return_value_0, loss def set_args_1(__args): - nonlocal __return_value_0, label, x_v - __return_value_0, label, x_v = __args + nonlocal __return_0, __return_1, __return_value_0, loss + __return_0, __return_1, __return_value_0, loss = __args def true_fn_1(): - nonlocal __return_value_0, label, x_v + nonlocal __return_0, __return_1, __return_value_0, loss loss = fluid.layers.cross_entropy(x_v, label) __return_0 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = loss - return __return_value_0 + return def false_fn_1(): - nonlocal __return_value_0, label, x_v + nonlocal __return_0, __return_1, __return_value_0, loss __return_1 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = x_v - return __return_value_0 + return _jst.IfElse(label is not None, true_fn_1, false_fn_1, get_args_1, - set_args_1, ('__return_value_0', )) + set_args_1, + ('__return_0', '__return_1', '__return_value_0', 'loss')) return __return_value_0 class StaticCode2(): # TODO: Transform return statement def dyfunc_with_if_else(x_v, label=None): - __return_value_init_1 = paddle.full(shape=[1], - dtype='float64', - fill_value=0.0, - name='__return_value_init_1') - __return_value_1 = __return_value_init_1 + loss = _jst.UndefinedVar('loss') + __return_3 = _jst.UndefinedVar('__return_3') + __return_2 = _jst.UndefinedVar('__return_2') + __return_value_1 = None def get_args_2(): nonlocal x_v - return x_v + return x_v, def set_args_2(__args): nonlocal x_v - x_v = __args + x_v, = __args def true_fn_2(): nonlocal x_v x_v = x_v - 1 - return x_v + return def false_fn_2(): nonlocal x_v x_v = x_v + 1 - return x_v + return _jst.IfElse( - fluid.layers.mean(x_v)[0] > 5, true_fn_2, false_fn_2, get_args_2, + paddle.mean(x_v)[0] > 5, true_fn_2, false_fn_2, get_args_2, set_args_2, ('x_v', )) def get_args_3(): - nonlocal __return_value_1, label, x_v - return __return_value_1, label, x_v + nonlocal __return_2, __return_3, __return_value_1, loss + return __return_2, __return_3, __return_value_1, loss def set_args_3(__args): - nonlocal __return_value_1, label, x_v - __return_value_1, label, x_v = __args + nonlocal __return_2, __return_3, __return_value_1, loss + __return_2, __return_3, __return_value_1, loss = __args def true_fn_3(): - nonlocal __return_value_1, label, x_v + nonlocal __return_2, __return_3, __return_value_1, loss loss = fluid.layers.cross_entropy(x_v, label) __return_2 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = loss - return __return_value_1 + return def false_fn_3(): - nonlocal __return_value_1, label, x_v + nonlocal __return_2, __return_3, __return_value_1, loss __return_3 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = x_v - return __return_value_1 + return _jst.IfElse(label is not None, true_fn_3, false_fn_3, get_args_3, - set_args_3, ('__return_value_1', )) + set_args_3, + ('__return_2', '__return_3', '__return_value_1', 'loss')) return __return_value_1 @@ -195,15 +195,19 @@ def setUp(self): def test_decorator(self): program_translator = 
ProgramTranslator() code = program_translator.get_code(dyfunc_with_if_else) + #print(code) answer = get_source_code(StaticCode1.dyfunc_with_if_else) - self.assertEqual(answer, code) + self.assertEqual( + answer.replace('\n', '').replace(' ', ''), + code.replace('\n', '').replace(' ', '')) def test_program_translator(self): answer = get_source_code(StaticCode2.dyfunc_with_if_else) program_translator = ProgramTranslator() code = program_translator.get_code(dyfunc_with_if_else) - # print(code) - self.assertEqual(answer, code) + self.assertEqual( + answer.replace('\n', '').replace(' ', ''), + code.replace('\n', '').replace(' ', '')) class TestEnableDeclarative(unittest.TestCase): @@ -377,13 +381,13 @@ def test_ifelse_early_return1(self): answer = np.zeros([2, 2]) + 1 static_func = paddle.jit.to_static(dyfunc_with_if_else_early_return1) out = static_func() - self.assertTrue(np.allclose(answer, out.numpy())) + self.assertTrue(np.allclose(answer, out[0].numpy())) def test_ifelse_early_return2(self): answer = np.zeros([2, 2]) + 3 static_func = paddle.jit.to_static(dyfunc_with_if_else_early_return2) out = static_func() - self.assertTrue(np.allclose(answer, out.numpy())) + self.assertTrue(np.allclose(answer, out[0].numpy())) class TestRemoveCommentInDy2St(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 553ad00a6d29b..bd1c926091c92 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -262,7 +262,7 @@ def train(self, to_static, build_strategy=None): pred = resnet(img) loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py index cfdd7d9df51d0..2aa2d6b96901f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py @@ -75,7 +75,7 @@ def train(to_static, build_strategy=None): # precision problem, need to figure out the underlying reason. # If we remove it, the loss between dygraph and dy2stat is exactly same. 
loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(x=pred) + avg_loss = paddle.mean(x=pred) acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py index fa0460f5200b2..3e301e5a6f009 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py @@ -77,7 +77,7 @@ def train(to_static, build_strategy=None): level='O2'): pred = resnet(img) loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(x=pred) + avg_loss = paddle.mean(x=pred) acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py index a5a6b14676982..7f78788e59652 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core from paddle.jit import to_static from paddle.jit import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.utils import Dygraph2StaticException import unittest import numpy as np @@ -245,7 +246,7 @@ def _run(self, to_static=False): return res.numpy() return res - def test_transformed_static_result(self): + def _test_value_impl(self): dygraph_res = self._run(to_static=False) static_res = self._run(to_static=True) if isinstance(dygraph_res, tuple): @@ -264,6 +265,13 @@ def test_transformed_static_result(self): else: self.assertEqual(dygraph_res, static_res) + def test_transformed_static_result(self): + if hasattr(self, "error"): + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + self._test_value_impl() + else: + self._test_value_impl() + class TestInsideFuncBase(TestReturnBase): @@ -312,12 +320,14 @@ class TestReturnDifferentLengthIfBody(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_return_different_length_if_body + self.error = "Your if/else have different number of return value." class TestReturnDifferentLengthElse(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_return_different_length_else + self.error = "Your if/else have different number of return value." class TestNoReturn(TestReturnBase): @@ -330,12 +340,14 @@ class TestReturnNone(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_return_none + self.error = "Your if/else have different number of return value." class TestReturnNoVariable(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_return_no_variable + self.error = "Your if/else have different number of return value." 
class TestReturnListOneValue(TestReturnBase): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py index 6c8216dac55fa..9549844f59c05 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py @@ -45,7 +45,7 @@ def __init__(self, fc_size): def forward(self, x): y = self._linear(x) z = self._linear(y) - out = fluid.layers.mean(z) + out = paddle.mean(z) return out, y diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index 965013adf5d8f..16e51784a07a3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -318,7 +318,7 @@ def forward(self, inputs, label): softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index 108c060fab868..719645aa2b5ce 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -97,7 +97,7 @@ def forward(self, inputs, label=None): prediction = self._fc_prediction(fc_1) cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) acc = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, prediction, acc @@ -141,7 +141,7 @@ def forward(self, inputs, label=None): prediction = self._fc_prediction(fc_2) cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) acc = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, prediction, acc @@ -189,7 +189,7 @@ def forward(self, inputs, label=None): prediction = self._fc_prediction(fc_2) cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) acc = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, prediction, acc @@ -247,7 +247,7 @@ def forward(self, inputs, label=None): # TODO(Aurelius84): Uncomment the following codes when we support return variable-length vars. 
# if label is not None: cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) acc = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, prediction, acc # else: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 9b1cde6dcc5e1..0d1dc69823a56 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -578,8 +578,8 @@ def _set_test_func(self): self.dygraph_func = dyfunc_with_for_1 def _set_expected_op_num(self): - self.expected_op_num = 22 - self.expected_shape_op_num = 3 + self.expected_op_num = 29 + self.expected_shape_op_num = 2 self.expected_slice_op_num = 3 @@ -589,7 +589,7 @@ def _set_test_func(self): self.dygraph_func = dyfunc_with_while_1 def _set_expected_op_num(self): - self.expected_op_num = 22 + self.expected_op_num = 21 self.expected_shape_op_num = 3 self.expected_slice_op_num = 3 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index 481858be6f469..15a1db65b941a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -303,7 +303,7 @@ def train(args, fake_data_reader, to_static): loss = fluid.layers.cross_entropy(input=outputs, label=labels, ignore_index=-1) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) acc_top1 = fluid.layers.accuracy(input=outputs, label=labels, k=1) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index ab52d518fe7af..57b6fc55efb97 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -21,6 +21,7 @@ from paddle.fluid.dygraph import Embedding, Layer, LayerNorm, Linear, to_variable from paddle.fluid.dygraph.jit import dygraph_to_static_func from paddle.fluid.layers.utils import map_structure +import paddle def position_encoding_init(n_position, d_pos_vec): @@ -633,7 +634,7 @@ def gather(input, indices, batch_pos): value=0), } for i in range(self.n_layer)] - for i in range(max_len): + for i in range(paddle.to_tensor(max_len)): trg_pos = layers.fill_constant(shape=trg_word.shape, dtype="int64", value=i) diff --git a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py index c6a39bd6d0418..3748894f4effd 100644 --- a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py +++ b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py @@ -101,7 +101,7 @@ def net(batch_size=4, lr=0.01): predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return datas, avg_cost diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index fe79bae75f530..ebeeb1e272f09 100755 --- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py +++ 
b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -66,7 +66,7 @@ def net(self, main_prog, startup_prog): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() return avg_cost, strategy @@ -101,7 +101,7 @@ def fc_block(input_x): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() return avg_cost, strategy diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt index c60a7511022b4..ee215ebf27a39 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -55,3 +55,6 @@ py_test_modules( py_test_modules( test_standalone_executor_serial_run MODULES test_standalone_executor ENVS FLAGS_new_executor_serial_run=true) + +py_test_modules(test_convert_graph_to_program MODULES test_standalone_executor + ENVS FLAGS_CONVERT_GRAPH_TO_PROGRAM=true) diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py index 5ce035097d01a..aa0290cf4b5fa 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py @@ -16,7 +16,7 @@ import sys import unittest import paddle -from paddle.fluid import core +from paddle.fluid import core, framework from paddle.fluid.core import StandaloneExecutor import paddle.fluid as fluid from paddle.fluid.framework import Program, program_guard @@ -81,17 +81,13 @@ def _run(self, feed): return ret def run_raw_executor(self, feed): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' - out = self._run(feed) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] - print("GT:", out) + with framework._enable_standalone_executor(False): + out = self._run(feed) return out def run_new_executor(self, feed): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - out = self._run(feed) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] - print("New:", out) + with framework._enable_standalone_executor(True): + out = self._run(feed) return out def test_with_feed(self): diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 9e375126550cc..ad13061d17802 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -20,7 +20,7 @@ import unittest import paddle import json -from paddle.fluid import core +from paddle.fluid import core, framework from paddle.fluid.core import StandaloneExecutor from paddle.profiler import profiler @@ -29,7 +29,7 @@ paddle.enable_static() -class LinearTestCase(unittest.TestCase): +class TestDryRun(unittest.TestCase): def setUp(self): place = paddle.CUDAPlace( @@ -48,29 +48,16 @@ def build_program(self): return startup_program, main_program, c - def test_interp_base(self): - startup_program, main_program, c = self.build_program() - standaloneexecutor = StandaloneExecutor(self.place, - 
startup_program.desc, - main_program.desc, core.Scope()) - out = standaloneexecutor.run( - {"a": np.ones([2, 2], dtype="float32") * 2}, [c.name]) - for i in range(10): - out = standaloneexecutor.run( - {"a": np.ones([2, 2], dtype="float32") * i}, [c.name]) - - for i in range(10): - out = standaloneexecutor.run( - {"a": np.ones([2, 2], dtype="float32") * i}, ['a', c.name]) - def test_dry_run(self): + scope = core.Scope() startup_program, main_program, c = self.build_program() - standaloneexecutor = StandaloneExecutor(self.place, - startup_program.desc, - main_program.desc, core.Scope()) + exe = paddle.static.Executor(self.place) + exe.run(startup_program, scope=scope) + + standaloneexecutor = StandaloneExecutor(self.place, main_program.desc) # test for cost_info cost_info = standaloneexecutor.dry_run( - {"a": np.ones([2, 2], dtype="float32")}) + scope, {"a": np.ones([2, 2], dtype="float32")}) self.check_cost_info(cost_info) def check_cost_info(self, cost_info): @@ -121,99 +108,49 @@ def setUp(self): self.iter_n = 3 self.place = paddle.CUDAPlace( 0) if core.is_compiled_with_cuda() else paddle.CPUPlace() - - def test_standalone_executor_statistics(self): - if os.getenv("FLAGS_static_executor_perfstat_filepath") is None: - return - - paddle.seed(2020) - main_program, startup_program, fetch_list = build_program() - fetch_list = [x.name for x in fetch_list] - - p = core.Place() - p.set_place(self.place) - executor = StandaloneExecutor(p, startup_program.desc, - main_program.desc, core.Scope()) - - helper_profiler = profiler.Profiler( - targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) - helper_profiler.start() - for i in range(self.iter_n): - executor.run({}, fetch_list) - helper_profiler.step() - helper_profiler.stop() - - perfstat_filepath = os.environ[ - 'FLAGS_static_executor_perfstat_filepath'] - self.assertTrue(os.path.exists(perfstat_filepath)) - with open(perfstat_filepath, 'r') as load_f: - stat_res = json.load(load_f) - self.assertTrue(len(stat_res) > 0) - - os.remove(perfstat_filepath) - shutil.rmtree('./profiler_log') + self.perf_path = './perfstat' def test_parallel_executor_statistics(self): - if os.getenv("FLAGS_static_executor_perfstat_filepath") is None: - return + self.run_with_statistics(executor='ParallelExecutor') - paddle.seed(2020) - main_program, startup_program, fetch_list = build_program() - fetch_list = [x.name for x in fetch_list] - - main_program = paddle.fluid.compiler.CompiledProgram(main_program) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' - executor = paddle.static.Executor(self.place) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - executor.run(startup_program) - - helper_profiler = profiler.Profiler( - targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) - helper_profiler.start() - for i in range(self.iter_n): - executor.run(main_program, fetch_list=fetch_list) - helper_profiler.step() - helper_profiler.stop() - - perfstat_filepath = os.environ[ - 'FLAGS_static_executor_perfstat_filepath'] - self.assertTrue(os.path.exists(perfstat_filepath)) - with open(perfstat_filepath, 'r') as load_f: - stat_res = json.load(load_f) - self.assertTrue(len(stat_res) > 0) + def test_executor_statistics(self): + self.run_with_statistics(executor='Executor') - os.remove(perfstat_filepath) - shutil.rmtree('./profiler_log') + def test_standalone_executor_statistics(self): + self.run_with_statistics(executor='StandaloneExecutor') - def test_executor_statistics(self): + def run_with_statistics(self, executor=None): if 
os.getenv("FLAGS_static_executor_perfstat_filepath") is None: return - paddle.seed(2020) + # note: startup program is empty main_program, startup_program, fetch_list = build_program() - fetch_list = [x.name for x in fetch_list] - - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' - executor = paddle.static.Executor(self.place) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - executor.run(startup_program) - - helper_profiler = profiler.Profiler( - targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) - helper_profiler.start() - for i in range(self.iter_n): - executor.run(main_program, fetch_list=fetch_list) - helper_profiler.step() - helper_profiler.stop() - - perfstat_filepath = os.environ[ - 'FLAGS_static_executor_perfstat_filepath'] - self.assertTrue(os.path.exists(perfstat_filepath)) - with open(perfstat_filepath, 'r') as load_f: + + enable = True + if executor == 'ParallelExecutor': + main_program = paddle.fluid.compiler.CompiledProgram(main_program) + enable = False + elif executor == 'Executor': + enable = False + + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + with framework._enable_standalone_executor(enable): + exe = paddle.static.Executor(self.place) + helper_profiler = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) + helper_profiler.start() + for i in range(self.iter_n): + exe.run(main_program, fetch_list=fetch_list) + helper_profiler.step() + helper_profiler.stop() + + self.assertTrue(os.path.exists(self.perf_path)) + with open(self.perf_path, 'r') as load_f: stat_res = json.load(load_f) self.assertTrue(len(stat_res) > 0) - os.remove(perfstat_filepath) + os.remove(self.perf_path) shutil.rmtree('./profiler_log') @@ -225,57 +162,24 @@ def setUp(self): 0) if core.is_compiled_with_cuda() else paddle.CPUPlace() def test_result(self): - ground_truths = self.run_raw_executor() - res = self.run_new_executor() + ground_truths = self.run_test(False) + res = self.run_test(True) for gt, out in zip(ground_truths, res): self.assertEqual(gt[0], out[0]) - def run_raw_executor(self): + def run_test(self, use_new_executor=True): paddle.seed(2020) main_program, startup_program, fetch_list = build_program() - exe = paddle.static.Executor(self.place) - exe.run(startup_program) - - outs = [] - for i in range(self.iter_n): - outs.append(exe.run(main_program, fetch_list=fetch_list)) - - return outs - - def run_new_executor(self): - paddle.seed(2020) - main_program, startup_program, fetch_list = build_program() - fetch_list = [x.name for x in fetch_list] - - p = core.Place() - p.set_place(self.place) - inter_core = StandaloneExecutor(p, startup_program.desc, - main_program.desc, core.Scope()) - - outs = [] - for i in range(self.iter_n): - outs.append( - np.array(inter_core.run({}, fetch_list)._move_to_list()[0])) - return outs - - -class SwitchExecutorInterfaceTestCase(MultiStreamModelTestCase): - - def run_new_executor(self): - paddle.seed(2020) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - main_program, startup_program, fetch_list = build_program() - exe = paddle.static.Executor(self.place) - exe.run(startup_program) - - outs = [] - for i in range(self.iter_n): - outs.append(exe.run(main_program, fetch_list=fetch_list)) - - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] - + with framework._enable_standalone_executor(use_new_executor): + scope = core.Scope() + exe = paddle.static.Executor(self.place) + outs = [] + for i in range(self.iter_n): + outs.append( + exe.run(main_program, scope=scope, fetch_list=fetch_list)) + 
print(outs) return outs @@ -331,23 +235,23 @@ def _run(self, return outs def run_raw_executor(self, feed, use_compiled=False): - # run construct program 1 - out1 = self._run(feed, - use_str=False, - is_double=False, - use_compiled=use_compiled) - # run construct program 2 with same executor - out2 = self._run(feed, - use_str=True, - is_double=True, - use_compiled=use_compiled) - - return [out1, out2] + with framework._enable_standalone_executor(False): + # run construct program 1 + out1 = self._run(feed, + use_str=False, + is_double=False, + use_compiled=use_compiled) + # run construct program 2 with same executor + out2 = self._run(feed, + use_str=True, + is_double=True, + use_compiled=use_compiled) + + return [out1, out2] def run_new_executor(self, feed, use_compiled=False): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - out = self.run_raw_executor(feed, use_compiled=use_compiled) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + with framework._enable_standalone_executor(): + out = self.run_raw_executor(feed, use_compiled=use_compiled) return out def test_with_feed(self): @@ -363,9 +267,8 @@ def test_with_error(self): feed = [{'a': np.ones([2, 2], dtype="float32")}] with self.assertRaises(TypeError): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - self._run(feed[0], add_wrong_fetch=True) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + with framework._enable_standalone_executor(): + self._run(feed[0], add_wrong_fetch=True) def test_compiled_program(self): data = np.ones([2, 2], dtype="float32") @@ -376,6 +279,15 @@ def test_compiled_program(self): for x, y in zip(gt, res): self.assertTrue(np.array_equal(x, y)) + def test_compiled_program_convert_graph_to_program(self): + data = np.ones([2, 2], dtype="float32") + feed = {"a": data} + + res = self.run_new_executor(feed, use_compiled=True) + gt = self.run_raw_executor(feed, use_compiled=True) + for x, y in zip(gt, res): + self.assertTrue(np.array_equal(x, y)) + def test_empty_program(self): program = paddle.static.Program() exe = paddle.static.Executor(self.place) @@ -384,9 +296,8 @@ def test_empty_program(self): for i in range(10): print(i, flush=1) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - out = exe.run(program, feed=None) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + with framework._enable_standalone_executor(): + out = exe.run(program, feed=None) class TestException(unittest.TestCase): @@ -420,14 +331,12 @@ def _run(self, feeds): for feed in feeds: out = exe.run(main_program, feed=feed, fetch_list=fetch_vars) - print(main_program) self.fetch_vars = fetch_vars return out def run_new_executor(self, feed): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - out = self._run(feed) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + with framework._enable_standalone_executor(): + out = self._run(feed) return out def test_exception(self): @@ -475,14 +384,12 @@ def test_increment(self): with paddle.fluid.device_guard("cpu"): x = paddle.increment(x) exe = paddle.static.Executor(paddle.CUDAPlace(0)) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - - for i in range(10): - a, = exe.run(paddle.static.default_main_program(), - fetch_list=[x]) - self.assertEqual(a[0], 1) + with framework._enable_standalone_executor(): - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + for i in range(10): + a, = exe.run(paddle.static.default_main_program(), + fetch_list=[x]) + self.assertEqual(a[0], 1) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt 
b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt index 0174274827358..2b698ce9363fd 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -4,7 +4,6 @@ if(WITH_IPU) RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) # set all UTs timeout to 200s @@ -15,4 +14,7 @@ if(WITH_IPU) set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300) set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600) set_tests_properties(test_save_load_ipu PROPERTIES TIMEOUT 600) + + add_subdirectory(custom_ops) + endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/custom_ops/CMakeLists.txt new file mode 100644 index 0000000000000..d7615f933aad0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WITH_IPU) + file( + GLOB CUSTOM_OP_TESTS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") + string(REPLACE ".py" "" CUSTOM_OP_TESTS "${CUSTOM_OP_TESTS}") + foreach(CUSTOM_OP_TEST ${CUSTOM_OP_TESTS}) + py_test(${CUSTOM_OP_TEST} SRCS ${CUSTOM_OP_TEST}.py) + endforeach() + + add_subdirectory(deprecated) +endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/README.md b/python/paddle/fluid/tests/unittests/ipu/custom_ops/README.md new file mode 100644 index 0000000000000..efac2a764ad10 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/README.md @@ -0,0 +1,71 @@ +# Add custom op for Paddle on IPU + +## Add custom op in Paddle + +reference + +https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html + +## Write custom op for PopART + +reference + +https://docs.graphcore.ai/projects/popart-user-guide/en/latest/custom_ops.html + +## Register custom op for Paddle on IPU + +The custom op is used here via just-in-time (JIT) compilation. + +### Implement the custom op + +Following the two documents above, first add the implementation of the custom op. + +`leaky_relu_cpu.cc` contains the definition of the custom op in Paddle and its CPU implementation; it is written exactly the same way as a standard Paddle custom op. The CPU implementation is not required, but it can be used to verify the correctness of the IPU implementation. + +`leaky_relu_ipu.cc` contains the definition of the custom op in PopART and its IPU implementation; likewise, it is written exactly the same way as a standard PopART custom op. + +### Load the custom op + +After the custom op has been defined in both Paddle and PopART, use `paddle.utils.cpp_extension.load` to compile the source files and load the resulting shared library into the current process.
+ +```python + +cur_dir = os.path.dirname(os.path.realpath(__file__)) +custom_ops = load( + name="custom_jit_ops", + sources=[ + f"{cur_dir}/leaky_relu_cpu.cc", + f"{cur_dir}/leaky_relu_ipu.cc", + ], + # this flag is required when compiling leaky_relu_ipu.cc + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx']) + +``` + +Since the op definition in Paddle differs slightly from the one in PopART, the custom op needs to be mapped manually. + +```python + +# paddle_op is custom op type in Paddle +# popart_op, domain and version is custom op identifier in PopART +ipu_strategy = paddle.static.IpuStrategy() +ipu_strategy.add_custom_op( + paddle_op="custom_leaky_relu", + popart_op="LeakyRelu", + domain='custom.ops', + version=1) + +``` + +### Use the custom op + +```python + +x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) +# custom op +out = custom_ops.custom_leaky_relu(x, **self.attrs) + +``` diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_checkpointoutput.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_checkpointoutput.cc new file mode 100644 index 0000000000000..c2957ba224886 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_checkpointoutput.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +namespace { +std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> x_shape) { + return {x_shape}; +} + +std::vector<paddle::DataType> InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector<paddle::Tensor> OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector<paddle::Tensor> OpBackward(const paddle::Tensor &x) { return {x}; } +} // namespace + +// https://github.com/graphcore/popart/blob/sdk-release-2.5/willow/src/builder_impl.cpp#L1458 +// only support one input +PD_BUILD_OP(checkpointoutput) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(checkpointoutput) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_detach.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_detach.cc new file mode 100644 index 0000000000000..2eb62599c0e36 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_detach.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +namespace { +std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> x_shape) { + return {x_shape}; +} + +std::vector<paddle::DataType> InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector<paddle::Tensor> OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector<paddle::Tensor> OpBackward(const paddle::Tensor &x) { return {x}; } +} // namespace + +// https://github.com/graphcore/popart/blob/sdk-release-2.5/willow/src/builder.cpp#L502 +PD_BUILD_OP(custom_detach) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(custom_detach) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_identity.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_identity.cc new file mode 100644 index 0000000000000..0ed9cc7440f0b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_identity.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +namespace { +std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> x_shape) { + return {x_shape}; +} + +std::vector<paddle::DataType> InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector<paddle::Tensor> OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector<paddle::Tensor> OpBackward(const paddle::Tensor &x) { return {x}; } +} // namespace + +// https://github.com/graphcore/popart/blob/sdk-release-2.5/willow/src/builder.gen.cpp#L620 +PD_BUILD_OP(custom_identity) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(custom_identity) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_nll.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_nll.cc new file mode 100644 index 0000000000000..f08c1c326baca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_nll.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+
+#include "paddle/extension.h"
+
+namespace {
+std::vector<std::vector<int64_t>> InferShape(
+    std::vector<int64_t> x_shape,
+    std::vector<int64_t> y_shape,
+    const std::string &reduction,
+    const int &ignoreIndex,
+    const bool &inputIsLogProbability) {
+  // reduction type: Sum, Mean, None
+  if (reduction == "None") {
+    return {y_shape};
+  } else {
+    return {{1}};
+  }
+}
+
+std::vector<paddle::DataType> InferDtype(paddle::DataType x_dtype,
+                                         paddle::DataType y_dtype) {
+  return {x_dtype};
+}
+
+std::vector<paddle::Tensor> OpForward(const paddle::Tensor &x,
+                                      const paddle::Tensor &y) {
+  return {x};
+}
+
+std::vector<paddle::Tensor> OpBackward(const paddle::Tensor &x) { return {x}; }
+}  // namespace
+
+// https://github.com/graphcore/popart/blob/sdk-release-2.5/willow/src/builder.cpp#L775
+// type of `reduction` is std::string
+// `ignoreIndex` is optional; if it is not needed, remove it manually (which
+// results in a new custom op in Paddle)
+PD_BUILD_OP(custom_nll)
+    .Inputs({"X", "Y"})
+    .Outputs({"Out"})
+    .Attrs({"reduction: std::string",
+            "ignoreIndex: int",
+            "inputIsLogProbability: bool"})
+    .SetInferShapeFn(PD_INFER_SHAPE(InferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype))
+    .SetKernelFn(PD_KERNEL(OpForward));
+
+PD_BUILD_GRAD_OP(custom_nll)
+    .Inputs({paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(OpBackward));
diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/CMakeLists.txt
new file mode 100644
index 0000000000000..c132a2517e80b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_IPU)
+  py_test(test_custom_nllloss_ipu SRCS test_custom_nllloss_ipu.py)
+endif()
diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/custom_nllloss.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/custom_nllloss.cc
new file mode 100644
index 0000000000000..a4f123144d39a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/custom_nllloss.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/extension.h"
+
+std::vector<paddle::Tensor> Kernel_Function() { return {}; }
+std::vector<paddle::Tensor> Kernel_Function_Grad() { return {}; }
+
+// nllloss
+std::vector<std::vector<int64_t>> InferShape_NllLoss(
+    std::vector<int64_t> x_shape,
+    std::vector<int64_t> y_shape,
+    const int& reduction,
+    const std::string& ignoreIndex,
+    const bool& inputIsLogProbability) {
+  // 0: sum, 1: mean, 2: none
+  if (reduction == 2) {
+    return {y_shape};
+  } else {
+    return {{1}};
+  }
+}
+
+std::vector<paddle::DataType> InferDtype_NllLoss(paddle::DataType x_dtype,
+                                                 paddle::DataType y_dtype) {
+  return {x_dtype};
+}
+
+PD_BUILD_OP(custom_nll_loss)
+    .Inputs({"X", "Y"})
+    .Outputs({"Out"})
+    .Attrs({"reduction: int",
+            "ignoreIndex: std::string",
+            "inputIsLogProbability: bool"})
+    .SetKernelFn(PD_KERNEL(Kernel_Function))
+    .SetInferShapeFn(PD_INFER_SHAPE(InferShape_NllLoss))
+    .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype_NllLoss));
+
+PD_BUILD_GRAD_OP(custom_nll_loss)
+    .Inputs({paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(Kernel_Function_Grad));
diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/test_custom_nllloss_ipu.py b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/test_custom_nllloss_ipu.py
new file mode 100644
index 0000000000000..9ae7b307ca543
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/test_custom_nllloss_ipu.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import os +import unittest +import sys + +import numpy as np +import paddle +import paddle.static +from paddle.utils.cpp_extension import load + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname( + os.path.abspath(__file__))))) +from op_test_ipu import IPUOpTest + + +def load_custom_ops(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name="custom_nll_loss", + sources=[f"{cur_dir}/custom_nllloss.cc"], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + extra_ldflags=['-lpopfloat']) + return custom_ops + + +class TestBase(IPUOpTest): + + def setUp(self): + self.load_custom_ops() + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return False + + def load_custom_ops(self): + self.custom_ops = load_custom_ops() + + def set_data_feed(self): + x = np.random.rand(16, 20, 256).astype('float32') + label = np.random.uniform(0, 256, size=[16, 20]).astype('int32') + self.feed_fp32 = { + 'x': x, + 'label': label, + } + + def set_test_op(self): + self.op = self.custom_ops.custom_nll_loss + self.op_attrs = { + "reduction": 0, + "ignoreindex": "0", + "inputislogprobability": False, + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + label = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int32') + out = self.op(x, label, **self.op_attrs) + out = paddle.mean(out) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + self.build_model() + # only test IPU_FP32 + self.run_model(IPUOpTest.ExecutionMode.IPU_FP32) + print(self.output_dict) + + +class TestCase1(TestBase): + + def set_test_op(self): + self.op = self.custom_ops.custom_nll_loss + self.op_attrs = { + "reduction": 0, + "ignoreindex": "None", + "inputislogprobability": False, + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_cpu.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_cpu.cc new file mode 100644 index 0000000000000..d118aa4380246 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_cpu.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +#define CHECK_INPUT(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +template +void leaky_relu_cpu_forward_kernel(const data_t* x_data, + data_t* out_data, + int64_t x_numel, + float alpha) { + // x < 0.0f ? 
alpha * x : x
+  for (int i = 0; i < x_numel; ++i) {
+    if (x_data[i] > static_cast<data_t>(0.)) {
+      out_data[i] = x_data[i];
+    } else {
+      out_data[i] = static_cast<data_t>(alpha) * x_data[i];
+    }
+  }
+}
+
+template <typename data_t>
+void leaky_relu_cpu_backward_kernel(const data_t* grad_out_data,
+                                    const data_t* out_data,
+                                    data_t* grad_x_data,
+                                    int64_t out_numel,
+                                    float alpha) {
+  // (grad * (x < 0.0f ? alpha : 1))
+  for (int i = 0; i < out_numel; ++i) {
+    if (out_data[i] < static_cast<data_t>(0)) {
+      grad_x_data[i] = static_cast<data_t>(alpha);
+    } else {
+      grad_x_data[i] = static_cast<data_t>(1.);
+    }
+  }
+}
+
+std::vector<paddle::Tensor> LeakyReluCPUForward(const paddle::Tensor& x,
+                                                float alpha) {
+  CHECK_INPUT(x);
+
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
+
+  PD_DISPATCH_FLOATING_TYPES(x.type(), "relu_cpu_forward_kernel", ([&] {
+                               leaky_relu_cpu_forward_kernel<data_t>(
+                                   x.data<data_t>(),
+                                   out.mutable_data<data_t>(x.place()),
+                                   x.size(),
+                                   alpha);
+                             }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> LeakyReluCPUBackward(const paddle::Tensor& x,
+                                                 const paddle::Tensor& out,
+                                                 const paddle::Tensor& grad_out,
+                                                 float alpha) {
+  CHECK_INPUT(x);
+  CHECK_INPUT(out);
+  CHECK_INPUT(grad_out);
+
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward_kernel", ([&] {
+                               leaky_relu_cpu_backward_kernel<data_t>(
+                                   grad_out.data<data_t>(),
+                                   out.data<data_t>(),
+                                   grad_x.mutable_data<data_t>(x.place()),
+                                   out.size(),
+                                   alpha);
+                             }));
+
+  return {grad_x};
+}
+
+std::vector<std::vector<int64_t>> LeakyReluInferShape(
+    std::vector<int64_t> x_shape) {
+  return {x_shape};
+}
+
+std::vector<paddle::DataType> LeakyReluInferDtype(paddle::DataType x_dtype) {
+  return {x_dtype};
+}
+
+PD_BUILD_OP(custom_leaky_relu)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .Attrs({"alpha: float"})
+    .SetKernelFn(PD_KERNEL(LeakyReluCPUForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(LeakyReluInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(LeakyReluInferDtype));
+
+PD_BUILD_GRAD_OP(custom_leaky_relu)
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .Attrs({"alpha: float"})
+    .SetKernelFn(PD_KERNEL(LeakyReluCPUBackward));
diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_ipu.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_ipu.cc
new file mode 100644
index 0000000000000..1fea75b3b5ae7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_ipu.cc
@@ -0,0 +1,229 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include +#include +#include +#include + +#include + +namespace CustomOperators { +const popart::OperatorIdentifier LeakyReluId = {"custom.ops", "LeakyRelu", 1}; +} // namespace CustomOperators +namespace CustomGradOperators { +const popart::OperatorIdentifier LeakyReluGradId = { + "custom.ops", "LeakyReluGrad", 1}; +} // namespace CustomGradOperators + +class LeakyReluOp; +class LeakyReluOpx; +class LeakyReluGradOpx; + +class LeakyReluGradOp : public popart::Op { + public: + explicit LeakyReluGradOp(const LeakyReluOp &fwdOp); + + std::unique_ptr clone() const final { + return std::make_unique(*this); + } + void setup() final { outInfo(0) = inInfo(0); }; + + const std::vector &gradInputInfo() const; + + // The Grad Op has 1 output, which is the gradient of the only input + const std::map &gradOutToNonGradIn() const; + + bool requiresRandomSeed() const override { return false; } + + // an estimate of how valuable sub-graph matching will be + float getSubgraphValue() const final { return getHighSubgraphValue(); } + + float getAlpha() const { return alpha; } + + // Implementation defined below + void appendAttributes(popart::OpSerialiserBase &os) const override; + + // Implementation defined below + void appendOutlineAttributes(popart::OpSerialiserBase &os) const override; + + private: + float alpha; +}; + +class LeakyReluOp : public popart::Op { + public: + LeakyReluOp(const popart::OperatorIdentifier &_opid, + float _alpha, + const popart::Op::Settings &settings_) + : popart::Op(_opid, settings_), alpha(_alpha) {} + + std::unique_ptr clone() const final { + return std::make_unique(*this); + } + + void setup() final { outInfo(0) = inInfo(0); } + + void appendAttributes(popart::OpSerialiserBase &os) const override { + Op::appendAttributes(os); + os.appendAttribute("alpha", getAlpha()); + } + + void appendOutlineAttributes(popart::OpSerialiserBase &os) const override { + Op::appendOutlineAttributes(os); + os.appendAttribute("alpha", getAlpha()); + } + + std::vector> getGradOps() { + std::vector> upops; + upops.emplace_back(new LeakyReluGradOp(*this)); + return upops; + } + + float getSubgraphValue() const final { return getHighSubgraphValue(); } + + bool requiresRandomSeed() const override { return false; } + + // Attributes + float getAlpha() const { return alpha; } + + private: + float alpha; +}; + +namespace { +using popart::DataType; +using popart::OpDefinition; + +static OpDefinition::DataTypes T = {DataType::FLOAT16, DataType::FLOAT}; + +static OpDefinition leakyReluOpDef({OpDefinition::Inputs({{"input", T}}), + OpDefinition::Outputs({{"output", T}}), + OpDefinition::Attributes({{"alpha", + {"*"}}})}); + +static popart::OpCreator leakyReluOpCreator( + popart::OpDefinitions({{CustomOperators::LeakyReluId, leakyReluOpDef}}), + [](const popart::OpCreatorInfo &info) { + // default alpha is 10**(-2) + float alpha = info.attributes.getAttribute( + "alpha", 1e-2f); + return std::make_unique(info.opid, alpha, info.settings); + }, + true); +} // namespace + +static popart::RegisterShapeInferenceFunction leakyReluShapeInfer( + CustomOperators::LeakyReluId, + [](popart::ShapeInferenceContext &ctx // NO_LINT + ) { ctx.outInfo(0) = ctx.inInfo(0); }); + +namespace pe = popops::expr; + +class LeakyReluOpx : public popart::popx::Opx { + public: + LeakyReluOpx(popart::Op *op, popart::popx::Devicex *devicex) + : popart::popx::Opx(op, devicex) { + verifyOp(op, {CustomOperators::LeakyReluId}); + } + + void grow(poplar::program::Sequence &prog) const final { // NOLINT + popart::logging::ir::trace("start 
Growing LeakyReluOpx"); + + auto op = getOp(); + + poplar::Tensor input = getInTensor(0); + + float alpha = op.getAlpha(); + + // x < 0.0f ? alpha * x : x + auto expression = pe::Select(pe::Mul(pe::Const(alpha), pe::_1), + pe::_1, + pe::Lt(pe::_1, pe::Const(0.0f))); + + popops::mapInPlace(graph(), + expression, + {input}, + prog, + debugContext("LeakyRelu"), + poplar::OptionFlags()); + + setOutTensor(0, input); + } +}; + +class LeakyReluGradOpx : public popart::popx::Opx { + public: + LeakyReluGradOpx(popart::Op *op, popart::popx::Devicex *devicex) + : popart::popx::Opx(op, devicex) { + verifyOp(op, {CustomGradOperators::LeakyReluGradId}); + } + + void grow(poplar::program::Sequence &prog) const final { // NOLINT + auto op = getOp(); + + poplar::Tensor grad = getInTensor(0); + poplar::Tensor input = getInTensor(1); + + float alpha = op.getAlpha(); + + // (grad * (x < 0.0f ? alpha : 1)) + pe::Mul expression = pe::Mul( + pe::Select( + pe::Const(alpha), pe::Const(1.0f), pe::Lt(pe::_2, pe::Const(0.0f))), + pe::_1); + + auto output = popops::map(graph(), + expression, + {grad, input}, + prog, + debugContext("LeakyReluGrad"), + poplar::OptionFlags()); + + setOutTensor(0, output); + } +}; + +LeakyReluGradOp::LeakyReluGradOp(const LeakyReluOp &fwdOp) + : popart::Op(CustomGradOperators::LeakyReluGradId, fwdOp.settings), + alpha(fwdOp.getAlpha()) {} + +const std::vector &LeakyReluGradOp::gradInputInfo() + const { + static const std::vector inInfo = { + {0, 0, popart::GradOpInType::GradOut}, {1, 0, popart::GradOpInType::In}}; + return inInfo; +} + +// The Grad Op has 1 output, which is the gradient of the only input +const std::map &LeakyReluGradOp::gradOutToNonGradIn() const { + static const std::map outInfo = {{0, 0}}; + return outInfo; +} + +void LeakyReluGradOp::appendAttributes(popart::OpSerialiserBase &os) const { + Op::appendAttributes(os); + os.appendAttribute("alpha", getAlpha()); +} + +void LeakyReluGradOp::appendOutlineAttributes( + popart::OpSerialiserBase &os) const { + Op::appendOutlineAttributes(os); + os.appendAttribute("alpha", getAlpha()); +} + +static popart::popx::OpxCreator LeakyReluOpxCreator( + {CustomOperators::LeakyReluId}); +static popart::popx::OpxCreator LeakyReluGradOpxCreator( + {CustomGradOperators::LeakyReluGradId}); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_checkpointoutput_ipu.py b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_checkpointoutput_ipu.py new file mode 100644 index 0000000000000..698cef211db66 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_checkpointoutput_ipu.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +import sys + +import numpy as np +import paddle +import paddle.static +from paddle.utils.cpp_extension import load + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from op_test_ipu import IPUOpTest + + +def load_custom_ops(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name="checkpointoutput", + sources=[ + f"{cur_dir}/custom_checkpointoutput.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx']) + return custom_ops + + +class TestCheckpointoutput(IPUOpTest): + + def setUp(self): + self.load_custom_ops() + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return False + + def load_custom_ops(self): + self.custom_ops = load_custom_ops() + + def set_test_op(self): + self.op = self.custom_ops.checkpointoutput + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.add(x, x) + x = self.op(x, **self.op_attrs) + x = paddle.mean(x) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + self.build_model() + # only test IPU_FP32 + self.run_model(IPUOpTest.ExecutionMode.IPU_FP32) + print(self.output_dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_leaky_relu_ipu.py b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_leaky_relu_ipu.py new file mode 100644 index 0000000000000..fb3fcbf5fe416 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_leaky_relu_ipu.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import unittest +import sys + +import numpy as np +import paddle +import paddle.optimizer +import paddle.static +from paddle.utils.cpp_extension import load + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from op_test_ipu import IPUOpTest, np_dtype_to_fluid_str + + +def load_custom_ops(): + # load custom ops + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name="custom_jit_ops", + sources=[ + f"{cur_dir}/leaky_relu_cpu.cc", + f"{cur_dir}/leaky_relu_ipu.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx']) + return custom_ops + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_feed_attr() + self.set_attrs() + + def set_feed(self): + self.feed = { + "x": np.random.uniform(low=-2, high=2, size=[3, + 5]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [ + np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() + ] + + def set_attrs(self): + self.attrs = {'alpha': 0.1} + + def _test_base(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + SEED = self.SEED + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + custom_ops = load_custom_ops() + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + # custom op + out = custom_ops.custom_leaky_relu(x, **self.attrs) + fetch_list = [out.name] + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if run_ipu: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=False) + + # add name mapping for paddle custom op and popart custom ops + # `paddle_op` was defined in leaky_relu_cpu.cc + # `popart_op`, `domain` and `version` was defined in leaky_relu_ipu.cc + ipu_strategy.add_custom_op(paddle_op="custom_leaky_relu", + popart_op="LeakyRelu", + domain='custom.ops', + version=1) + + program = paddle.static.IpuCompiledProgram( + main_prog, scope=scope, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + res0 = self._test_base(False) + res1 = self._test_base(True) + + self.assertTrue( + np.allclose(res0.flatten(), res1.flatten(), atol=self.atol)) + + self.assertTrue(res0.shape == res1.shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_ops_ipu.py new file mode 100644 index 0000000000000..0dc182354e5e0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_ops_ipu.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import sys + +import numpy as np +import paddle +import paddle.static +from paddle.utils.cpp_extension import load + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from op_test_ipu import IPUOpTest + + +# just load one custom-op for the data race issue under parallel mode +def load_custom_detach(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name=f"custom_detach", + sources=[ + f"{cur_dir}/custom_detach.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + extra_ldflags=['-lpopfloat']) + return custom_ops + + +def load_custom_identity(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name=f"custom_identity", + sources=[ + f"{cur_dir}/custom_identity.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + extra_ldflags=['-lpopfloat']) + return custom_ops + + +def load_custom_nll(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name=f"custom_nll", + sources=[ + f"{cur_dir}/custom_nll.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + extra_ldflags=['-lpopfloat']) + return custom_ops + + +def build_ipu_strategy(): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.add_custom_op(paddle_op="custom_detach", + popart_op="Detach", + domain="ai.graphcore", + version=1) + ipu_strategy.add_custom_op(paddle_op="custom_identity", + popart_op="Identity", + domain="ai.onnx", + version=11) + ipu_strategy.add_custom_op(paddle_op="custom_nll", + popart_op="Nll", + domain="ai.graphcore", + version=1) + return ipu_strategy + + +class TestBase(IPUOpTest): + + def setUp(self): + self.load_custom_ops() + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return False + + def load_custom_ops(self): + self.custom_ops = load_custom_detach() + + def set_test_op(self): + self.op = self.custom_ops.custom_detach + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + out = self.op(x, **self.op_attrs) + out = paddle.mean(out) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + ipu_strategy = build_ipu_strategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + self.run_op_test(exec_mode, ipu_strategy=ipu_strategy) + + def test(self): + self.build_model() + # only test IPU_FP32 + self.run_model(IPUOpTest.ExecutionMode.IPU_FP32) + print(self.output_dict) + + +class TestIdentity(TestBase): + + def load_custom_ops(self): + self.custom_ops = load_custom_identity() + + def set_test_op(self): + self.op = self.custom_ops.custom_identity + self.op_attrs = {} + + +class TestNll(TestBase): + + def load_custom_ops(self): + self.custom_ops = 
load_custom_nll() + + def set_data_feed(self): + x = np.random.rand(16, 20, 256).astype('float32') + label = np.random.uniform(0, 256, size=[16, 20]).astype('int32') + self.feed_fp32 = { + 'x': x, + 'label': label, + } + + def set_test_op(self): + self.op = self.custom_ops.custom_nll + self.op_attrs = { + "reduction": "Sum", + "ignoreindex": 0, + "inputislogprobability": False, + } + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + label = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int32') + out = self.op(x, label, **self.op_attrs) + out = paddle.mean(out) + self.fetch_list = [out.name] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 5f2a0d59bb8be..90850b56aa657 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -67,9 +67,6 @@ def setUpClass(cls): random.seed(cls.SEED) paddle.seed(cls.SEED) - # Enable paddle static graph mode - paddle.enable_static() - @classmethod def tearDownClass(cls): """Restore random seeds""" @@ -86,43 +83,37 @@ def use_ipumodel(cls): if flag.upper() in ['1', "TRUE"]: return True - # Decorator for static graph building - def static_graph(builder): - def wrapper(self, *args, **kwargs): - self.scope = paddle.static.Scope() - self.main_prog = paddle.static.Program() - self.startup_prog = paddle.static.Program() - self.main_prog.random_seed = self.SEED - self.startup_prog.random_seed = self.SEED - with paddle.static.scope_guard(self.scope): - with paddle.utils.unique_name.guard( - paddle.utils.unique_name.generate('')): - with paddle.static.program_guard(self.main_prog, - self.startup_prog): - builder(self, *args, **kwargs) - - return wrapper +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class IPUD2STest(IPUTest): - # Cast a fp32 model to a full-fp16 model @classmethod - def cast_model_to_fp16(cls, main_program): - amp_list = paddle.static.amp.CustomOpLists() - amp_list.unsupported_list = {} - to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( - main_program, amp_list, use_fp16_guard=False) - paddle.static.amp.cast_parameters_to_fp16( - paddle.CPUPlace(), - main_program, - to_fp16_var_names=to_fp16_var_names) + def setUpClass(cls): + super().setUpClass() + # Disable paddle static graph mode + paddle.disable_static() + def tearDown(self): + # Manual reset when using ipumodel + if self.use_ipumodel(): + paddle.framework.core.IpuBackend.get_instance().reset() + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") class IPUOpTest(IPUTest): + """Base Class for single op unit tests using static graph on IPU. 
+ """ @classmethod def setUpClass(cls): super().setUpClass() + # Enable paddle static graph mode + paddle.enable_static() + # Items that a op_tester needs cls.main_prog: paddle.static.Program = None cls.startup_prog: paddle.static.Program = None @@ -166,6 +157,36 @@ def set_training(self): self.is_training = False self.epoch = 1 + # Decorator for static graph building + def static_graph(builder): + + def wrapper(self, *args, **kwargs): + self.scope = paddle.static.Scope() + self.main_prog = paddle.static.Program() + self.startup_prog = paddle.static.Program() + self.main_prog.random_seed = self.SEED + self.startup_prog.random_seed = self.SEED + with paddle.static.scope_guard(self.scope): + with paddle.utils.unique_name.guard( + paddle.utils.unique_name.generate('')): + with paddle.static.program_guard(self.main_prog, + self.startup_prog): + builder(self, *args, **kwargs) + + return wrapper + + # Cast a fp32 model to a full-fp16 model + @classmethod + def cast_model_to_fp16(cls, main_program): + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + main_program, amp_list, use_fp16_guard=False) + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + main_program, + to_fp16_var_names=to_fp16_var_names) + def run_op_test(self, exec_mode, ipu_strategy=None): # NOTE: some op has no inputs # if len(self.feed_list) == 0 or len(self.fetch_list) == 0: @@ -216,42 +237,54 @@ def check(self, check_shape=False, output_dict=None): raise ValueError("output_dict is empty") cpu_fp32 = output_dict[ExecutionMode.CPU_FP32] ipu_fp32 = output_dict[ExecutionMode.IPU_FP32] - cpu_fp32 = np.asarray(cpu_fp32).astype(np.float32).flatten() - ipu_fp32 = np.asarray(ipu_fp32).astype(np.float32).flatten() - pass_check = np.allclose(ipu_fp32, - cpu_fp32, - rtol=self.rtol, - atol=self.atol) - if not pass_check: - max_atol = np.abs(ipu_fp32 - cpu_fp32).max() - cpu_fp32_abs = np.abs(cpu_fp32) - cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 - max_rtol = (np.abs(ipu_fp32 - cpu_fp32) / cpu_fp32_abs).max() - raise AssertionError( - f"ipu_fp32 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" - ) - - if check_shape: - self.assertTrue(cpu_fp32.shape == ipu_fp32.shape) - - if ExecutionMode.IPU_FP16 in output_dict.keys(): - ipu_fp16 = output_dict[ExecutionMode.IPU_FP16] - ipu_fp16 = np.asarray(ipu_fp16).astype(np.float32).flatten() - pass_check = np.allclose(ipu_fp16, - cpu_fp32, - rtol=self.rtol_fp16, - atol=self.atol_fp16) + if len(cpu_fp32) != len(ipu_fp32): + raise ValueError("different outputs number between ipu and cpu.") + for cpu_fp32_res, ipu_fp32_res in zip(cpu_fp32, ipu_fp32): + cpu_fp32_res = np.asarray(cpu_fp32_res).astype(np.float32).flatten() + ipu_fp32_res = np.asarray(ipu_fp32_res).astype(np.float32).flatten() + pass_check = np.allclose(ipu_fp32_res, + cpu_fp32_res, + rtol=self.rtol, + atol=self.atol) if not pass_check: - max_atol = np.abs(ipu_fp16 - cpu_fp32).max() - cpu_fp32_abs = np.abs(cpu_fp32) + max_atol = np.abs(ipu_fp32_res - cpu_fp32_res).max() + cpu_fp32_abs = np.abs(cpu_fp32_res) cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 - max_rtol = (np.abs(ipu_fp16 - cpu_fp32) / cpu_fp32_abs).max() + max_rtol = (np.abs(ipu_fp32_res - cpu_fp32_res) / + cpu_fp32_abs).max() raise AssertionError( - f"ipu_fp16 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" + f"ipu_fp32 check failed. 
max_atol is {max_atol}, max_rtol is {max_rtol}" ) if check_shape: - self.assertTrue(ipu_fp16.shape == cpu_fp32.shape) + self.assertTrue(cpu_fp32_res.shape == ipu_fp32_res.shape) + + if ExecutionMode.IPU_FP16 in output_dict.keys(): + ipu_fp16 = output_dict[ExecutionMode.IPU_FP16] + if len(cpu_fp32) != len(ipu_fp16): + raise ValueError( + "different outputs number between ipu and cpu.") + for cpu_fp32_res, ipu_fp16_res in zip(cpu_fp32, ipu_fp16): + cpu_fp32_res = np.asarray(cpu_fp32_res).astype( + np.float32).flatten() + ipu_fp16_res = np.asarray(ipu_fp16_res).astype( + np.float32).flatten() + pass_check = np.allclose(ipu_fp16_res, + cpu_fp32_res, + rtol=self.rtol_fp16, + atol=self.atol_fp16) + if not pass_check: + max_atol = np.abs(ipu_fp16_res - cpu_fp32_res).max() + cpu_fp32_abs = np.abs(cpu_fp32_res) + cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 + max_rtol = (np.abs(ipu_fp16_res - cpu_fp32_res) / + cpu_fp32_abs).max() + raise AssertionError( + f"ipu_fp16 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" + ) + + if check_shape: + self.assertTrue(ipu_fp16_res.shape == cpu_fp32_res.shape) # Execution Mode class ExecutionMode(IntEnum): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py index 3c5a90afced72..97ee7a45e001c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py new file mode 100644 index 0000000000000..836b99099ffe0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py @@ -0,0 +1,96 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return False + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 32, 32]) + self.feed_fp32 = {'data': data.astype(np.float32)} + self.feed_fp16 = {'data': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['data_layout'] = 'NCHW' + + @IPUOpTest.static_graph + def build_model(self): + data = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + input_scale = paddle.fluid.layers.create_parameter( + shape=[self.feed_shape[0][1]], dtype="float32") + input_bias = paddle.fluid.layers.create_parameter( + shape=[self.feed_shape[0][1]], dtype="float32") + out = paddle.fluid.layers.affine_channel(data, + scale=input_scale, + bias=input_bias) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_data_feed(self): + data = np.random.uniform(size=[2, 4, 64, 64]) + self.feed_fp32 = {'data': data.astype(np.float32)} + self.feed_fp16 = {'data': data.astype(np.float16)} + + +@unittest.skip("Only support NCHW") +class TestNHWC(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['data_layout'] = 'NHWC' + + def set_data_feed(self): + data = np.random.uniform(size=[2, 64, 64, 3]) + self.feed_fp32 = {'data': data.astype(np.float32)} + self.feed_fp16 = {'data': data.astype(np.float16)} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py index 3612656cea354..078e744ae507d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_min_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_min_op_ipu.py index 181f2017173b4..30c604901e877 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_arg_min_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_min_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_argsort_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_argsort_op_ipu.py index c1b585513d8b1..3f19da43c71c3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_argsort_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_argsort_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not 
paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py index af03480fbf698..93cdaf018b400 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index 3f45bf485b817..a3be5458ad83f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 2d2d331543930..08e5049a790eb 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py new file mode 100644 index 0000000000000..113412b834110 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[3, 4, 2, 2]) + target = np.random.uniform(size=[3, 4, 2, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "target": target.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "target": target.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'mean', + } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + target = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + out = F.binary_cross_entropy(x, target, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'sum', + } + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'none', + } + + def set_atol(self): + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 5e-2 + self.rtol_fp16 = 2e-2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_bmm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_bmm_op_ipu.py index 5a08774c236c2..8ea20cebf07a3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_bmm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_bmm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index f361b779bb30b..6799f4141a416 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py new file mode 100644 index 0000000000000..a221ad617671d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py @@ -0,0 +1,218 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_feed(self): + data = np.random.uniform(size=[5, 5]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['min'] = 0.1 + self.attrs['max'] = 3.4 + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.clip(x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestNoMin(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['max'] = 3.4 + + +class TestNoMax(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['min'] = 0.1 + + +class TestNoMinNoMax(TestBase): + + def set_op_attrs(self): + self.attrs = {} + + +class TestMinMaxTensor(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + min = paddle.fluid.layers.fill_constant(name="min", + shape=[1], + dtype='float32', + value=0.1) + max = paddle.fluid.layers.fill_constant(name="max", + shape=[1], + dtype='float32', + value=3.4) + x = paddle.clip(x, min=min, max=max) + self.fetch_list = [x.name] + + +class TestMinTensor(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + min = paddle.fluid.layers.fill_constant(name="min", + shape=[1], + dtype='float32', + value=0.1) + x = paddle.clip(x, min=min) + self.fetch_list = [x.name] + + +class TestMaxTensor(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + max = paddle.fluid.layers.fill_constant(name="max", + shape=[1], + dtype='float32', + value=3.4) + x = paddle.clip(x, max=max) + self.fetch_list = [x.name] + + +class TestCombine1(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + min = paddle.fluid.layers.fill_constant(name="min", + shape=[1], + dtype='float32', + value=0.1) + x = paddle.clip(x, min=min, max=3.4) + self.fetch_list = [x.name] + + +class TestCombine2(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + 
dtype='float32') + + max = paddle.fluid.layers.fill_constant(name="max", + shape=[1], + dtype='float32', + value=3.4) + x = paddle.clip(x, min=0.1, max=max) + self.fetch_list = [x.name] + + +class TestIntInput(TestBase): + + def set_feed(self): + data = np.random.uniform(size=[5, 5]) + self.feed_fp32 = {'x': data.astype(np.int32)} + self.feed_fp16 = {'x': data.astype(np.int32)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int32') + + x = paddle.clip(x, min=0.1, max=3.4) + self.fetch_list = [x.name] + + +class TestIntMinMax(TestBase): + + def set_feed(self): + data = np.random.uniform(size=[5, 5]) + self.feed_fp32 = {'x': data.astype(np.int32)} + self.feed_fp16 = {'x': data.astype(np.int32)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int32') + min = paddle.fluid.layers.fill_constant(name="min", + shape=[1], + dtype='int32', + value=1) + max = paddle.fluid.layers.fill_constant(name="max", + shape=[1], + dtype='int32', + value=3) + x = paddle.clip(x, min=min, max=max) + self.fetch_list = [x.name] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index d0160551b93bd..733a5291cf50b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py new file mode 100644 index 0000000000000..6136bf34ffb67 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_feed(self): + data = np.random.uniform(size=[1, 3, 8, 8]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['num_filters'] = 3 + self.attrs['filter_size'] = 3 + self.attrs['padding'] = 0 + self.attrs['stride'] = 1 + self.attrs['dilation'] = 1 + self.attrs['bias_attr'] = False + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.static.nn.conv2d_transpose(x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['stride'] = 2 + + +@unittest.skip("Only support dilation=1") +class TestCase2(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['stride'] = 2 + self.attrs['dilation'] = 2 + + +class TestCase3(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['padding'] = 2 + + +class TestCase4(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['padding'] = "SAME" + + +class TestCase5(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['stride'] = 2 + self.attrs['padding'] = "SAME" + + +class TestCase6(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['padding'] = "VALID" + + +class TestCase7(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['padding'] = "VALID" + self.attrs['stride'] = 2 + + +class TestCase8(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['filter_size'] = 4 + self.attrs['stride'] = 2 + + +class TestCase9(TestBase): + + # When bias_attr is not False, a Add Op will be added after conv2d_transpose Op. + # When bias_attr = None, the bias value is 0. 
+ def set_op_attrs(self): + super().set_op_attrs() + self.attrs['bias_attr'] = None + + +class TestCase10(TestBase): + + # When output_size is not None, the filter_size will be re-computed by output_size + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['filter_size'] = None + self.attrs['output_size'] = [12, 12] + + +class TestCase11(TestBase): + + # Depthwise conv2d transpose + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['groups'] = 3 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index 5a2485e251c96..3fac45bbbd904 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -108,7 +106,7 @@ def set_op_attrs(self): class TestCase5(TestBase): - + # Depthwise conv2d def set_op_attrs(self): super().set_op_attrs() self.attrs['groups'] = 3 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index ffd4368c089b5..92cf442fe27cc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -116,5 +114,35 @@ def set_op_attrs(self): } +class TestCase4(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 7]) + label = np.random.randint(0, 7, [3, 5, 1], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +class TestCase5(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 6, 7]) + label = np.random.randint(0, 7, [3, 5, 6], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py index 75cd3c92322ab..5f859b064feac 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -86,5 +84,35 @@ def set_op_attrs(self): self.attrs = {"exclusive": True, "reverse": True} +class TestCase4(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[1, 128]) + self.feed_fp32 = {"x": x.astype(np.int32)} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="int32") + out = paddle.fluid.layers.cumsum(x, **self.attrs) + self.fetch_list = [out.name] + + 
+class TestCase5(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[1, 128]) + self.feed_fp32 = {"x": x.astype(np.int64)} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="int64") + out = paddle.fluid.layers.cumsum(x, **self.attrs) + self.fetch_list = [out.name] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py new file mode 100644 index 0000000000000..de84b94bb7dde --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_op_attrs(self): + self.attrs = {} + + def set_feed(self): + data = np.random.uniform(size=[32, 100]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.static.nn.data_norm(input=x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {"in_place": True} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.static.nn.data_norm(input=x, **self.attrs) + x = x + 1 + self.fetch_list = [x.name] + + +@unittest.skip("Do not support in_place=True when test single data_norm Op") +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"in_place": True} + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {"data_layout": "NHWC"} + + +class TestCase4(TestBase): + + def set_op_attrs(self): + self.attrs = {"epsilon": 0.001} + + +class TestCase5(TestBase): + + def set_op_attrs(self): + self.attrs = {"do_model_average_for_mean_and_var": True} + + +class TestCase6(TestBase): + # If enable_scale_and_shift=True, it requires to set values of scale and bias in `param_attr` + def set_op_attrs(self): + self.attrs = { + "param_attr": { + "scale_w": 0.5, + "bias": 0.1 + }, + "enable_scale_and_shift": True + } + + +class TestCase7(TestBase): + + def set_op_attrs(self): + self.attrs = { + 
"param_attr": { + "batch_size": 1e3, + "batch_sum": 0.1, + "batch_square": 1e3, + "scale_w": 0.5, + "bias": 0.1 + }, + "enable_scale_and_shift": True + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py new file mode 100644 index 0000000000000..5f8db4faba744 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + data_x = np.random.uniform(size=[8, 1, 6, 1]) + data_y = np.random.uniform(size=[7, 1, 5]) + self.feed_fp32 = { + "x": data_x.astype(np.float32), + "y": data_y.astype(np.float32) + } + self.feed_fp16 = { + "x": data_x.astype(np.float16), + "y": data_y.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {"p": 2} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + y = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + out = paddle.dist(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {"p": 0} + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"p": float("inf")} + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {"p": float("-inf")} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py index fb090cc5913a4..ed0c36f53eb0c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py index be96762549dd4..d104b39c29246 100644 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py new file mode 100644 index 0000000000000..23ba121a07f2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py @@ -0,0 +1,129 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest + +import numpy as np +import paddle +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUD2STest + + +class SimpleLayer(paddle.nn.Layer): + + def __init__(self, use_ipu=False): + super(SimpleLayer, self).__init__() + self.use_ipu = use_ipu + self.conv = paddle.nn.Conv2D(in_channels=3, + out_channels=1, + kernel_size=2, + stride=1) + + def forward(self, x, target=None): + x = self.conv(x) + x = paddle.fluid.layers.flatten(x, axis=1) + if target is not None: + x = paddle.fluid.layers.softmax(x) + loss = paddle.fluid.layers.cross_entropy(x, target) + if self.use_ipu: + loss = paddle.incubate.identity_loss(loss, 1) + else: + loss = paddle.mean(loss) + return x, loss + return x + + +class TestBase(IPUD2STest): + + def setUp(self): + super().setUp() + self.save_path = tempfile.TemporaryDirectory() + + def tearDown(self): + super().tearDown() + self.save_path.cleanup() + + def _test(self, use_ipu=False): + paddle.seed(self.SEED) + np.random.seed(self.SEED) + model = SimpleLayer(use_ipu) + specs = [ + paddle.static.InputSpec(name="x", + shape=[32, 3, 10, 10], + dtype="float32"), + paddle.static.InputSpec(name="target", shape=[32], dtype="int64"), + ] + model = paddle.jit.to_static(model, input_spec=specs) + optim = paddle.optimizer.Adam(learning_rate=0.01, + parameters=model.parameters()) + data = paddle.uniform((32, 3, 10, 10), dtype='float32') + label = paddle.randint(0, 10, shape=[32], dtype='int64') + model_path = '{}/model_state_dict_{}.pdparams'.format( + self.save_path, 'ipu' if use_ipu else 'cpu') + optim_path = '{}/optim_state_dict_{}.pdopt'.format( + self.save_path, 'ipu' if use_ipu else 'cpu') + + if use_ipu: + paddle.set_device('ipu') + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=1, + is_training=True, + micro_batch_size=1, + enable_manual_shard=False) + ipu_strategy.set_precision_config(enable_fp16=True) + ipu_strategy.set_optimizer(optim) + data = data.astype(np.float16) + + epochs = 100 + result = [] + for _ in range(epochs): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(data, label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + result.append(loss) + + if use_ipu: + 
paddle.fluid.core.IpuBackend.get_instance().weights_to_host() + + paddle.save(model.state_dict(), model_path) + paddle.save(optim.state_dict(), optim_path) + model.set_state_dict(paddle.load(model_path)) + optim.set_state_dict(paddle.load(optim_path)) + + for _ in range(epochs): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(data, label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + + result.append(loss) + + if use_ipu: + ipu_strategy.release_patch() + + return np.array(result) + + def test_training(self): + cpu_loss = self._test(False).flatten() + ipu_loss = self._test(True).flatten() + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-2)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py new file mode 100644 index 0000000000000..7b581de222819 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest + +import numpy as np +import paddle +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramCache +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUD2STest +from paddle.jit import to_static +from paddle.optimizer.lr import LRScheduler +from functools import partial + + +class SimpleLayer(paddle.nn.Layer): + + def __init__(self, + loss_op=None, + use_softmax=True, + use_reduction=True, + use_identity_loss=True): + super(SimpleLayer, self).__init__() + self.loss_op = loss_op + self.conv = paddle.nn.Conv2D(in_channels=3, + out_channels=1, + kernel_size=2, + stride=1) + self.use_softmax = use_softmax + self.use_reduction = use_reduction + self.use_identity_loss = use_identity_loss + + @to_static() + def forward(self, x, target=None): + x = self.conv(x) + x = paddle.fluid.layers.flatten(x, axis=1) + if target is not None: + if self.use_softmax: + x = paddle.fluid.layers.softmax(x) + if self.loss_op: + loss = self.loss_op(x, target) + else: + loss = paddle.fluid.layers.cross_entropy(x, target) + if self.use_reduction: + loss = paddle.mean(loss) + if self.use_identity_loss: + loss = paddle.incubate.identity_loss(loss, 1) + return x, loss + return x + + +class TestBase(IPUD2STest): + + def setUp(self): + self.set_op_attrs() + self.set_data_feed() + + def set_op_attrs(self): + self.loss_op = paddle.fluid.layers.cross_entropy + + def set_data_feed(self): + self.data = paddle.uniform((32, 3, 10, 10), dtype='float32') + self.label = paddle.randint(0, 10, shape=[32], dtype='int64') + + def create_model(self, use_ipu=False): + return SimpleLayer(loss_op=self.loss_op, + use_softmax=True, + use_reduction=not use_ipu, + use_identity_loss=use_ipu) + + def _test(self, use_ipu=False): + paddle.seed(self.SEED) + np.random.seed(self.SEED) + model = self.create_model(use_ipu) + optim = 
paddle.optimizer.Adam(learning_rate=0.01, + parameters=model.parameters()) + + if use_ipu: + paddle.set_device('ipu') + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=1, + is_training=True, + micro_batch_size=1, + enable_manual_shard=False) + ipu_strategy.set_optimizer(optim) + + epochs = 100 + result = [] + for _ in range(epochs): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(self.data, self.label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + result.append(loss) + + if use_ipu: + ipu_strategy.release_patch() + + return np.array(result) + + def test_training(self): + ipu_loss = self._test(True).flatten() + cpu_loss = self._test(False).flatten() + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4)) + + +class TestSaveLoad(TestBase): + + def setUp(self): + super().setUp() + self.save_path = tempfile.TemporaryDirectory() + + def tearDown(self): + super().tearDown() + self.save_path.cleanup() + + def _test(self, use_ipu=False): + paddle.seed(self.SEED) + np.random.seed(self.SEED) + model = self.create_model(use_ipu) + optim = paddle.optimizer.Adam(learning_rate=0.01, + parameters=model.parameters()) + model_path = '{}/model_state_dict_{}.pdparams'.format( + self.save_path, 'ipu' if use_ipu else 'cpu') + optim_path = '{}/optim_state_dict_{}.pdopt'.format( + self.save_path, 'ipu' if use_ipu else 'cpu') + + if use_ipu: + paddle.set_device('ipu') + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=1, + is_training=True, + micro_batch_size=1, + enable_manual_shard=False) + ipu_strategy.set_optimizer(optim) + + epochs = 100 + result = [] + for _ in range(epochs): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(self.data, self.label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + result.append(loss) + + if use_ipu: + paddle.fluid.core.IpuBackend.get_instance().weights_to_host() + + paddle.save(model.state_dict(), model_path) + paddle.save(optim.state_dict(), optim_path) + model.set_state_dict(paddle.load(model_path)) + optim.set_state_dict(paddle.load(optim_path)) + + for _ in range(epochs): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(self.data, self.label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + result.append(loss) + + if use_ipu: + ipu_strategy.release_patch() + + return np.array(result) + + +class TestPatch(IPUD2STest): + + def setUp(cls): + paddle.disable_static() + + def test(self, use_ipu=False): + old_getter = ProgramCache.__getitem__ + old_step = LRScheduler.step + + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.release_patch() + + reset_getter = ProgramCache.__getitem__ + reset_step = LRScheduler.step + + self.assertTrue(reset_getter is old_getter) + self.assertTrue(reset_step is old_step) + + +class TestWithoutIdentityLoss1(TestBase): + + def create_model(self, use_ipu=False): + return SimpleLayer(loss_op=self.loss_op, + use_softmax=True, + use_reduction=True, + use_identity_loss=False) + + +class TestWithoutIdentityLoss2(TestBase): + + def set_op_attrs(self): + self.loss_op = paddle.fluid.layers.softmax_with_cross_entropy + + def set_data_feed(self): + self.data = paddle.uniform((32, 3, 10, 10), dtype='float32') + self.label = paddle.randint(0, 10, shape=[32, 1], dtype='int64') + + def create_model(self, use_ipu=False): + return SimpleLayer(loss_op=self.loss_op, + use_softmax=False, + 
use_reduction=True, + use_identity_loss=False) + + +class TestWithoutIdentityLoss3(TestBase): + + def set_op_attrs(self): + self.loss_op = partial(paddle.fluid.layers.kldiv_loss, reduction="none") + + def set_data_feed(self): + self.data = paddle.uniform((32, 3, 10, 10), dtype='float32') + self.label = paddle.rand(shape=[32, 81], dtype='float32') + + def create_model(self, use_ipu=False): + return SimpleLayer(loss_op=self.loss_op, + use_softmax=True, + use_reduction=True, + use_identity_loss=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index f78f446404dcb..9c35e43970e74 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestMul(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py index ad419c2e2bfc5..77a78a7cb78ca 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py index f81f5d7de74d1..5df7bbadebf6b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py new file mode 100644 index 0000000000000..ee68eba5e543f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + def set_data_feed(self): + data_x = np.random.uniform(size=[1, 3]) + data_y = np.random.uniform(size=[2, 2, 3]) + self.feed_fp32 = { + 'x': data_x.astype(np.float32), + 'y': data_y.astype(np.float32) + } + self.feed_fp16 = { + 'x': data_x.astype(np.float16), + 'y': data_y.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + y = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype="float32") + out = paddle.expand_as(x, y) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_data_feed(self): + data_x = np.random.uniform(size=[2, 3]) + data_y = np.random.uniform(size=[2, 4, 2, 3]) + self.feed_fp32 = { + 'x': data_x.astype(np.float32), + 'y': data_y.astype(np.float32) + } + self.feed_fp16 = { + 'x': data_x.astype(np.float16), + 'y': data_y.astype(np.float16) + } + + +@unittest.skip("corresponding dimensions must have the same value.") +class TestCase2(TestBase): + + def set_data_feed(self): + data_x = np.random.uniform(size=[2, 3]) + data_y = np.random.uniform(size=[2, 4, 3, 3]) + self.feed_fp32 = { + 'x': data_x.astype(np.float32), + 'y': data_y.astype(np.float32) + } + self.feed_fp16 = { + 'x': data_x.astype(np.float16), + 'y': data_y.astype(np.float16) + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 872f4a4bef160..843ec0438d74d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py new file mode 100644 index 0000000000000..5cb949a1943e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py @@ -0,0 +1,113 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_attrs() + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_attrs(self): + self.attrs = {"shape": [2, 2, 3]} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + out = paddle.expand(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_attrs(self): + self.attrs = {"shape": [5, 2, 2, 3]} + + +class TestCase2(TestBase): + + def set_data_feed(self): + data = np.random.uniform(size=[2, 1, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + + def set_attrs(self): + self.attrs = {"shape": [5, 2, 2, 3]} + + +@unittest.skip("corresponding dimensions must have the same value.") +class TestCase3(TestBase): + + def set_attrs(self): + self.attrs = {"shape": [5, 2, 4, 3]} + + +@unittest.skip("Do not support `shape` = Tensors.") +class TestCase4(TestBase): + + def set_data_feed(self): + data = np.random.uniform(size=[3, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + self.attrs = { + 'name': 'y', + 'shape': [3], + 'dtype': 'int32', + 'value': 3, + } + y = paddle.fluid.layers.fill_constant(**self.attrs) + out = paddle.expand(x, shape=y) + self.fetch_list = [out.name] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py index a6c497433020c..74ecba6f18c86 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -69,5 +67,24 @@ def set_op_attrs(self): self.attrs = {'fill_value': 3, 'dtype': 'int32'} +class TestError(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.fluid.data('x', [-1, 3, 13], 'float32') + x_fill = paddle.full_like(x, **self.attrs) + out = paddle.fluid.layers.elementwise_add(x_fill, x_fill) + self.fetch_list = [out.name] + + def test(self): + self.build_model() + + def test_error(): + self.run_op_test(IPUOpTest.ExecutionMode.IPU_FP32) + + self.assertRaisesRegex(Exception, "Please check tensor shape setting", + test_error) + + if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py index 4d4d88351892f..7598b32581acc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py new file mode 100644 index 0000000000000..4723f753fb698 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + data = np.random.uniform(size=[2, 2, 4, 6]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = -1 + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + out = paddle.flatten(x=x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = 2 + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = -1 + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = 2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py index 29dd9510dda40..d7c1da14e296f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest 
-@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py new file mode 100644 index 0000000000000..07e0acb60a123 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_feed(self): + data = np.random.uniform(size=[3, 2, 2]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = [0, 1] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + x = paddle.flip(x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_feed(self): + data = np.random.randint(0, 10, size=[3, 2, 2]) + self.feed_fp32 = {'x': data.astype(np.int32)} + self.feed_fp16 = {'x': data.astype(np.int32)} + + +class TestCase2(TestBase): + + def set_feed(self): + data = np.random.randint(0, 2, size=[4, 3, 2, 2]) + self.feed_fp32 = {'x': data.astype(np.bool)} + self.feed_fp16 = {'x': data.astype(np.bool)} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py index 0cfe769225001..708dd0f405424 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index 42ba6babd7911..13a48e5a98f1b 100644 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index 673c7c0503242..2d14621d5fc7e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py index 7eea222e5e3c4..b63d176ff2791 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py index eb3c0601dd148..4f2e9a1a94bfc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestGreaterThan(IPUOpTest): def setUp(self): @@ -127,5 +125,17 @@ def set_test_op(self): self.op = paddle.fluid.layers.equal +class TestGreaterEqual(TestGreaterThan): + + def set_test_op(self): + self.op = paddle.fluid.layers.greater_equal + + +class TestLessEqual(TestGreaterThan): + + def set_test_op(self): + self.op = paddle.fluid.layers.less_equal + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py index 4c5098640fdba..dec4c6e1306a4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py new file mode 100644 index 0000000000000..514b926dc82af --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[3, 4, 2, 2]) + target = np.random.uniform(size=[3, 4, 2, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "target": target.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "target": target.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = { + 'delta': 1.0, + } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + target = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + out = paddle.fluid.layers.huber_loss(x, target, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'delta': 0.5, + } + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'delta': 0.0, + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_identity_loss_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_identity_loss_ipu.py new file mode 100644 index 0000000000000..9a44a9e7c306f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_identity_loss_ipu.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.compiler as compiler +import paddle.optimizer +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, + np_dtype_to_fluid_str) +from paddle.utils.cpp_extension import load + +paddle.enable_static() + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_feed_attr() + self.set_op() + + def set_op(self): + # setup custom op + self.op = paddle.incubate.identity_loss + + def set_feed(self): + self.feed = { + "x": np.random.uniform(low=-2, high=2, size=[3, + 5]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [ + np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() + ] + + def _test_base(self, reduction): + scope = fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + SEED = 0 + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + + with fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + + out = self.op(x, reduction) + fetch_list = [out.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=1, is_training=False) + ipu_compiler = compiler.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_compiler.compile(feed_list, fetch_list) + + ipu_res = exe.run(program, self.feed, fetch_list) + + if reduction == 0: + # sum + cpu_res = self.feed['x'].sum() + elif reduction == 1: + # mean + cpu_res = self.feed['x'].mean() + else: + # none + cpu_res = self.feed['x'] + + self.assertTrue(np.allclose(ipu_res[0], cpu_res, atol=self.atol)) + + def test_base(self): + # TODO: use string instead of int for reduction + for reduction in [0, 1, 2]: + self._test_base(reduction) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py index 18cd5e30e88c1..d3a700b629647 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py index 3828728a567c3..b24e4be7ae738 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py new file mode 100644 index 
0000000000000..70d01e120efc2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py @@ -0,0 +1,209 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[2, 3, 6, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + self.attrs["size"] = [12, 12] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + out = paddle.nn.functional.interpolate(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase0(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs["size"] = [3, 4] + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs["scale_factor"] = [2, 1] + + +@unittest.skip("Only one of size or scale_factor should be defined") +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"size": [12, 12], "scale_factor": [2, 1]} + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {"scale_factor": 2.5} + + +class TestBilinear(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): + self.attrs = {"size": [12, 12], "mode": "bilinear"} + + +# Take long time +class TestBicubic(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): + self.attrs = {"size": [12, 12], "mode": "bicubic"} + + +# Trilinear requires 5-D input +class TestTrilinear(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + x = np.random.uniform(size=[2, 3, 3, 6, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = { + "size": [12, 12, 12], + "mode": "trilinear", + "data_format": 
"NCDHW" + } + + +# Linear requires 3-D input +class TestLinear(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + x = np.random.uniform(size=[3, 6, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = {"size": [12], "mode": "linear", "data_format": "NCW"} + + +@unittest.skip( + "Transfer to Pool Op with 2-D ksize, now we only support 1-D ksize.") +class TestArea(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[2, 3, 6, 6]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = {"size": 12, "mode": "area"} + + +# align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear +class TestAlignCorners(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_op_attrs(self): + self.attrs = { + "size": [12, 12], + "align_corners": True, + "mode": "bilinear" + } + + +# +class TestAlignMode(TestBase): + + def set_op_attrs(self): + self.attrs = {"size": [12, 12], "align_mode": 1} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py index 13f146f6fd741..f4a48cf134051 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py @@ -21,8 +21,6 @@ paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestIpuShard(unittest.TestCase): def _test(self): @@ -65,8 +63,6 @@ def test_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestIpuPipeline(unittest.TestCase): def _test(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index 14128109029c7..6fa3d77ead8a4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -20,8 +20,6 @@ paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestIpuStrategy(unittest.TestCase): def test_set_options(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py new file mode 100644 index 0000000000000..8af6664179a97 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[3, 4, 2, 2]) + target = np.random.uniform(size=[3, 4, 2, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "target": target.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "target": target.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'mean', + } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + target = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + out = paddle.fluid.layers.kldiv_loss(x, target, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'sum', + } + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'none', + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py index e365ffd4e166f..9bf457d6f924f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py index a406fa128fc5b..d8eaa2f81bceb 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py index 71a75db9ab392..79c22f47da5c9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestLogicalAnd(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py index 27a70329ca132..ffcf8a64f53f9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py index c15eb3a3b8edb..2c8e7159cf217 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py index f7a01b7268ddf..6c663bd5ac927 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py @@ -31,8 +31,6 @@ def get_lr(self): return self.base_lr -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestConvNet(IPUOpTest): @IPUOpTest.static_graph diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index 222bb20209750..bf2af886959b5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -157,8 +155,8 @@ def set_op_attrs(self): class TestCase7(TestBase): def set_data_feed(self): - x = np.random.uniform(size=[1, 12, 128, 64]) - y = np.random.uniform(size=[1, 12, 128, 64]) + x = np.random.uniform(size=[1, 3, 4, 5]) + y = np.random.uniform(size=[1, 3, 4, 5]) self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} @@ -205,5 +203,35 @@ def set_data_feed(self): self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)} +class TestCase10(TestBase): + + def set_op_attrs(self): + self.attrs = { + "transpose_y": True, + } + + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase11(TestBase): + + def set_op_attrs(self): + self.attrs = { + "transpose_x": True, + } + + def set_data_feed(self): + x = np.random.uniform(size=[4, 3, 2]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py 
index 8151c55326500..6ffb05dfd254b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py @@ -26,8 +26,6 @@ def set_serialize_factor(serialize_factor): op._set_attr('serialize_factor', serialize_factor) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py index 4777c42da138e..37f575f64bd99 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -150,5 +148,35 @@ def set_data_feed(self): } +class TestCase9(TestBase): + + def set_op_attrs(self): + self.attrs = { + "transpose_y": True, + } + + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase10(TestBase): + + def set_op_attrs(self): + self.attrs = { + "transpose_x": True, + } + + def set_data_feed(self): + x = np.random.uniform(size=[4, 3, 2]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py index 72c2c9cc3beed..0f60ed2485e7e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -48,7 +46,7 @@ def build_model(self): x = paddle.static.data(name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - out = paddle.fluid.layers.mean(x) + out = paddle.mean(x) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py new file mode 100644 index 0000000000000..8c3306aed1318 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
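For reference, the IPU operator tests added and touched throughout this diff all follow the same IPUOpTest skeleton: feed identical data as fp32 and fp16, build a static graph once per execution mode, run it through `run_op_test`, and compare the collected outputs with `check`. The following is a minimal sketch of that skeleton, assuming the `IPUOpTest` helpers used above (`static_graph`, `skip_mode`, `run_op_test`, `check`) behave as they do in these tests; the relu op and the [2, 3] shape are illustrative only, not part of this change.

# Hedged sketch of the common IPUOpTest pattern; op and shapes are hypothetical.
import unittest

import numpy as np
import paddle
import paddle.static
from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest


class TestSketch(IPUOpTest):

    def setUp(self):
        self.set_atol()
        self.set_training()
        # Provide the same data in fp32 and fp16 so both precisions are covered.
        data = np.random.uniform(size=[2, 3])
        self.feed_fp32 = {"x": data.astype(np.float32)}
        self.feed_fp16 = {"x": data.astype(np.float16)}
        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
        self.feed_list = list(self.feed_fp32.keys())

    @IPUOpTest.static_graph
    def build_model(self):
        x = paddle.static.data(name=self.feed_list[0],
                               shape=self.feed_shape[0],
                               dtype='float32')
        out = paddle.nn.functional.relu(x)
        self.fetch_list = [out.name]

    def test(self):
        # Rebuild and run the graph for every supported execution mode,
        # then compare the per-mode outputs.
        for m in IPUOpTest.ExecutionMode:
            if not self.skip_mode(m):
                self.build_model()
                self.run_op_test(m)
        self.check()


if __name__ == "__main__":
    unittest.main()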
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_feed(self): + data1 = np.random.uniform(size=[100]) + data2 = np.random.uniform(size=[200]) + self.feed_fp32 = { + 'x': data1.astype(np.float32), + 'y': data2.astype(np.float32) + } + self.feed_fp16 = { + 'x': data1.astype(np.float16), + 'y': data2.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = [0, 1] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + y = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype=self.feed_dtype[1]) + r1, r2 = paddle.meshgrid(x, y) + self.fetch_list = [r1.name, r2.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + for k, v in self.output_dict.items(): + self.output_dict[k] = np.concatenate([vv.flatten() for vv in v]) + self.check() + + +class TestCase1(TestBase): + + def set_feed(self): + data1 = np.random.uniform(size=[10]) + data2 = np.random.uniform(size=[20]) + data3 = np.random.uniform(size=[30]) + self.feed_fp32 = { + 'x': data1.astype(np.float32), + 'y': data2.astype(np.float32), + 'z': data3.astype(np.float32) + } + self.feed_fp16 = { + 'x': data1.astype(np.float16), + 'y': data2.astype(np.float16), + 'z': data3.astype(np.float16) + } + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + y = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype=self.feed_dtype[1]) + z = paddle.static.data(name=self.feed_list[2], + shape=self.feed_shape[2], + dtype=self.feed_dtype[2]) + r1, r2, r3 = paddle.meshgrid(x, y, z) + self.fetch_list = [r1.name, r2.name, r3.name] + + +class TestCase2(TestBase): + + def set_feed(self): + data1 = np.random.uniform(size=[100]) + data2 = np.random.uniform(size=[200]) + self.feed_fp32 = { + 'x': data1.astype(np.int32), + 'y': data2.astype(np.int32) + } + self.feed_fp16 = { + 'x': data1.astype(np.int32), + 'y': data2.astype(np.int32) + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py index ba8f9c7bad51f..21bcb7b7314ab 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py index 4fc3b40f9ab8c..4524c1103052d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py index 81f5295c7dda8..253a87a6b7fa6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py index 27538610a42b7..fb5f25619bf96 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py @@ -12,79 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function +import unittest import numpy as np -import unittest import paddle import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -paddle.enable_static() -SEED = 2021 - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestCastNet(unittest.TestCase): - - def _test(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) +class TestBase(IPUOpTest): - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data(name='image', - shape=[1, 3, 10, 10], - dtype='float32') - with paddle.static.ipu_shard_guard(index=0): - conv1 = paddle.static.nn.conv2d(image, - num_filters=3, - filter_size=3, - bias_attr=False) - with paddle.static.ipu_shard_guard(index=1): - conv2 = paddle.static.nn.conv2d(conv1, - num_filters=3, - filter_size=3, - bias_attr=False) - loss = paddle.mean(conv2) + def setUp(self): + self.set_training() + self.set_data_feed() + self.set_feed_attr() - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - executor = paddle.static.Executor(place) - executor.run(startup_prog) + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 10, 10]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} - if run_ipu: - feed_list = [image.name] - fetch_list = [loss.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(num_ipus=2, - is_training=False, - enable_manual_shard=True) - ipu_strategy.set_pipelining_config(enable_pipelining=False) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog + def set_feed_attr(self): + self.feed_shape = [(1, 3, 10, 10)] + 
self.feed_list = list(self.feed_fp32.keys()) - loss_res = executor.run(program, - feed={"image": np_image}, - fetch_list=[loss]) - return loss_res + @IPUOpTest.static_graph + def build_model(self): + image = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + with paddle.static.ipu_shard_guard(index=0): + conv1 = paddle.static.nn.conv2d(image, + num_filters=3, + filter_size=3, + bias_attr=False) + with paddle.static.ipu_shard_guard(index=1): + conv2 = paddle.static.nn.conv2d(conv1, + num_filters=3, + filter_size=3, + bias_attr=False) + loss = paddle.mean(conv2) + self.fetch_list = [loss.name] - def test_cast(self): - cpu_outputs = self._test(False) - ipu_outputs = self._test(True) + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=2, + is_training=False, + enable_manual_shard=True) + ipu_strategy.set_pipelining_config(enable_pipelining=True, + batches_per_step=2) + self.run_op_test(exec_mode, ipu_strategy=ipu_strategy) - self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=1e-4)) + def test(self): + self.build_model() + self.run_model(IPUOpTest.ExecutionMode.IPU_FP32) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py index 50be6420a5569..a5ace5f1bf1c9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py index c796cc7c02b42..c00a60775eb7f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -90,8 +88,6 @@ def set_data_feed(self): self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestScalar(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py index 6c8c3b113143a..fe5b658426eee 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py index 8822c352b8ba5..e958cfd1f89ba 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - 
"core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py index 5169eddc70307..5041e8804a085 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py @@ -19,8 +19,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py new file mode 100644 index 0000000000000..bd6ff58751d3f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_op_attrs(self): + self.attrs = {"p": 2} + + def set_feed(self): + data = np.random.uniform(size=[2, 3, 4]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.nn.functional.normalize(x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {"axis": 1} + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"p": 3.5, "axis": 1, "epsilon": 1e-3} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py new file mode 100644 index 0000000000000..c006da3c16d92 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py @@ -0,0 +1,150 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from op_test_ipu import IPUOpTest + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_feed(self): + data = np.random.uniform(size=[5, 4, 2, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4]} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + pad = paddle.nn.functional.pad(x, **self.attrs) + self.fetch_list = [pad.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +@unittest.skip("Do not support `pad` as a tensor") +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + const_attrs = { + 'name': 'y', + 'shape': [4], + 'dtype': 'int32', + 'value': 2, + } + y = paddle.fluid.layers.fill_constant(**const_attrs) + pad = paddle.nn.functional.pad(x, pad=y) + self.fetch_list = [pad.name] + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [2, 5], "data_format": "NCL"} + + def set_feed(self): + data = np.random.uniform(size=[4, 2, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [2, 5, 2, 3, 6, 3], "data_format": "NCDHW"} + + def set_feed(self): + data = np.random.uniform(size=[2, 3, 4, 2, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + +class TestCase4(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [2, 2, 1, 1], "mode": "reflect"} + + +@unittest.skip("replicate mode is not supported") +class TestCase5(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4], "mode": "replicate"} + + +@unittest.skip("circular mode is not supported") +class TestCase6(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4], "mode": "circular"} + + +@unittest.skip("Only support NCL, NCHW, NCDHW") +class TestCase7(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2], "data_format": "NLC"} + + +@unittest.skip("Only support NCL, NCHW, NCDHW") +class TestCase7(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4], "data_format": "NHWC"} + + +@unittest.skip("Only support 
NCL, NCHW, NCDHW") +class TestCase7(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4, 1, 3], "data_format": "NDHWC"} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index a9ffeb8dc0106..8a2aa26f1c2d8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index e9fec9a02326d..dca1103a0cd98 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index 3f596f951cd0c..8355f5eefde8c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py index b06b0dc96f17f..b80560dccb3f4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -61,7 +59,6 @@ def build_model(self): def run_model(self, exec_mode): ipu_strategy = paddle.static.IpuStrategy() ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_options({'onnx_dump_path': 'onnx_dump_path.onnx'}) self.run_op_test(exec_mode, ipu_strategy=ipu_strategy) def test(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py index 1c050d1e485b8..a2da444519d29 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index ffa3c6d155025..c78165f86e21a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -20,8 +20,6 @@ from 
paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestMean(IPUOpTest): def setUp(self): @@ -148,5 +146,59 @@ def set_test_op(self): self.op = paddle.fluid.layers.reduce_sum +class TestLogsumexp(TestMean): + + def set_test_op(self): + self.op = paddle.logsumexp + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + if 'dim' in self.attrs: + self.attrs['axis'] = self.attrs['dim'] + del self.attrs['dim'] + if 'keep_dim' in self.attrs: + self.attrs['keepdim'] = self.attrs['keep_dim'] + del self.attrs['keep_dim'] + out = self.op(x, **self.attrs) + self.fetch_list = [out.name] + + +class TestAll(TestMean): + + @property + def fp16_enabled(self): + return False + + def set_data_feed0(self): + data = np.random.choice(a=[False, True], size=(2, 4)) + self.feed_fp32 = {"in_0": data.astype(bool)} + self.set_feed_attr() + + def set_data_feed1(self): + data = np.random.choice(a=[False, True], size=(2, 2, 2)) + self.feed_fp32 = {"in_0": data.astype(bool)} + self.set_feed_attr() + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='bool') + out = self.op(x, **self.attrs) + self.fetch_list = [out.name] + + def set_test_op(self): + self.op = paddle.fluid.layers.reduce_all + + +class TestAny(TestAll): + + def set_test_op(self): + self.op = paddle.fluid.layers.reduce_any + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index 9a8c127ab650c..66358d83ee680 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 32cedf0cdda58..2da63cf733004 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index 1b39ead9b84a8..7c6470af3d10b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -23,8 +23,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py index 8b6b8425b5209..296d365fea602 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py +++ 
b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py index 79527f7a13081..e1f6f7a23f294 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py @@ -19,8 +19,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 2af8de38377b9..9bce0b5df73df 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py b/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py index a7104fd4266f6..ca1cdb4073134 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py @@ -45,8 +45,6 @@ def linear_relu2(self, x): return x -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuShard(unittest.TestCase): def _test(self): @@ -80,8 +78,6 @@ def test_set_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuPipeline(unittest.TestCase): def _test(self): @@ -115,8 +111,6 @@ def test_set_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuShardAndPipeline(unittest.TestCase): def _test(self): @@ -157,8 +151,6 @@ def test_set_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuForModel(unittest.TestCase): def _test(self): @@ -194,8 +186,6 @@ def test_set_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuMixedModel(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py index 3a96d4bb0b9f8..3bcbe417b9861 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index be803e61cf533..ebc05942b9358 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py index 97b0c25f9380e..d3084154a063e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py @@ -21,8 +21,6 @@ import paddle.nn.functional as F -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -106,5 +104,104 @@ def set_data_feed(self): } +class TestCase3(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 7]) + label = np.random.randint(0, 7, [3, 5, 1], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +class TestCase4(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'soft_label': False, + 'return_softmax': True, + 'ignore_index': 1, + } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + if on_ipu: + label = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int32') + else: + label = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int64') + loss, softmax = F.softmax_with_cross_entropy(x, label, **self.attrs) + self.fetch_list = [loss.name, softmax.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['label'] = self.feed_fp32['label'].astype(np.int32) + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase5(TestCase4): + + def set_op_attrs(self): + self.attrs = { + 'soft_label': False, + 'return_softmax': True, + 'ignore_index': 1, + 'axis': 1, + } + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 7, 11]) + label = np.random.randint(0, 5, [3, 1, 7, 11], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +class TestCase6(TestCase4): + + def set_op_attrs(self): + self.attrs = { + 'soft_label': False, + 'return_softmax': True, + 'ignore_index': 1, + 'axis': 2, + } + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 7, 9, 11]) + label = np.random.randint(0, 7, [3, 5, 1, 9, 11], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py index 76b65a015e95f..8d8c5190692dc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py index 1afc79b6a6586..9039dfdb3f006 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index 1828772c07a51..fa0a48081b4a4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py index 084c68654239c..3c4f9ff80d557 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py index 417d9c37675c3..4194887ab2f05 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestTopKOp(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 03068d407b2f3..d7681b38a1728 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py index eac32819f8232..bbf0f7b6996ed 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not 
paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py index 998eee38b5e59..3f3b9f4f89062 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py index b3535c8cd5690..495bc0d656a56 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py new file mode 100644 index 0000000000000..0e2de2817eaff --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py @@ -0,0 +1,120 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_training(self): + # ctcloss only support training currently. 
+ self.is_training = True + self.epoch = 1 + + def set_data_feed(self): + self.batch_size = 16 + self.max_seq_length = 5 + self.max_label_length = 3 + self.num_classes = 5 + self.logits_length = np.array([self.max_seq_length] * self.batch_size, + dtype=np.int64) + self.labels_length = np.array([self.max_label_length] * self.batch_size, + dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + + logits = np.random.uniform( + 0.1, 1.0, [self.max_seq_length, self.batch_size, self.num_classes + ]).astype("float32") + labels = np.random.randint(0, + self.num_classes - 1, + [self.batch_size, self.max_label_length], + dtype="int32") + + self.feed_fp32 = { + "Logits": logits, + "Label": labels, + "input_length": self.logits_length.astype("int64"), + "label_length": self.labels_length.astype("int64"), + } + self.feed_fp16 = { + "Logits": logits.astype(np.float16), + "Label": labels, + "input_length": self.logits_length.astype("int64"), + "label_length": self.labels_length.astype("int64"), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = { + "blank": self.blank, + "norm_by_times": self.norm_by_times, + } + + @IPUOpTest.static_graph + def build_model(self): + data = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + logits = paddle.nn.Linear(self.num_classes, + self.num_classes, + bias_attr=False)(data) + labels = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int32') + input_length = paddle.static.data(name=self.feed_list[2], + shape=self.feed_shape[2], + dtype='int64') + label_length = paddle.static.data(name=self.feed_list[3], + shape=self.feed_shape[3], + dtype='int64') + out = paddle.fluid.layers.warpctc(logits, + labels, + input_length=input_length, + label_length=label_length, + **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name, out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py index c2fa0e672729c..7fb467fced752 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py @@ -22,8 +22,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") class TestBase(IPUOpTest): @@ -36,6 +34,7 @@ def setUp(self): self.model_path = os.path.join(self.temp_dir.name, "weight_decay") def tearDown(self): + super().tearDown() self.temp_dir.cleanup() def set_atol(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py index 52e88119af0e9..c06880b980854 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest 
-@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestWeightSharing(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 6f79a248cf38b..7a67bf95d15a8 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -28,6 +28,13 @@ if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_c_allreduce") endif() +if(WIN32) + list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES + "test_trt_convert_fused_token_prune") + list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_fused_token_prune") + list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_fused_token_prune") +endif() + # Only for cpu(mkl + openblas) set(TEST_INFERENCE_CPU_UT "test_mul_lstm_fuse_pass" "test_mul_gru_fuse_pass") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py new file mode 100644 index 0000000000000..20028fb335b8f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
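The auto-scan test that follows builds the legacy `matmul` op directly through `OpConfig`. As a rough public-API analogue of the pattern the `matmul_activation_mkldnn_fuse_pass` targets (a matmul immediately followed by an activation), the sketch below builds the same two-op graph in static mode; note that `paddle.matmul` may lower to `matmul_v2` rather than the legacy `matmul` op in recent releases, and the shapes and tensor names are illustrative only.

# Hedged sketch of the matmul + activation pattern the fuse pass looks for.
import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog):
    x = paddle.static.data(name='x', shape=[4, 8, 32, 16], dtype='float32')
    y = paddle.static.data(name='y', shape=[4, 8, 16, 64], dtype='float32')
    out = paddle.matmul(x, y)              # matmul op
    out = paddle.nn.functional.relu(out)   # activation the pass may fold into it

exe = paddle.static.Executor(paddle.CPUPlace())
res = exe.run(main_prog,
              feed={
                  'x': np.random.rand(4, 8, 32, 16).astype(np.float32),
                  'y': np.random.rand(4, 8, 16, 64).astype(np.float32),
              },
              fetch_list=[out])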
+ +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest +import hypothesis.strategies as st + + +class TestMatmulActivationMkldnnFusePass(PassAutoScanTest): + + def sample_program_config(self, draw): + transpose_X = draw(st.booleans()) + transpose_Y = draw(st.booleans()) + alpha = draw(st.sampled_from([1, 2])) + batch_size = draw(st.sampled_from([4])) + channel = draw(st.sampled_from([8])) + input_dim = draw(st.sampled_from([32])) + activation_type = draw( + st.sampled_from([ + 'relu', 'gelu', 'tanh', 'sigmoid', 'swish', 'mish', 'sqrt', + 'hard_swish', 'sigmoid', 'abs', 'relu6', 'clip', 'tanh', + 'hard_sigmoid', 'leaky_relu' + ])) + + def generate_input(type): + if transpose_X and transpose_Y: + shape_x = [batch_size, channel, input_dim, 32] + shape_y = [batch_size, channel, 64, input_dim] + elif transpose_X: + shape_x = [batch_size, channel, input_dim, 32] + shape_y = [batch_size, channel, input_dim, 64] + elif transpose_Y: + shape_x = [batch_size, channel, 32, input_dim] + shape_y = [batch_size, channel, 8, input_dim] + else: + shape_x = [batch_size, channel, 32, input_dim] + shape_y = [batch_size, channel, input_dim, 16] + + if type == 'x': + return np.random.random(shape_x).astype(np.float32) + else: + return np.random.random(shape_y).astype(np.float32) + + matmul_op = OpConfig(type='matmul', + inputs={ + 'X': ['matmul_X'], + 'Y': ['matmul_Y'] + }, + outputs={'Out': ['matmul_output']}, + attrs={ + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha + }) + + if activation_type == "relu6": + activation_op = OpConfig(activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + threshold=draw( + st.floats(min_value=1.0, + max_value=10.0))) + elif activation_type == "leaky_relu": + activation_op = OpConfig(activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + alpha=draw( + st.floats(min_value=0.1, + max_value=1.0))) + elif activation_type == "swish": + activation_op = OpConfig(activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + beta=draw( + st.floats(min_value=0.1, + max_value=1.0))) + elif activation_type == "clip": + activation_op = OpConfig( + activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + min=draw(st.floats(min_value=0.1, max_value=0.49)), + max=draw(st.floats(min_value=0.5, max_value=1.0))) + else: + activation_op = OpConfig(activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}) + + model_net = [matmul_op, activation_op] + + program_config = ProgramConfig( + ops=model_net, + weights={}, + inputs={ + 'matmul_X': TensorConfig(data_gen=partial(generate_input, 'x')), + 'matmul_Y': TensorConfig(data_gen=partial(generate_input, 'y')) + }, + outputs=['activation_output']) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ['matmul'], (1e-5, 1e-5) + + def test(self): + self.run_and_statis(quant=False, + max_examples=30, + passes=['matmul_activation_mkldnn_fuse_pass']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py index 
0c25a790138cd..5e5dd4c719d98 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py @@ -23,8 +23,8 @@ class SoftplusActivationReluOneDNNFusePassTest(InferencePassTest): - fuse_activation_alpha = None - fuse_activation_beta = None + fuse_alpha = None + fuse_beta = None pass_name = 'softplus_activation_mkldnn_fuse_pass' def setUp(self): @@ -34,13 +34,13 @@ def setUp(self): shape=[-1, 3, 100, 100], dtype="float32") softplus_out = fluid.layers.softplus(data) - if self.fuse_activation_beta is not None: - activation_out = self.fuse_activation( - softplus_out, self.fuse_activation_alpha, - self.fuse_activation_beta) - elif self.fuse_activation_alpha is not None: - activation_out = self.fuse_activation( - softplus_out, self.fuse_activation_alpha) + if self.fuse_beta is not None: + activation_out = self.fuse_activation(softplus_out, + self.fuse_alpha, + self.fuse_beta) + elif self.fuse_alpha is not None: + activation_out = self.fuse_activation(softplus_out, + self.fuse_alpha) else: activation_out = self.fuse_activation(softplus_out) @@ -73,7 +73,7 @@ class SoftplusActivationLeakyReluOneDNNFusePassTest( def set_params(self): self.fuse_activation = fluid.layers.leaky_relu - self.fuse_activation_alpha = 0.3 + self.fuse_alpha = 0.3 class SoftplusActivationSwishOneDNNFusePassTest( @@ -81,7 +81,7 @@ class SoftplusActivationSwishOneDNNFusePassTest( def set_params(self): self.fuse_activation = fluid.layers.swish - self.fuse_activation_alpha = 3 + self.fuse_alpha = 3 class SoftplusActivationHardSwishOneDNNFusePassTest( @@ -110,8 +110,8 @@ class SoftplusActivationClipOneDNNFusePassTest( def set_params(self): self.fuse_activation = fluid.layers.clip - self.fuse_activation_alpha = 1.1 - self.fuse_activation_beta = 5.2 + self.fuse_alpha = 1.1 + self.fuse_beta = 5.2 class SoftplusActivationGeluErfOneDNNFusePassTest( @@ -126,7 +126,7 @@ class SoftplusActivationGeluTanhOneDNNFusePassTest( def set_params(self): self.fuse_activation = fluid.layers.gelu - self.fuse_activation_alpha = True # simulated "Approximate" attr + self.fuse_alpha = True # simulated "Approximate" attr class SoftplusActivationRelu6OneDNNFusePassTest( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py index f800d2fc3f4de..a726e2cd061f0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py @@ -18,6 +18,7 @@ import numpy as np from inference_pass_test import InferencePassTest from quant_dequant_test import QuantDequantTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker @@ -54,7 +55,7 @@ def network(): cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816]) result = fluid.layers.relu(cout) loss = fluid.layers.cross_entropy(input=result, label=label_shape) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, result self.main_program.random_seed = 2 @@ -152,7 +153,7 @@ def network(): cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816]) result = fluid.layers.relu(cout) loss = fluid.layers.cross_entropy(input=result, label=label_shape) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, 
result self.main_program.random_seed = 2 @@ -245,7 +246,7 @@ def network(): cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816]) result = fluid.layers.relu(cout) loss = fluid.layers.cross_entropy(input=result, label=label_shape) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, result self.main_program.random_seed = 2 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py index 0db051560516d..cab61143b7737 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -219,5 +219,123 @@ def test_quant(self): self.run_test(quant=True) +# Special case +class TrtConvertConv2dTransposeTest2(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, num_channels, attrs: List[Dict[str, Any]]): + return np.ones([batch, num_channels, 20, 30]).astype(np.float32) + + def generate_weight1(num_channels, attrs: List[Dict[str, Any]]): + return np.random.random([num_channels, 64, 3, 3]).astype(np.float32) + + num_channels = 128 + batch = 1 + + self.num_channels = num_channels + dics = [{ + "data_fromat": 'NCHW', + "dilations": [1, 1], + "padding_algorithm": 'EXPLICIT', + "groups": 1, + "paddings": [1, 1], + "strides": [2, 2], + "output_padding": [1, 1], + "output_size": [], + }] + + ops_config = [{ + "op_type": "conv2d_transpose", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig( + data_gen=partial(generate_weight1, num_channels, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial(generate_input1, batch, + num_channels, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 128, 20, 30], + } + self.dynamic_shape.max_input_shape = { + "input_data": [1, 128, 20, 30], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 128, 20, 30], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 0, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), 
generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-3) + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 2fabc6013893e..c692b3f9d677f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -21,6 +21,204 @@ from typing import Optional, List, Callable, Dict, Any, Set +# This is the special test case with weight including batch dimension +# I don't want to mess up the code written by others, so I wrote a class specifically +class TrtConvertElementwiseTest_one_input_special_case0(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + def generate_weight(): + return np.random.randn(1, 32, 1, 1).astype(np.float32) + + for batch in [1, 4]: + for shape in [[batch, 32, 16, 32]]: + for op_type in ["elementwise_add", "elementwise_mul"]: + for axis in [-1]: + self.dims = len(shape) + dics = [{"axis": axis}] + ops_config = [{ + "op_type": op_type, + "op_inputs": { + "X": ["input_data"], + "Y": ["weight"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "weight": + TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input, shape)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + # The input.dims[1] must be equal to the weight's length. 
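# Side note (not part of the patch): a minimal NumPy sketch of the broadcast this
# special case exercises -- an NCHW activation combined with a per-channel weight
# whose leading batch dimension is kept, shape [1, C, 1, 1]. The input's channel
# dim must equal the weight's channel dim for the elementwise add/mul to broadcast.
import numpy as np

x = np.random.random([4, 32, 16, 32]).astype(np.float32)  # [N, C, H, W], C = 32
w = np.random.randn(1, 32, 1, 1).astype(np.float32)       # weight keeping the batch dim

out_add = x + w   # broadcasts over N, H and W; needs x.shape[1] == w.shape[1]
out_mul = x * w
assert out_add.shape == x.shape and out_mul.shape == x.shape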
+ if self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 32, 4, 4] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 32, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 32, 16, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +# This is the special test case +class TrtConvertElementwiseTest_one_input_special_case1(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + def generate_weight(): + return np.random.randn(1).astype(np.float32) + + for shape in [[32]]: + for op_type in ["elementwise_add", "elementwise_mul"]: + for axis in [-1]: + self.dims = len(shape) + dics = [{"axis": axis}] + ops_config = [{ + "op_type": op_type, + "op_inputs": { + "X": ["input_data"], + "Y": ["weight"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "weight": + TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input, shape)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [32]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield 
self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: @@ -103,7 +301,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if self.dims == 1: + if self.dims == 1 and not dynamic_shape: return 0, 3 return 1, 2 @@ -141,10 +339,6 @@ class TrtConvertElementwiseTest_two_input_without_broadcast( TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - inputs = program_config.inputs - if len(inputs['input_data1'].shape) == 1: - return False - return True def sample_program_configs(self): @@ -250,6 +444,11 @@ def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} self.dynamic_shape.opt_input_shape = {} + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1 and not dynamic_shape: + return 0, 4 + return 1, 3 + attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] @@ -257,9 +456,11 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -416,15 +617,19 @@ def sample_program_configs(self): def generate_input(shape): return np.random.random(shape).astype(np.float32) + # use rand not randn to avoiding pow producing `NAN` def generate_weight(): - return np.random.randn(32).astype(np.float32) + return np.random.rand(32).astype(np.float32) for batch in [1, 2, 4]: for shape in [[32], [batch, 32], [batch, 32, 32], [batch, 32, 16, 32]]: for op_type in [ - "elementwise_add", "elementwise_mul", "elementwise_sub", - "elementwise_div", "elementwise_pow" + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", ]: for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) @@ -492,11 +697,6 @@ def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} self.dynamic_shape.opt_input_shape = {} - def generate_trt_nodes_num(attrs, dynamic_shape): - if self.dims == 1: - return 0, 3 - return 1, 2 - attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] @@ -504,33 +704,19 @@ def generate_trt_nodes_num(attrs, dynamic_shape): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + yield self.create_inference_config(), (0, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + yield self.create_inference_config(), (0, 3), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), 
generate_trt_nodes_num( - attrs, True), 1e-5 + yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + yield self.create_inference_config(), (1, 2), 1e-5 def add_skip_trt_case(self): - - def teller1(program_config, predictor_config): - input_x_names = program_config.ops[0].inputs["X"] - for weight_name in program_config.weights: - if weight_name in input_x_names: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_SUPPORT, - "Input X should not be parameters in elementwise op.") + pass def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py new file mode 100644 index 0000000000000..9b6badf394e0b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py @@ -0,0 +1,361 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import unittest +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import os + + +class TrtConvertFcTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + # The output has diff between gpu and trt in CI windows + if (os.name == 'nt'): + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.random.random([batch, 3, 64, (int)(attrs[0]["m"] / 2), + 2]).astype(np.float32) + + def generate_w(batch, attrs: List[Dict[str, Any]]): + return np.random.random([attrs[0]["m"], + attrs[0]["n"]]).astype(np.float32) + + def generate_bias(batch, attrs: List[Dict[str, Any]]): + return np.random.random([attrs[0]["n"]]).astype(np.float32) + + for batch in [1, 4]: + for [m, n] in [[32, 23]]: + dics = [ + { + "in_num_col_dims": 3, + # for my conveinence + "m": m, + "n": n, + }, + {} + ] + + ops_config = [ + { + "op_type": "fc", + "op_inputs": { + "Input": ["input_data"], + "W": ["w_data"], + "Bias": ["bias_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "w_data": + TensorConfig(data_gen=partial(generate_w, batch, dics)), + "bias_data": + TensorConfig( + data_gen=partial(generate_bias, batch, dics)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input1, batch, dics)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], 
float): + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 16, 2], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 16, 2], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 16, 2], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # # for static_shape + # clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def test(self): + self.run_test() + + def test_quant(self): + self.run_test(quant=True) + + +class TrtConvertFcTest2(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + # The output has diff between gpu and trt in CI windows + if (os.name == 'nt'): + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.random.random([batch, 3, 64, 14]).astype(np.float32) + + def generate_w(batch, attrs: List[Dict[str, Any]]): + return np.random.random([attrs[0]["m"], + attrs[0]["n"]]).astype(np.float32) + + def generate_bias(batch, attrs: List[Dict[str, Any]]): + return np.random.random([attrs[0]["n"]]).astype(np.float32) + + for batch in [1, 4]: + for [m, n] in [[14, 43]]: + dics = [ + { + "in_num_col_dims": 3, + # for my conveinence + "m": m, + "n": n, + }, + {} + ] + + ops_config = [ + { + "op_type": "fc", + "op_inputs": { + "Input": ["input_data"], + "W": ["w_data"], + "Bias": ["bias_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "w_data": + TensorConfig(data_gen=partial(generate_w, batch, dics)), + "bias_data": + TensorConfig( + data_gen=partial(generate_bias, batch, dics)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input1, batch, dics)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 14], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 14], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 14], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + # # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield 
self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + + def test(self): + self.run_test() + + +# this is the special case when x_dim.nbDims == 4 && x_num_col_dims == 1 +class TrtConvertFcTest3(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.ones([batch, 14, 1, 2]).astype(np.float32) + + def generate_w(batch, attrs: List[Dict[str, Any]]): + return np.ones([attrs[0]["m"], attrs[0]["n"]]).astype(np.float32) + + def generate_bias(batch, attrs: List[Dict[str, Any]]): + return np.ones([attrs[0]["n"]]).astype(np.float32) + + for batch in [1, 4]: + for [m, n] in [[28, 43]]: + dics = [ + { + "in_num_col_dims": 1, + "Input_scale": 0.1, + "out_threshold": 0.1, + "enable_int8": True, + # for my conveinence + "m": m, + "n": n, + }, + {} + ] + + ops_config = [ + { + "op_type": "fc", + "op_inputs": { + "Input": ["input_data"], + "W": ["w_data"], + "Bias": ["bias_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "w_data": + TensorConfig(data_gen=partial(generate_w, batch, dics)), + "bias_data": + TensorConfig( + data_gen=partial(generate_bias, batch, dics)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input1, batch, dics)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 14, 1, 2], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 14, 1, 2], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 14, 1, 2], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + + def test(self): + self.run_test() + + def test_quant(self): + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py new file mode 100644 index 
0000000000000..84ee70782acc2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py @@ -0,0 +1,142 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertSplitTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_value_data(attrs: List[Dict[str, Any]]): + return np.array([1]).astype(np.int32) + + def generate_shape_data(attrs: List[Dict[str, Any]]): + return np.array([4, 23]).astype(np.int32) + + def generate_shapelist_data(attrs: List[Dict[str, Any]]): + return np.array([4]).astype(np.int32) + + for shape in [[2, 3, 4]]: + for num_input in [0, 1, 2, 3]: + for dtype in [5, 2, 3]: + for str_value in ["2", "23", "-1"]: + self.num_input = num_input + dics = [{ + "str_value": str_value, + "shape": shape, + "dtype": dtype + }, { + "axis": -1 + }] + dics_intput = [{ + "ValueTensor": ["value_data"] + }, { + "ShapeTensor": ["shape_data"], + }, { + "ShapeTensorList": ["shapeT1_data", "shapeT2_data"], + }, {}] + ops_config = [ + { + "op_type": "fill_constant", + "op_inputs": dics_intput[num_input], + "op_outputs": { + "Out": ["out_data"], + }, + "op_attrs": dics[0] + }, + ] + + def generate_input(): + return np.random.random([1, 1]).astype(np.float32) + + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "value_data": + TensorConfig(data_gen=partial( + generate_value_data, dics)), + "shape_data": + TensorConfig(data_gen=partial( + generate_shape_data, dics)), + "shapeT1_data": + TensorConfig(data_gen=partial( + generate_shapelist_data, dics)), + "shapeT2_data": + TensorConfig(data_gen=partial( + generate_shapelist_data, dics)), + }, + outputs=["out_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.input_shape = [1, 1] + max_shape = list(self.input_shape) + min_shape = list(self.input_shape) + opt_shape = list(self.input_shape) + for i in range(len(self.input_shape)): + max_shape[i] = max_shape[i] + 1 + self.dynamic_shape.min_input_shape = {"Y_data": min_shape} + self.dynamic_shape.max_input_shape = {"Y_data": max_shape} + self.dynamic_shape.opt_input_shape = {"Y_data": opt_shape} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if (self.num_input < 3): + return 0, 6 + return 1, 5 + + attrs = [ + 
program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + # Don't test static shape + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fused_token_prune.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fused_token_prune.py new file mode 100644 index 0000000000000..85c56506de5cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fused_token_prune.py @@ -0,0 +1,129 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertFusedTokenPruneTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_attn_or_mask(attrs: List[Dict[str, Any]]): + return np.ones([4, 12, 64, 64]).astype(np.float32) + + def generate_x(attrs: List[Dict[str, Any]]): + return np.random.random([4, 64, 76]).astype(np.float32) + + def generate_new_mask(attrs: List[Dict[str, Any]]): + return np.random.random([4, 12, 32, 32]).astype(np.float32) + + for keep_first_token in [True, False]: + for keep_order in [True, False]: + dics = [{ + "keep_first_token": keep_first_token, + "keep_order": keep_order + }] + ops_config = [{ + "op_type": "fused_token_prune", + "op_inputs": { + "Attn": ["attn"], + "X": ["x"], + "Mask": ["mask"], + "NewMask": ["new_mask"] + }, + "op_outputs": { + "SlimmedX": ["slimmed_x"], + "CLSInds": ["cls_inds"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "attn": + TensorConfig( + data_gen=partial(generate_attn_or_mask, dics)), + "x": + TensorConfig(data_gen=partial(generate_x, dics)), + "mask": + TensorConfig( + data_gen=partial(generate_attn_or_mask, dics)), + "new_mask": + TensorConfig(data_gen=partial(generate_new_mask, dics)) + }, + outputs=["slimmed_x", "cls_inds"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "attn": [4, 12, 64, 
64], + "x": [4, 64, 76], + "mask": [4, 12, 64, 64], + "new_mask": [4, 12, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "attn": [4, 12, 64, 64], + "x": [4, 64, 76], + "mask": [4, 12, 64, 64], + "new_mask": [4, 12, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "attn": [4, 12, 64, 64], + "x": [4, 64, 76], + "mask": [4, 12, 64, 64], + "new_mask": [4, 12, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 6 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5, 1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5, 1e-5, 1e-5) + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py index e05a78e66b900..7902a35a9a6b4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py @@ -48,12 +48,16 @@ def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]]): if self.dims == 4: + self.input_shape = [1, 2, 4, 6] return np.ones([1, 2, 4, 6]).astype(np.float32) elif self.dims == 3: + self.input_shape = [1, 8, 6] return np.ones([1, 8, 6]).astype(np.float32) elif self.dims == 2: + self.input_shape = [1, 48] return np.ones([1, 48]).astype(np.float32) elif self.dims == 1: + self.input_shape = [48] return np.ones([48]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): @@ -66,69 +70,36 @@ def generate_shapeT2_data(attrs: List[Dict[str, Any]]): return np.array([24]).astype(np.int32) for dims in [4, 3, 2, 1]: - for num_input in [0, 1, 2, 3]: - for shape in [[1, 6, 8], [1, 2, 4, 6], [1, 1, 0, 12], [1, 0, 6], - [1, -1, 12], [2, -1], [3, 16], [3, 4, 4], [48]]: - dics = [{ + for shape in [[1, 6, 8], [1, 2, 4, 6], [1, 1, 0, 12], [1, 0, 6], + [1, -1, 12], [2, -1], [3, 16], [3, 4, 4], [48], + [-1, 48]]: + dics = [ + { "shape": shape, - }, {}] - self.num_input = num_input - self.dims = dims - dics_intput = [{ - "X": ["reshape_input"], - "Shape": ["shape_data"], - "ShapeTensor": ["shapeT1_data", "shapeT2_data"], - }, { - "X": ["reshape_input"], - "Shape": ["shape_data"], - }, { - "X": ["reshape_input"], - "ShapeTensor": ["shapeT1_data", "shapeT2_data"], - }, { - "X": ["reshape_input"] - }] - - dics_weight = [{ - "shape_data": - TensorConfig(data_gen=partial(generate_weight1, dics)), - "shapeT1_data": - TensorConfig( - data_gen=partial(generate_shapeT1_data, dics)), - "shapeT2_data": - TensorConfig( - data_gen=partial(generate_shapeT2_data, dics)) - }, { - "shape_data": - TensorConfig(data_gen=partial(generate_weight1, dics)) - }, { - "shapeT1_data": - TensorConfig( - data_gen=partial(generate_shapeT1_data, dics)), - "shapeT2_data": - TensorConfig( - data_gen=partial(generate_shapeT2_data, dics)) - }, {}] - - ops_config = [{ - "op_type": "reshape", - "op_inputs": dics_intput[num_input], - "op_outputs": { - "Out": ["reshape_out"] - }, - 
"op_attrs": dics[0] - }] - ops = self.generate_op_config(ops_config) - program_config = ProgramConfig( - ops=ops, - weights=dics_weight[num_input], - inputs={ - "reshape_input": - TensorConfig( - data_gen=partial(generate_input1, dics)) - }, - outputs=["reshape_out"]) + }, + ] + self.dims = dims + dics_intput = [{"X": ["reshape_input"]}] + + ops_config = [{ + "op_type": "reshape", + "op_inputs": dics_intput[0], + "op_outputs": { + "Out": ["reshape_out"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape_input": + TensorConfig(data_gen=partial(generate_input1, dics)) + }, + outputs=["reshape_out"]) - yield program_config + yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): @@ -169,22 +140,31 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): + # in static shape mode, here is consistent with op_teller.cc + if (not dynamic_shape): + if (attrs[0]['shape'][0] == 0): + return 1, 2 + elif (len(attrs[0]['shape']) == 1): + return 0, 3 + elif (np.prod(attrs[0]['shape'][1:]) == np.prod( + self.input_shape[1:])): + return 1, 2 + else: + return 0, 3 return 1, 2 attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - if attrs[0]['shape'][0] > 1 and len(attrs[0]['shape']) > 1: - pass - else: - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -196,14 +176,243 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, True), 1e-5 def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +# reshape having three inputs. 
+class TrtConvertReshapeTest2(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): - def teller1(program_config, predictor_config): - if len(program_config.weights) >= 1: - return True - return False + def generate_input1(attrs: List[Dict[str, Any]]): + if self.dims == 4: + return np.random.random([1, 2, 4, 6]).astype(np.float32) + elif self.dims == 3: + return np.random.random([1, 8, 6]).astype(np.float32) + elif self.dims == 2: + return np.random.random([1, 48]).astype(np.float32) + elif self.dims == 1: + return np.random.random([48]).astype(np.float32) - self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, - "INPUT ShapeTensor and Shape NOT SUPPORT") + for dims in [4, 3, 2, 1]: + for shape in [[-1, 48]]: + dics = [{ + "shape": shape, + }, {}] + self.dims = dims + dics_intput = [ + { + "X": ["reshape_input"], + "ShapeTensor": ["shapeT1_data", "shapeT2_data"], + }, + ] + ops_config = [ + { + "op_type": "fill_constant", + "op_inputs": {}, + "op_outputs": { + "Out": ["shapeT1_data"] + }, + "op_attrs": { + "dtype": 2, + "str_value": "2", + "shape": [1], + }, + }, + { + "op_type": "fill_constant", + "op_inputs": {}, + "op_outputs": { + "Out": ["shapeT2_data"] + }, + "op_attrs": { + "dtype": 2, + "str_value": "24", + "shape": [1], + }, + }, + { + "op_type": "reshape", + "op_inputs": dics_intput[0], + "op_outputs": { + "Out": ["reshape_out"] + }, + "op_attrs": dics[0] + }, + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape_input": + TensorConfig(data_gen=partial(generate_input1, dics)) + }, + outputs=["reshape_out"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + if self.dims == 4: + self.dynamic_shape.min_input_shape = { + "reshape_input": [1, 2, 4, 6] + } + self.dynamic_shape.max_input_shape = { + "reshape_input": [4, 2, 4, 6] + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": [1, 2, 4, 6] + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "reshape_input": [1, 8, 6] + } + self.dynamic_shape.max_input_shape = { + "reshape_input": [4, 8, 6] + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": [1, 8, 6] + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"reshape_input": [1, 48]} + self.dynamic_shape.max_input_shape = {"reshape_input": [4, 48]} + self.dynamic_shape.opt_input_shape = {"reshape_input": [1, 48]} + elif self.dims == 1: + self.dynamic_shape.min_input_shape = {"reshape_input": [48]} + self.dynamic_shape.max_input_shape = {"reshape_input": [48]} + self.dynamic_shape.opt_input_shape = {"reshape_input": [48]} + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +# reshape having 2 inputs. 
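# Side note (not part of the patch): an approximate eager-mode analogue of what the test
# above assembles as a graph -- the reshape target is built from tensors produced by two
# fill_constant ops (str_value "2" and "24") and fed in through ShapeTensor instead of a
# compile-time attribute. Shapes and values mirror the dims == 4 branch. (The
# TrtConvertReshapeTest3 class that follows exercises the two-input variant, where the
# whole target shape comes from a single tensor.)
import paddle

x = paddle.rand([1, 2, 4, 6])              # 48 elements in total
d0 = paddle.full([1], 2, dtype='int32')    # analogue of the first fill_constant
d1 = paddle.full([1], 24, dtype='int32')   # analogue of the second fill_constant
y = paddle.reshape(x, shape=[d0, d1])      # the shape list may mix ints and tensors
print(y.shape)                             # [2, 24]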
+class TrtConvertReshapeTest3(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_input1(attrs: List[Dict[str, Any]]): + if self.dims == 4: + return np.random.random([1, 2, 12, 6]).astype(np.float32) + elif self.dims == 3: + return np.random.random([1, 8, 18]).astype(np.float32) + elif self.dims == 2: + return np.random.random([1, 144]).astype(np.float32) + elif self.dims == 1: + return np.random.random([144]).astype(np.float32) + + for dims in [4, 3, 2, 1]: + for shape in [[-1, 144]]: + dics = [{ + "shape": shape, + }, {}] + self.dims = dims + dics_intput = [ + { + "X": ["reshape_input"], + "shape_data": ["shape_data"], + }, + ] + ops_config = [ + { + "op_type": "fill_constant", + "op_inputs": {}, + "op_outputs": { + "Out": ["shape_data"] + }, + "op_attrs": { + "dtype": 2, + "str_value": "12", + "shape": [2], + }, + }, + { + "op_type": "reshape", + "op_inputs": dics_intput[0], + "op_outputs": { + "Out": ["reshape_out"] + }, + "op_attrs": dics[0] + }, + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape_input": + TensorConfig(data_gen=partial(generate_input1, dics)) + }, + outputs=["reshape_out"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + if self.dims == 4: + self.dynamic_shape.min_input_shape = { + "reshape_input": [1, 2, 12, 6] + } + self.dynamic_shape.max_input_shape = { + "reshape_input": [4, 2, 12, 6] + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": [1, 2, 12, 6] + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "reshape_input": [1, 8, 18] + } + self.dynamic_shape.max_input_shape = { + "reshape_input": [4, 8, 18] + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": [1, 8, 18] + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"reshape_input": [1, 144]} + self.dynamic_shape.max_input_shape = {"reshape_input": [4, 144]} + self.dynamic_shape.opt_input_shape = {"reshape_input": [1, 144]} + elif self.dims == 1: + self.dynamic_shape.min_input_shape = {"reshape_input": [144]} + self.dynamic_shape.max_input_shape = {"reshape_input": [144]} + self.dynamic_shape.opt_input_shape = {"reshape_input": [144]} + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-5 + + def add_skip_trt_case(self): + pass def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py index 76a84c77122c5..deac7ef9d2a14 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py @@ -111,13 +111,6 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - inputs = program_config.inputs - if dynamic_shape == True and len(attrs[0]["decrease_axis"]) == 0: - return 1, 2 - if dynamic_shape == True and len(attrs[0]["decrease_axis"]) != 1: - return 0, 3 - if dynamic_shape == False and len(attrs[0]["decrease_axis"]) != 0: - return 0, 3 
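# Side note (not part of the patch): for context on the "decrease_axis" checks removed
# above -- the attribute lists axes whose length-1 slice result is squeezed away.
# Roughly, in NumPy terms:
import numpy as np

x = np.random.random([1, 3, 6, 6]).astype(np.float32)
out = x[:, 1:2, :, :]               # slice of length 1 along axis 1 -> shape [1, 1, 6, 6]
out_decreased = np.squeeze(out, 1)  # decrease_axis=[1] squeezes it -> shape [1, 6, 6]
print(out.shape, out_decreased.shape)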
if not dynamic_shape: for x in attrs[0]["axes"]: if x == 0: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py index e62b6557844c9..cf1fa96c1204d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py @@ -18,6 +18,7 @@ import numpy as np from inference_pass_test import InferencePassTest from quant_dequant_test import QuantDequantTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig @@ -40,7 +41,7 @@ def network(): act="relu") result = fluid.layers.relu(fc_out) loss = fluid.layers.cross_entropy(input=result, label=self.label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, result self.main_program.random_seed = 2 @@ -105,7 +106,7 @@ def network(): c_out = fluid.layers.reshape(fc_out, shape=[0, 784]) result = fluid.layers.relu(c_out) loss = fluid.layers.cross_entropy(input=result, label=self.label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, result self.main_program.random_seed = 2 @@ -172,7 +173,7 @@ def network(): c_out = fluid.layers.reshape(fc_out, shape=[1, 1, 2744]) result = fluid.layers.relu(c_out) loss = fluid.layers.cross_entropy(input=result, label=label_shape) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, result self.main_program.random_seed = 2 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py index 01f65b54bd4ae..baf02fc423309 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py @@ -16,6 +16,7 @@ import numpy as np from inference_pass_test import InferencePassTest from quant_dequant_test import QuantDequantTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker @@ -44,7 +45,7 @@ def network(): act=None) result = fluid.layers.relu(fc_out) loss = fluid.layers.cross_entropy(input=result, label=self.label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, result self.main_program.random_seed = 2 @@ -136,7 +137,7 @@ def network(): act=None) result = fluid.layers.relu(fc_out) loss = fluid.layers.cross_entropy(input=result, label=self.label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, result self.main_program.random_seed = 2 @@ -227,7 +228,7 @@ def network(): act=None) result = fluid.layers.relu(fc_out) loss = fluid.layers.cross_entropy(input=result, label=self.label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, result self.main_program.random_seed = 2 diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py index 56e31aa705ff2..04e98202dd4a3 100644 --- a/python/paddle/fluid/tests/unittests/ir/pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py @@ -20,7 +20,7 @@ import unittest import warnings import numpy as np - +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from 
paddle.fluid.framework import Program, Block @@ -56,7 +56,7 @@ def grad(self, var): def append_gradients(self, outs): with fluid.program_guard(self.main_program, self.startup_program): - loss = fluid.layers.mean(outs) + loss = paddle.mean(outs) fluid.backward.append_backward(loss) def check_output(self, startup_on_cpu=False, atol=1e-5): diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py index 0c9170242e7de..3f8703b657e8d 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py @@ -41,7 +41,7 @@ def linear_fc(num): for _ in six.moves.xrange(num): hidden = fluid.layers.fc(hidden, size=128, act='relu') loss = fluid.layers.cross_entropy(input=hidden, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss main_program = Program() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 0471c295ad45d..91487fb0ab64d 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -246,16 +246,6 @@ def init_group(self): self.groups = 3 -# TODO(chenweihang): To solve the coverage problem, add this unittest, -# remove this unittest after new executor set to default executor -class TestConv2dMKLDNNByNewExecutor(TestConv2DMKLDNNOp): - - def test_check_output_by_new_executor(self): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - self.test_check_output() - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] - - if __name__ == '__main__': from paddle import enable_static enable_static() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py index 2ae717d64a302..dc9a3862e0421 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py @@ -68,6 +68,14 @@ def init_input_output(self): self.out = np.add(self.x, self.y) +class TestMKLDNNElementwiseAddOpBroadcastXintoY(TestMKLDNNElementwiseAddOp): + + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 50, 1]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [2, 50, 160]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp): def init_input_output(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index cac8e95521d31..385879c08a72f 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -50,5 +50,7 @@ if(WITH_MLU) set_tests_properties(test_collective_allgather_api_mlu PROPERTIES TIMEOUT 120) set_tests_properties(test_c_comm_init_op_mlu PROPERTIES TIMEOUT 120) + set_tests_properties(test_sync_batch_norm_op_mlu_baseline PROPERTIES TIMEOUT + 120) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py new file mode 100644 index 0000000000000..4f80523a18254 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys + +sys.path.append("..") +import signal +import time +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_sync_batch_norm_base_mlu import TestSyncBatchNormRunnerBase, runtime_main +from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator + +from paddle.fluid.tests.unittests.test_sync_batch_norm_op import create_or_get_tensor + +_set_use_system_allocator(False) +paddle.enable_static() + + +class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase): + + def __init__(self): + self.global_ring_id = 0 + + self.dtype = np.float32 + self.N = 8 + self.C = 16 + self.H = 32 + self.W = 32 + self.dshape = [self.N, self.C, self.H, self.W] + self.atol = 1e-3 + + def get_model(self, + main, + startup, + place, + layout, + seed, + sync_bn=False, + only_forward=False): + """Build program.""" + use_cudnn = False + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + data = fluid.layers.data(name='input', + shape=self.dshape, + dtype=self.dtype, + append_batch_size=False) + conv = fluid.layers.conv2d( + input=data, + num_filters=32, + filter_size=1, + param_attr=fluid.ParamAttr(name='conv2d_weight'), + bias_attr=False, + use_cudnn=use_cudnn) + bn = fluid.layers.batch_norm( + conv, + param_attr=fluid.ParamAttr(name='bn_scale'), + bias_attr=fluid.ParamAttr(name='bn_bias'), + moving_mean_name='bn_moving_mean', + moving_variance_name='bn_moving_variance', + data_layout=layout, + is_test=only_forward) + # if self.dtype == np.float16: + # bn = fluid.layers.cast(bn, 'float32') + sigmoid = fluid.layers.sigmoid(bn) + out = fluid.layers.reduce_sum(sigmoid) + # if not sync_bn: + # out = out / core.get_mlu_device_count() + if not only_forward: + sgd_opt = fluid.optimizer.SGD(learning_rate=0.0) + sgd_opt.backward(out) + return [out, conv, bn] + + +if __name__ == "__main__": + # print('sync_batch_norm_op_mlu.py __main__') + + runtime_main(TestSyncBatchNormOpTraining, "identity", 0) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py index b8c31578099e1..d7e53639490d2 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py @@ -515,51 +515,55 @@ def init_test_case(self): self.scale_by_1Dtensor = True -#TODO: comment this test for now until 
bilinear_interp_op added. -# class TestBilinearInterpOpAPI(unittest.TestCase): -# def test_case(self): -# x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") - -# dim = fluid.data(name="dim", shape=[1], dtype="int32") -# shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") -# actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") -# scale_tensor = fluid.data( -# name="scale_tensor", shape=[1], dtype="float32") - -# out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12]) -# out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim]) -# out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor) -# out4 = fluid.layers.resize_bilinear( -# x, out_shape=[4, 4], actual_shape=actual_size) -# out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor) - -# x_data = np.random.random((2, 3, 6, 6)).astype("float32") -# dim_data = np.array([12]).astype("int32") -# shape_data = np.array([12, 12]).astype("int32") -# actual_size_data = np.array([12, 12]).astype("int32") -# scale_data = np.array([2.0]).astype("float32") - -# if core.is_compiled_with_mlu(): -# place = paddle.device.MLUPlace(0) -# else: -# place = core.CPUPlace() -# exe = fluid.Executor(place) -# exe.run(fluid.default_startup_program()) -# results = exe.run(fluid.default_main_program(), -# feed={ -# "x": x_data, -# "dim": dim_data, -# "shape_tensor": shape_data, -# "actual_size": actual_size_data, -# "scale_tensor": scale_data -# }, -# fetch_list=[out1, out2, out3, out4, out5], -# return_numpy=True) - -# expect_res = bilinear_interp_np( -# x_data, out_h=12, out_w=12, align_corners=True) -# for res in results: -# self.assertTrue(np.allclose(res, expect_res)) +class TestBilinearInterpOpAPI(unittest.TestCase): + + def test_case(self): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") + scale_tensor = fluid.data(name="scale_tensor", + shape=[1], + dtype="float32") + + out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12]) + out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim]) + out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor) + out4 = fluid.layers.resize_bilinear(x, + out_shape=[4, 4], + actual_shape=actual_size) + out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor) + + x_data = np.random.random((2, 3, 6, 6)).astype("float32") + dim_data = np.array([12]).astype("int32") + shape_data = np.array([12, 12]).astype("int32") + actual_size_data = np.array([12, 12]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + if core.is_compiled_with_mlu(): + place = paddle.device.MLUPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5], + return_numpy=True) + + expect_res = bilinear_interp_np(x_data, + out_h=12, + out_w=12, + align_corners=True) + for res in results: + self.assertTrue(np.allclose(res, expect_res)) class TestBilinearInterpOpAPI_dy(unittest.TestCase): @@ -572,8 +576,6 @@ def test_case(self): place = core.CPUPlace() with fluid.dygraph.guard(place): input_data = np.random.random((2, 3, 6, 6)).astype("float32") - input_data = 
np.load('input.npy').astype("float32") - # print(input_data) input_x = paddle.to_tensor(input_data) expect_res = bilinear_interp_np(input_data, out_h=12, diff --git a/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh index 97f21798c1154..36fc85ba6da07 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh +++ b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh @@ -17,5 +17,4 @@ set -e # use default values # FIXME: random fails on Unknown command lines -c (or -m). -launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py -MLU_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op_mlu.py +MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch c_comm_init_op_mlu.py diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_pow_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_pow_op_mlu.py new file mode 100644 index 0000000000000..7e04aed19c692 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_pow_op_mlu.py @@ -0,0 +1,256 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle.fluid as fluid +import paddle + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest + +paddle.enable_static() +SEED = 2022 + + +def ComputeGrad(x, y, out, axis): + grad = 1 / out.size + shape_x = x.shape + shape_y = y.shape + shape_out = out.shape + reduce_axes_x = [] + reduce_axes_y = [] + + if shape_x != shape_out: + if len(shape_x) < len(shape_out): + src_axis = axis + else: + src_axis = 0 + + for ax in range(len(shape_out)): + if (ax < src_axis or ax >= src_axis + len(shape_x)) or ( + shape_out[ax] > 1 and shape_x[ax - src_axis] == 1): + reduce_axes_x.append(ax) + + if shape_y != shape_out: + if len(shape_y) < len(shape_out): + src_axis = axis + else: + src_axis = 0 + + for ax in range(len(shape_out)): + if (ax < src_axis or ax >= src_axis + len(shape_y)) or ( + shape_out[ax] > 1 and shape_y[ax - src_axis] == 1): + reduce_axes_y.append(ax) + + if len(reduce_axes_x) > 0: + for i in reduce_axes_x: + x = np.expand_dims(x, axis=i) + + if len(reduce_axes_y) > 0: + for i in reduce_axes_y: + y = np.expand_dims(y, axis=i) + + dx = y * np.power(x, y - 1) * grad + dy = np.log(x) * np.power(x, y) * grad + + if len(reduce_axes_x) > 0: + for i, element in enumerate(reduce_axes_x): + dx = np.add.reduce(dx, element - i) + + if len(reduce_axes_y) > 0: + for i, element in enumerate(reduce_axes_y): + dy = np.add.reduce(dy, element - i) + + return dx, dy + + +class TestElementwisePow(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_pow" + + self.init_dtype() + self.init_input_output() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis} 
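# Side note (not part of the patch): the user_defined_grads produced by ComputeGrad above
# are the analytic derivatives of out = x ** y, scaled by 1 / out.size to match the
# averaged-output gradient used by the numeric check. Inputs are drawn from uniform(1, 2)
# rather than randn so x stays positive: a negative base with a non-integer exponent
# gives NaN, and log(x) in the dy formula needs x > 0. A quick finite-difference check
# of the per-element formulas:
import numpy as np

x, y, eps = 1.5, 2.3, 1e-6
dx_analytic = y * x ** (y - 1)     # d(x**y)/dx
dy_analytic = np.log(x) * x ** y   # d(x**y)/dy
dx_numeric = ((x + eps) ** y - (x - eps) ** y) / (2 * eps)
dy_numeric = (x ** (y + eps) - x ** (y - eps)) / (2 * eps)
assert np.isclose(dx_analytic, dx_numeric, rtol=1e-5)
assert np.isclose(dy_analytic, dy_numeric, rtol=1e-5)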
+ self.outputs = {'Out': self.out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_axis(self): + self.axis = -1 + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.out = np.power(self.x, self.y) + + def test_check_grad_normal(self): + dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy]) + + def test_check_grad_ingore_x(self): + _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[dy]) + + def test_check_grad_ingore_y(self): + dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[dx]) + + +class TestElementwisePowFp16(TestElementwisePow): + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.out = np.power(self.x, self.y) + + def set_mlu(self): + self.__class__.use_mlu = True + # self.__class__.no_need_check_grad = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestElementwisePowOp_broadcast_0(TestElementwisePow): + + def init_axis(self): + self.axis = 1 + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [1, 11, 17]).astype(self.dtype) + self.out = np.power(self.x, self.y) + + def test_check_grad_normal(self): + dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy]) + + def test_check_grad_ingore_x(self): + _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[dy]) + + def test_check_grad_ingore_y(self): + dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[dx]) + + +class TestElementwisePowOp_broadcast_1(TestElementwisePow): + + def init_axis(self): + self.axis = 1 + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(1, 2, [2, 100, 1]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.power(self.x, self.y.reshape(1, 100, 1)) + + def test_check_grad_normal(self): + dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy]) + + def test_check_grad_ingore_x(self): + _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[dy]) + + def test_check_grad_ingore_y(self): + dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + 
user_defined_grads=[dx]) + + +class TestElementwisePowOp_broadcast_2(TestElementwisePow): + + def init_axis(self): + self.axis = 0 + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(0.1, 1, [100, 3, 1]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [100]).astype(self.dtype) + self.out = np.power(self.x, self.y.reshape(100, 1, 1)) + + def test_check_grad_normal(self): + dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy]) + + def test_check_grad_ingore_x(self): + _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[dy]) + + def test_check_grad_ingore_y(self): + dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[dx]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_exp_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_exp_op_mlu.py new file mode 100644 index 0000000000000..70c001c69cf9d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_exp_op_mlu.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestExp(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "exp" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(20, 5).astype(self.dtype) + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestExpFp16(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "exp" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(20, 5).astype(self.dtype) + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestExpNeg(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "exp" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.random([20, 5]).astype(self.dtype) + x -= 1 + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py index d7b1768d50970..cbc99c2fa6686 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py @@ -25,6 +25,8 @@ import paddle from paddle.fluid.framework import _test_eager_guard +paddle.enable_static() + # Situation 1: shape is a list(without tensor) class TestExpandV2OpRank1(OpTest): @@ -304,5 +306,4 @@ def test_expand_times_is_tensor(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_batch_size_like_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_batch_size_like_op_mlu.py new file mode 100644 index 0000000000000..1e8275df0b5a7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_batch_size_like_op_mlu.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys + +sys.path.append("..") +import paddle +import paddle.fluid.core as core +from paddle.static import program_guard, Program +import paddle.compat as cpt +import unittest +import numpy as np +from op_test import OpTest +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +def fill_constant_batch_size_like(input, + shape, + value, + data_type, + input_dim_idx=0, + output_dim_idx=0, + force_cpu=False): + return paddle.fluid.layers.fill_constant_batch_size_like( + input, shape, data_type, value, input_dim_idx, output_dim_idx, + force_cpu) + + +class TestFillConstantBatchSizeLike(OpTest): + + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = "fill_constant_batch_size_like" + self.init_shape() + self.init_value() + self.init_dtype() + self.init_force_cpu() + self.init_dim_idx() + + self.inputs = { + 'Input': np.random.random(self.input_shape).astype("float32") + } + self.attrs = { + 'shape': self.shape, + 'value': self.value, + 'str_value': self.str_value, + 'dtype': self.dtype, + 'force_cpu': self.force_cpu, + 'input_dim_idx': self.input_dim_idx, + 'output_dim_idx': self.output_dim_idx + } + self.outputs = { + 'Out': np.full(self.output_shape, self.output_value, + self.output_dtype) + } + + def init_shape(self): + self.input_shape = [4, 5] + self.shape = [123, 92] + self.output_shape = (4, 92) + + def init_value(self): + self.value = 3.8 + self.str_value = '' + self.output_value = 3.8 + + def init_dtype(self): + self.dtype = core.VarDesc.VarType.FP32 + self.output_dtype = np.float32 + + def init_force_cpu(self): + self.force_cpu = False + + def init_dim_idx(self): + self.input_dim_idx = 0 + self.output_dim_idx = 0 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantBatchSizeLike2(TestFillConstantBatchSizeLike): + + def init_shape(self): + # test shape + self.input_shape = [4, 5, 6, 7] + self.shape = [10, 123, 92] + self.output_shape = (4, 123, 92) + + +class TestFillConstantBatchSizeLike3(TestFillConstantBatchSizeLike): + + def init_value(self): + # use 'str_value' rather than 'value' + self.value = 3.8 + self.str_value = '4.5' + self.output_value = 4.5 + + +class TestFillConstantBatchSizeLike4(TestFillConstantBatchSizeLike): + + def init_value(self): + # str_value = 'inf' + self.value = 3.8 + self.str_value = 'inf' + self.output_value = float('inf') + + +class TestFillConstantBatchSizeLike5(TestFillConstantBatchSizeLike): + + def init_value(self): + # str_value = '-inf' + self.value = 3.8 + self.str_value = '-inf' + self.output_value = -float('inf') + + +class TestFillConstantBatchSizeLike6(TestFillConstantBatchSizeLike): + + def init_dtype(self): + self.dtype = core.VarDesc.VarType.FP16 + self.output_dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-2) + + +class TestFillConstantBatchSizeLike7(TestFillConstantBatchSizeLike): + + def init_dtype(self): + self.dtype = core.VarDesc.VarType.INT32 + self.output_dtype = np.int32 + + +class TestFillConstantBatchSizeLike8(TestFillConstantBatchSizeLike): + + def init_force_cpu(self): + self.force_cpu = True + + +class TestFillConstantBatchSizeLike9(TestFillConstantBatchSizeLike): + + def init_shape(self): + self.input_shape = [4, 5] + self.shape = [123, 92] + self.output_shape = (123, 4) + + def 
init_dim_idx(self): + self.input_dim_idx = 0 + self.output_dim_idx = 1 + + +class TestFillConstantBatchSizeLikeLodTensor(TestFillConstantBatchSizeLike): + # test LodTensor + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = "fill_constant_batch_size_like" + self.init_shape() + self.init_value() + self.init_dtype() + self.init_force_cpu() + self.init_dim_idx() + + lod = [[3, 2, 5]] + self.inputs = { + 'Input': (np.random.random(self.input_shape).astype("float32"), lod) + } + self.attrs = { + 'shape': self.shape, + 'value': self.value, + 'str_value': self.str_value, + 'dtype': self.dtype, + 'force_cpu': self.force_cpu, + 'input_dim_idx': self.input_dim_idx, + 'output_dim_idx': self.output_dim_idx + } + self.outputs = { + 'Out': np.full(self.output_shape, self.output_value, + self.output_dtype) + } + + def init_shape(self): + self.input_shape = [10, 20] + self.shape = [123, 92] + self.output_shape = (3, 92) + + +class TestFillConstantBatchSizeLikeLodTensor2( + TestFillConstantBatchSizeLikeLodTensor): + # test LodTensor with 'input_dim_idx' != 0 + def init_shape(self): + self.input_shape = [10, 20] + self.shape = [123, 92] + self.output_shape = (20, 92) + + def init_dim_idx(self): + self.input_dim_idx = 1 + self.output_dim_idx = 0 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py new file mode 100644 index 0000000000000..5050e2006f333 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py @@ -0,0 +1,195 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.nn.functional as F + +paddle.enable_static() +SEED = 2022 +np.random.seed(SEED) + + +def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): + return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype) + + +class TestMLUHardSigmoid(OpTest): + + def setUp(self): + paddle.enable_static() + + self.op_type = "hard_sigmoid" + self.set_mlu() + self.init_dtype() + self.set_attrs() + + x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype) + lower_threshold = -self.offset / self.slope + upper_threshold = (1. 
- self.offset) / self.slope + + # Same reason as TestAbs + delta = 0.005 + x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02 + x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02 + + out = ref_hardsigmoid(x, self.slope, self.offset) + + self.attrs = {'slope': self.slope, 'offset': self.offset} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def set_attrs(self): + self.slope = 0.166666666666667 + self.offset = 0.5 + + +class TestMLUHardSigmoid2(TestMLUHardSigmoid): + + def set_attrs(self): + self.slope = 0.2 + self.offset = 0.5 + + +class TestMLUHardSigmoid3(TestMLUHardSigmoid): + + def set_attrs(self): + self.slope = 0.2 + self.offset = 0.4 + + +class TestMLUHardSigmoidFp16(unittest.TestCase): + + def setUp(self): + paddle.disable_static() + + self.place = paddle.MLUPlace(0) + self.dtype = np.float32 + + # float32 + self.float32_x = np.random.uniform(-5, 5, [10, 12]).astype(np.float32) + paddle.set_device('cpu') + data = paddle.to_tensor(self.float32_x, stop_gradient=True) + self.float32_y = F.hardsigmoid(data) + + # float16 + self.float16_x = self.float32_x.astype(np.float16) + self.float16_y = ref_hardsigmoid(self.float16_x) + + def test_check_output_and_grad_mlu(self): + # mlu float16 + paddle.set_device('mlu') + data = paddle.to_tensor(self.float16_x, stop_gradient=True) + mlu_float16_y = F.hardsigmoid(data) + + cpu_diff_1 = np.divide( + np.sum(np.abs(self.float32_y.numpy() - self.float16_y)), + np.sum(np.abs(self.float32_y.numpy()))) + mlu_diff_1 = np.divide( + np.sum(np.abs(self.float32_y.numpy() - mlu_float16_y.numpy())), + np.sum(np.abs(self.float32_y.numpy()))) + + cpu_diff_2 = np.divide( + np.sum(np.square(self.float32_y.numpy() - self.float16_y)), + np.sum(np.square(self.float32_y.numpy()))) + mlu_diff_2 = np.divide( + np.sum(np.square(self.float32_y.numpy() - mlu_float16_y.numpy())), + np.sum(np.square(self.float32_y.numpy()))) + assert mlu_diff_1 <= cpu_diff_1 + assert mlu_diff_2 <= cpu_diff_2 + + +class TestHardsigmoidAPI(unittest.TestCase): + # test paddle.nn.Hardsigmoid, paddle.nn.functional.hardsigmoid + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float32) + self.place = paddle.MLUPlace(0) + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.hardsigmoid(x) + m = paddle.nn.Hardsigmoid() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_hardsigmoid(self.x_np) + for r in res: + self.assertTrue(np.allclose(out_ref, r)) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.hardsigmoid(x) + m = paddle.nn.Hardsigmoid() + out2 = m(x) + out_ref = ref_hardsigmoid(self.x_np) + for r in [out1, out2]: + self.assertTrue(np.allclose(out_ref, r.numpy())) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.hard_sigmoid(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = 
ref_hardsigmoid(self.x_np, 0.2, 0.5) + self.assertTrue(np.allclose(out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.hard_sigmoid(x) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.hardsigmoid, 1) + # The input dtype must be float16, float32, float64. + x_int32 = paddle.fluid.data(name='x_int32', + shape=[12, 10], + dtype='int32') + self.assertRaises(TypeError, F.hardsigmoid, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data(name='x_fp16', + shape=[12, 10], + dtype='float16') + F.hardsigmoid(x_fp16) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py new file mode 100644 index 0000000000000..1f12d47da42a2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py @@ -0,0 +1,166 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle.nn.functional as F +import paddle.fluid as fluid +import paddle +import sys + +sys.path.append("..") +from op_test import OpTest + +import numpy as np +import unittest + +paddle.enable_static() +SEED = 2021 +np.random.seed(SEED) + + +def scalarToType(val, data_type): + converted_val = np.array([val]).astype(data_type)[0] + print("converted_val type: ", type(converted_val)) + return converted_val + + +def ref_hard_swish_grad(x, threshold, scale, offset, data_type): + threshold = scalarToType(threshold, data_type) + scale = scalarToType(scale, data_type) + offset = scalarToType(offset, data_type) + dout = np.full_like(x, fill_value=1. 
/ x.size) + tmp = ((x + offset) < threshold).astype(x.dtype) + dx = dout * (((x + offset) > 0).astype(x.dtype) * + (2 * x + offset) * tmp / scale + 1.0 - tmp) + return dx + + +class TestHardSwishMLU(OpTest): + + def setUp(self): + paddle.enable_static() + + self.op_type = "hard_swish" + self.place = paddle.MLUPlace(0) + self.init_dtype() + + x = np.random.uniform(-2, 2, [10, 12]).astype(self.dtype) + threshold = 6.0 + scale = 6.0 + offset = 3.0 + + x[np.abs(x + offset) < 0.005] = 0.02 + x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02 + + out = ( + x * + (np.minimum(np.maximum(x + offset, 0.), threshold) / scale)).astype( + self.dtype) + self.x_grad = ref_hard_swish_grad(x, threshold, scale, offset, + self.dtype) + self.set_mlu() + self.inputs = {'X': x} + self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestHardSwishMLUWithCPUFloat16(unittest.TestCase): + + def setUp(self): + paddle.disable_static() + + self.place = paddle.MLUPlace(0) + self.dtype = np.float32 + + # float32 + self.float32_x = np.random.uniform(-6, 10, [8, 15]).astype(np.float32) + paddle.set_device('cpu') + data = paddle.to_tensor(self.float32_x, stop_gradient=False) + self.float32_y = F.hardswish(data) + self.float32_y.sum().backward() + self.float32_grad = data.grad + + # float16 + self.float16_x = self.float32_x.astype(np.float16) + threshold = 6.0 + scale = 6.0 + offset = 3.0 + + threshold = scalarToType(threshold, np.float16) + scale = scalarToType(scale, np.float16) + offset = scalarToType(offset, np.float16) + self.float16_y = (self.float16_x * (np.minimum( + np.maximum(self.float16_x + offset, scalarToType(0., np.float16)), + threshold) / scale)).astype(np.float16) + self.float16_grad = ref_hard_swish_grad(self.float16_x, threshold, + scale, offset, np.float16) + + def test_check_output_and_grad_mlu(self): + # mlu float16 + paddle.set_device('mlu') + data = paddle.to_tensor(self.float16_x, stop_gradient=False) + mlu_float16_y = F.hardswish(data) + mlu_float16_y.sum().backward() + mlu_float16_grad = data.grad + + cpu_diff_1 = np.divide( + np.sum(np.abs(self.float32_y.numpy() - self.float16_y)), + np.sum(np.abs(self.float32_y.numpy()))) + mlu_diff_1 = np.divide( + np.sum(np.abs(self.float32_y.numpy() - mlu_float16_y.numpy())), + np.sum(np.abs(self.float32_y.numpy()))) + + cpu_diff_2 = np.divide( + np.sum(np.square(self.float32_y.numpy() - self.float16_y)), + np.sum(np.square(self.float32_y.numpy()))) + mlu_diff_2 = np.divide( + np.sum(np.square(self.float32_y.numpy() - mlu_float16_y.numpy())), + np.sum(np.square(self.float32_y.numpy()))) + assert mlu_diff_1 <= cpu_diff_1 + assert mlu_diff_2 <= cpu_diff_2 + + cpu_diff_1 = np.divide( + np.sum(np.abs(self.float32_grad.numpy() - self.float16_grad)), + np.sum(np.abs(self.float32_grad.numpy()))) + mlu_diff_1 = np.divide( + np.sum(np.abs(self.float32_grad.numpy() - + mlu_float16_grad.numpy())), + np.sum(np.abs(self.float32_grad.numpy()))) + + cpu_diff_2 = np.divide( + np.sum(np.square(self.float32_grad.numpy() - self.float16_grad)), + np.sum(np.square(self.float32_grad.numpy()))) + mlu_diff_2 = np.divide( + np.sum( + np.square(self.float32_grad.numpy() - + mlu_float16_grad.numpy())), + 
np.sum(np.square(self.float32_grad.numpy()))) + assert mlu_diff_1 <= cpu_diff_1 + assert mlu_diff_2 <= cpu_diff_2 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_masked_select_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_masked_select_op_mlu.py new file mode 100644 index 0000000000000..7efed0ea4b0f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_masked_select_op_mlu.py @@ -0,0 +1,169 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys + +sys.path.append("..") +import numpy as np +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +import paddle + +paddle.enable_static() + + +def np_masked_select(shape, x, mask): + result = np.empty(shape=(0), dtype=x.dtype) + sum = 0 + for index, (ele, ma) in enumerate(zip(np.nditer(x), np.nditer(mask))): + if ma: + sum = sum + 1 + result = np.append(result, ele) + for index, (ele, ma) in enumerate(zip(np.nditer(x), np.nditer(mask))): + if index >= sum: + result = np.append(result, 0) + result = np.reshape(result, shape) + return result + + +class TestMaskedSelectOp(OpTest): + + def setUp(self): + self.init() + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.op_type = "masked_select" + self.python_api = paddle.masked_select + x = np.random.random(self.shape).astype('float32') + mask = np.array(np.random.randint(2, size=self.shape, dtype=bool)) + out = np_masked_select(self.shape, x, mask) + self.inputs = {'X': x, 'Mask': mask} + self.outputs = {'Y': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Y') + + def init(self): + self.shape = (50, 3) + + +class TestMaskedSelectOp1(TestMaskedSelectOp): + + def init(self): + self.shape = (6, 8, 9, 18) + + +class TestMaskedSelectOp2(TestMaskedSelectOp): + + def init(self): + self.shape = (168, ) + + +@skip_check_grad_ci(reason="get_numeric_gradient not support int32") +class TestMaskedSelectOpInt32(TestMaskedSelectOp): + + def init_dtype(self): + self.dtype = np.int32 + + def test_check_grad(self): + pass + + +class TestMaskedSelectOpFp16(TestMaskedSelectOp): + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_grad(self): + x_grad = self.inputs['Mask'].astype(self.dtype) + x_grad = x_grad * (1 / x_grad.size) + self.check_grad_with_place(self.place, ['X'], + 'Y', + user_defined_grads=[x_grad]) + + +class TestMaskedSelectAPI(unittest.TestCase): + + def test_imperative_mode(self): + paddle.disable_static() + shape = (88, 6, 8) + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + x = paddle.to_tensor(np_x) + mask = paddle.to_tensor(np_mask) + out = paddle.masked_select(x, mask) + np_out = np_masked_select(shape, np_x, np_mask) + 
self.assertEqual(np.allclose(out.numpy(), np_out), True) + paddle.enable_static() + + def test_static_mode(self): + shape = [8, 9, 6] + x = paddle.fluid.data(shape=shape, dtype='float32', name='x') + mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask') + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + + out = paddle.masked_select(x, mask) + np_out = np_masked_select(shape, np_x, np_mask) + + exe = paddle.static.Executor(place=paddle.device.MLUPlace(0)) + + res = exe.run(paddle.static.default_main_program(), + feed={ + "x": np_x, + "mask": np_mask + }, + fetch_list=[out]) + self.assertEqual(np.allclose(res, np_out), True) + + +class TestMaskedSelectError(unittest.TestCase): + + def test_error(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + shape = [8, 9, 6] + x = paddle.fluid.data(shape=shape, dtype='float32', name='x') + mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask') + mask_float = paddle.fluid.data(shape=shape, + dtype='float32', + name='mask_float') + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + + def test_x_type(): + paddle.masked_select(np_x, mask) + + self.assertRaises(TypeError, test_x_type) + + def test_mask_type(): + paddle.masked_select(x, np_mask) + + self.assertRaises(TypeError, test_mask_type) + + def test_mask_dtype(): + paddle.masked_select(x, mask_float) + + self.assertRaises(TypeError, test_mask_dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py index abe16155d0362..b8272e3bce9da 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py @@ -148,7 +148,7 @@ def test_momentum(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) @@ -271,7 +271,7 @@ def test_momentum_static(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( learning_rate=0.1, momentum=0.9) @@ -591,7 +591,7 @@ def _momentum_optimize_static(self, name='X', dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer.minimize(loss) exe.run(startup_program) if use_amp: diff --git a/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py index e9235e62a7989..59078a21d0fa8 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py @@ -274,6 +274,7 @@ def init_test_case(self): self.align_corners = True +# comment out since 5-D input not supported now # class TestNearestNeighborInterpCase1(TestNearestInterpOp): # def init_test_case(self): # self.interp_method 
= 'nearest' @@ -537,56 +538,66 @@ def init_test_case(self): self.scale_by_1Dtensor = True -#TODO: comment this test for now until nearest_interp_op added. -# class TestNearestAPI(unittest.TestCase): -# def test_case(self): -# x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") -# y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32") - -# dim = fluid.data(name="dim", shape=[1], dtype="int32") -# shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") -# actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") -# scale_tensor = fluid.data( -# name="scale_tensor", shape=[1], dtype="float32") - -# out1 = fluid.layers.resize_nearest( -# y, out_shape=[12, 12], data_format='NHWC', align_corners=False) -# out2 = fluid.layers.resize_nearest( -# x, out_shape=[12, dim], align_corners=False) -# out3 = fluid.layers.resize_nearest( -# x, out_shape=shape_tensor, align_corners=False) -# out4 = fluid.layers.resize_nearest( -# x, out_shape=[4, 4], actual_shape=actual_size, align_corners=False) -# out5 = fluid.layers.resize_nearest( -# x, scale=scale_tensor, align_corners=False) - -# x_data = np.random.random((2, 3, 6, 6)).astype("float32") -# dim_data = np.array([12]).astype("int32") -# shape_data = np.array([12, 12]).astype("int32") -# actual_size_data = np.array([12, 12]).astype("int32") -# scale_data = np.array([2.0]).astype("float32") - -# place = paddle.MLUPlace(0) -# exe = fluid.Executor(place) -# exe.run(fluid.default_startup_program()) -# results = exe.run(fluid.default_main_program(), -# feed={ -# "x": x_data, -# "y": np.transpose(x_data, (0, 2, 3, 1)), -# "dim": dim_data, -# "shape_tensor": shape_data, -# "actual_size": actual_size_data, -# "scale_tensor": scale_data -# }, -# fetch_list=[out1, out2, out3, out4, out5], -# return_numpy=True) - -# expect_res = nearest_neighbor_interp_np( -# x_data, out_h=12, out_w=12, align_corners=False) -# self.assertTrue( -# np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1)))) -# for i in range(len(results) - 1): -# self.assertTrue(np.allclose(results[i + 1], expect_res)) +class TestNearestAPI(unittest.TestCase): + + def test_case(self): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") + scale_tensor = fluid.data(name="scale_tensor", + shape=[1], + dtype="float32") + + out1 = fluid.layers.resize_nearest(y, + out_shape=[12, 12], + data_format='NHWC', + align_corners=False) + out2 = fluid.layers.resize_nearest(x, + out_shape=[12, dim], + align_corners=False) + out3 = fluid.layers.resize_nearest(x, + out_shape=shape_tensor, + align_corners=False) + out4 = fluid.layers.resize_nearest(x, + out_shape=[4, 4], + actual_shape=actual_size, + align_corners=False) + out5 = fluid.layers.resize_nearest(x, + scale=scale_tensor, + align_corners=False) + + x_data = np.random.random((2, 3, 6, 6)).astype("float32") + dim_data = np.array([12]).astype("int32") + shape_data = np.array([12, 12]).astype("int32") + actual_size_data = np.array([12, 12]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + place = paddle.MLUPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "y": np.transpose(x_data, (0, 2, 3, 1)), + "dim": 
dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5], + return_numpy=True) + + expect_res = nearest_neighbor_interp_np(x_data, + out_h=12, + out_w=12, + align_corners=False) + self.assertTrue( + np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1)))) + for i in range(len(results) - 1): + self.assertTrue(np.allclose(results[i + 1], expect_res)) class TestNearestInterpException(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reciprocal_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reciprocal_op_mlu.py new file mode 100644 index 0000000000000..1791b1dab28b8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reciprocal_op_mlu.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle + +paddle.enable_static() + + +class TestMLUReciprocal(OpTest): + + def setUp(self): + self.op_type = "reciprocal" + self.set_mlu() + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.reciprocal(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], + 'Out', + max_relative_error=0.01) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +class TestMLUReciprocalFp16(TestMLUReciprocal): + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py index ffb6fee30f5e7..a6bb42878a684 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py @@ -15,13 +15,13 @@ from __future__ import print_function import paddle.fluid as fluid import paddle +import sys + +sys.path.append("..") from op_test import OpTest import numpy as np import unittest -import sys - -sys.path.append("..") paddle.enable_static() SEED = 2021 diff --git a/python/paddle/fluid/tests/unittests/mlu/test_rnn_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_rnn_op_mlu.py index f1aabbd3b603b..917597daf3a1d 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_rnn_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_rnn_op_mlu.py @@ -135,43 +135,50 @@ def init_size(self): def test_output(self): self.check_output_with_place( - self.place, 
no_check_set=['Reserve', 'DropoutState', 'State']) + self.place, + atol=1e-4, + no_check_set=['Reserve', 'DropoutState', 'State']) def set_attrs(self): pass - # def test_grad(self): - # if not self.is_test: - # var_name_list = self.get_weight_names() - # grad_check_list = ['Input', 'init_h', 'init_c'] - # grad_check_list.extend(var_name_list) - # self.check_grad_with_place(self.place, set(grad_check_list), - # ['Out', 'last_hidden', 'last_cell']) + def test_grad(self): + if not self.is_test and self.sequence_length is None: + # if not self.is_test: + var_name_list = self.get_weight_names() + grad_check_list = ['Input', 'init_h', 'init_c'] + grad_check_list.extend(var_name_list) + self.check_grad_with_place(self.place, set(grad_check_list), + ['Out', 'last_hidden', 'last_cell']) -# class TestRNNOp1(TestRNNOp): +class TestRNNOp1(TestRNNOp): -# def set_attrs(self): -# self.sequence_length = None + def set_attrs(self): + self.sequence_length = None -# class TestRNNOp2(TestRNNOp): -# def set_attrs(self): -# self.sequence_length = None -# self.is_bidirec = True +class TestRNNOp2(TestRNNOp): -# class TestRNNOp3(TestRNNOp): + def set_attrs(self): + self.sequence_length = None + self.is_bidirec = True -# def set_attrs(self): -# self.is_test = True -# self.sequence_length = None -# class TestRNNOp4(TestRNNOp): +class TestRNNOp3(TestRNNOp): + + def set_attrs(self): + self.is_test = True + self.sequence_length = None + + +class TestRNNOp4(TestRNNOp): + + def set_attrs(self): + self.is_test = True + self.sequence_length = None + self.is_bidirec = True -# def set_attrs(self): -# self.is_test = True -# self.sequence_length = None -# self.is_bidirec = True #TODO(chenxiao): cnnl doesn't support num_layers > 1 case # class TestRNNOp5(TestRNNOp): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py index 0725a27e5125a..d901813e3482a 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py @@ -25,6 +25,8 @@ import paddle.fluid.core as core from paddle.fluid.dygraph.base import switch_to_static_graph +paddle.enable_static() + class TestScatterOp(OpTest): @@ -243,5 +245,4 @@ def executed_api(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py index e52b5ee301c5a..fc1d62bfdad5d 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py @@ -102,12 +102,13 @@ def test_get_default_nprocs(self): self.assertEqual(nprocs, core.get_mlu_device_count()) def test_spawn(self): - context = dist.spawn(train, backend='cncl', nprocs=4) + num_devs = core.get_mlu_device_count() + context = dist.spawn(train, backend='cncl', nprocs=num_devs) rank_list = [] - for i in range(4): + for i in range(num_devs): rank_list.append(context.return_queues[i].get()) rank_list.sort() - self.assertEqual(rank_list, list(range(4))) + self.assertEqual(rank_list, list(range(num_devs))) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py index eee7a4db55d77..6a81c11c70b1b 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py +++ 
b/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py @@ -24,6 +24,8 @@ import paddle from paddle import _C_ops +paddle.enable_static() + class TestL2LossOp(OpTest): """Test squared_l2_norm @@ -66,5 +68,4 @@ def test_main(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py new file mode 100644 index 0000000000000..3081ee9d38754 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py @@ -0,0 +1,506 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import numpy as np +import unittest +import time +import argparse +import os +import six +import sys + +sys.path.append("..") +import subprocess +import traceback +import functools +import pickle +from contextlib import closing +import paddle.fluid as fluid +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +from six import string_types +import paddle + +from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator + +from paddle.fluid.tests.unittests.test_sync_batch_norm_op import create_or_get_tensor + +_set_use_system_allocator(False) +paddle.enable_static() + +SEED = 10 + + +class TestSyncBatchNormRunnerBase(object): + + def get_model(self, + main, + startup, + place, + layout, + seed, + sync_bn=False, + only_forward=False): + raise NotImplementedError( + "get model should be implemented by child class.") + + def wait_server_ready(self, endpoints): + assert not isinstance(endpoints, string_types) + while True: + all_ok = True + not_ready_endpoints = [] + for ep in endpoints: + ip_port = ep.split(":") + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, + 1) + + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + not_ready_endpoints.append(ep) + if not all_ok: + sys.stderr.write("server not ready, wait 3 sec to retry...\n") + sys.stderr.write("not ready endpoints:" + + str(not_ready_endpoints) + "\n") + sys.stderr.flush() + time.sleep(3) + else: + break + + def initCommunicator(self, program, rank, nranks, wait_port, + current_endpoint, endpoints): + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + if rank == 0 and wait_port: + self.wait_server_ready(other_endpoints) + block = program.global_block() + cncl_id_var = block.create_var(name=nameGen.generate('cncl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op(type='c_gen_cncl_id', + inputs={}, + outputs={'Out': cncl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints + }) 
+ block.append_op(type='c_comm_init', + inputs={'X': cncl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': self.global_ring_id + }) + + def run_trainer(self, args): + device_id = int(os.getenv("FLAGS_selected_mlus", "0")) + place = fluid.MLUPlace(device_id) + places = [place] + + # Test training + for place in places: + for layout in ["NCHW", "NHWC"]: + self._compare(args, place, layout, False) + + # Test inference + for place in places: + for layout in ["NCHW", "NHWC"]: + self._compare(args, place, layout, True) + + # # Test FP16 - @TODO + # self.dtype = np.float16 + # self.atol = 1e-2 + + # # Test training + # for place in places: + # for layout in ["NCHW", "NHWC"]: + # self._compare(args, place, layout, False) + + # # Test inference + # for place in places: + # for layout in ["NCHW", "NHWC"]: + # self._compare(args, place, layout, True) + + sys.stdout.buffer.write( + pickle.dumps( + 'training, inference, fp32, fp16, NCHW, NHWC all passed')) + + def _compare(self, args, place, layout, only_forward): + scope = core.Scope() + + np.random.seed(SEED) + data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2 + sys.stderr.write("data: " + str(data) + "\n") + data = create_or_get_tensor(scope, "input", + OpTest.np_dtype_to_fluid_dtype(data), place) + + bn_fetches = self._cal_single_card(args, data, place, layout, + only_forward) + fetch_names, sync_bn_fetches = self._cal_multiple_cards( + args, data, place, layout, only_forward) + + sys.stderr.write("len(sync_bn_fetches): " + str(len(sync_bn_fetches)) + + "\n") + for i in six.moves.xrange(0, len(sync_bn_fetches)): + sys.stderr.write("i: " + str(i) + "\n") + sys.stderr.write("fetch_names[i]): " + fetch_names[i] + "\n") + + bn_val = bn_fetches[i] + sync_bn_val = sync_bn_fetches[i] + if sync_bn_val.shape != bn_val.shape: + sync_bn_val = sync_bn_val[:bn_val.shape[0]] + + # i = 0 + if fetch_names[i] == 'reduce_sum_0.tmp_0': + # sys.stderr.write("skip reduce_sum_0.tmp_0 (Out of reduce_sum op)" + "\n") + sys.stderr.write("reduce_sum_0.tmp_0 (Out of reduce_sum op)" + + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 1 + if fetch_names[i] == 'conv2d_0.tmp_0': + # sys.stderr.write("skip conv2d_0.tmp_0 (X)" + "\n") + sys.stderr.write("conv2d_0.tmp_0 (X)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 2 + if fetch_names[i] == 'batch_norm_0.tmp_3': + # sys.stderr.write("skip batch_norm_0.tmp_3 (Y)" + "\n") + sys.stderr.write("batch_norm_0.tmp_3 (Y)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 2 + if fetch_names[i] == 'batch_norm_0.tmp_2': + # sys.stderr.write("skip batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n") + sys.stderr.write( + "batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 3 + if fetch_names[i] == 'bn_moving_mean': + sys.stderr.write("skip bn_moving_mean (MeanOut)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + continue + + # i = 4 + if fetch_names[i] == 'bn_moving_variance': + sys.stderr.write("skip bn_moving_variance (VarianceOut)" + "\n") + sys.stderr.write("bn_val: " + 
str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + continue + + # i = 7 + if fetch_names[i] == 'batch_norm_0.tmp_0': + # sys.stderr.write("skip batch_norm_0.tmp_0 (SavedMean)" + "\n") + sys.stderr.write("batch_norm_0.tmp_0 (SavedMean)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 8 + if fetch_names[i] == 'batch_norm_0.tmp_1': + sys.stderr.write("skip batch_norm_0.tmp_1 (SavedVariance)" + + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + continue + + # i = 9 + if fetch_names[i] == 'bn_scale@GRAD': + # sys.stderr.write("skip bn_scale@GRAD (Scale@GRAD)" + "\n") + sys.stderr.write("bn_scale@GRAD (Scale@GRAD)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 10 + if fetch_names[i] == 'bn_bias@GRAD': + # sys.stderr.write("skip bn_bias@GRAD (Bias@GRAD)" + "\n") + sys.stderr.write("bn_bias@GRAD (Bias@GRAD)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 11 + if fetch_names[i] == 'batch_norm_0.tmp_3@GRAD': + # sys.stderr.write("skip batch_norm_0.tmp_3@GRAD (Y@GRAD)" + "\n") + sys.stderr.write("batch_norm_0.tmp_3@GRAD (Y@GRAD)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 12 + if fetch_names[i] == 'conv2d_0.tmp_0@GRAD': + # sys.stderr.write("skip conv2d_0.tmp_0@GRAD (X@GRAD)" + "\n") + sys.stderr.write("conv2d_0.tmp_0@GRAD (X@GRAD)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + atol = self.atol + if fetch_names[i] == 'conv2d_0.tmp_0@GRAD': + atol = 1e-2 + + assert np.allclose( + bn_val, sync_bn_val, atol=atol), "Output (" + fetch_names[ + i] + ") has diff. 
\n" + "\nBN " + str( + bn_val) + "\n" + "Sync BN " + str(sync_bn_val) + + def _cal_single_card(self, args, data, place, layout, only_forward): + # Single-MLU, N = 32 per MLU + train_prog = fluid.Program() + startup_prog = fluid.Program() + train_prog.global_seed(SEED) + startup_prog.global_seed(SEED) + paddle.seed(SEED) + + outs = self.get_model(train_prog, startup_prog, place, layout, SEED, + False, only_forward) + + exe = fluid.Executor(place) + exe.run(startup_prog) + fetch_names = [v.name for v in outs] + [ + 'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias' + ] + if not only_forward: + others = [ + 'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD', + 'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD' + ] + fetch_names += others + bn_fetches = exe.run(program=train_prog, + feed={'input': data}, + fetch_list=fetch_names) + + return bn_fetches + + def _cal_multiple_cards(self, args, data, place, layout, only_forward): + # Multi-MLUs, self.N per MLU + assert core.get_mlu_device_count() > 1 + + train_prog = fluid.Program() + startup_prog = fluid.Program() + train_prog.global_seed(SEED) + startup_prog.global_seed(SEED) + paddle.seed(SEED) + sys.stderr.write("train_prog: " + train_prog.to_string(True) + "\n") + sys.stderr.write("startup_prog: " + startup_prog.to_string(True) + "\n") + + endpoints = args["endpoints"].split(",") + rank = args["trainerid"] + current_endpoint = args["currentendpoint"] + nranks = 2 + + self.initCommunicator(startup_prog, rank, nranks, True, + current_endpoint, endpoints) + sys.stderr.write("after init, startup_prog: " + + startup_prog.to_string(True) + "\n") + train_prog.global_seed(SEED) + train_prog._sync_with_cpp() + startup_prog.global_seed(SEED) + startup_prog._sync_with_cpp() + paddle.seed(SEED) + + self.rank = rank + outs = self.get_model(train_prog, startup_prog, place, layout, SEED, + True, only_forward) + sys.stderr.write("after get_model, train_prog: " + + train_prog.to_string(True) + "\n") + sys.stderr.write("after get_model, startup_prog: " + + startup_prog.to_string(True) + "\n") + + ops = train_prog.blocks[0].ops + for i, op in enumerate(ops): + if op.type == 'batch_norm': + sys.stderr.write("i: " + str(i) + "\n") + sys.stderr.write("op type: " + op.type + "\n") + op.desc.set_type('sync_batch_norm') + if op.type == 'batch_norm_grad': + sys.stderr.write("i: " + str(i) + "\n") + sys.stderr.write("op type: " + op.type + "\n") + op.desc.set_type('sync_batch_norm_grad') + + sys.stderr.write("after update sync_batch_norm, train_prog: " + + train_prog.to_string(True) + "\n") + + exe = fluid.Executor(place) + exe.run(startup_prog) + fetch_names = [v.name for v in outs] + [ + 'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias' + ] + if not only_forward: + others = [ + 'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD', + 'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD' + ] + fetch_names += others + sync_bn_fetches = exe.run(program=train_prog, + feed={'input': data}, + fetch_list=fetch_names) + + return fetch_names, sync_bn_fetches + + +def runtime_main(test_class, col_type, sub_type): + args = {} + model = test_class() + args["deviceid"] = os.getenv("FLAGS_selected_mlus") + args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID")) + args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM")) + args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS') + args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT") + args["col_type"] = col_type + model.run_trainer(args) + + +import 
paddle.compat as cpt +import socket +from contextlib import closing + + +class TestDistBase(unittest.TestCase): + + def setUp(self): + self._port_set = set() + self._trainers = 2 + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._python_interp = sys.executable + + def _find_free_port(self): + + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port + + def _run_cluster(self, model_file, envs): + worker_endpoints = self._ps_endpoints.split(",") + w0_ep, w1_ep = worker_endpoints + # print("w0_ep:", w0_ep, " w1_ep:", w1_ep) + env0 = { + "FLAGS_selected_mlus": "0", + "PADDLE_TRAINER_ID": "0", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w0_ep, + } + + env1 = { + "FLAGS_selected_mlus": "1", + "PADDLE_TRAINER_ID": "1", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w1_ep, + } + #update environment + env0.update(envs) + env1.update(envs) + + tr_cmd = "%s %s" + tr0_cmd = tr_cmd % (self._python_interp, model_file) + tr1_cmd = tr_cmd % (self._python_interp, model_file) + tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w") + tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w") + print("tr0_cmd: {}, env: {}\n".format(tr0_cmd, env0)) + print("tr1_cmd: {}, env: {}\n".format(tr1_cmd, env1)) + tr0_proc = subprocess.Popen(tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + + tr1_proc = subprocess.Popen(tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_out, tr0_err = tr0_proc.communicate() + tr1_out, tr1_err = tr1_proc.communicate() + + sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + with open("/tmp/tr0_err_%d.log" % os.getpid(), "r") as f: + sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) + with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f: + sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) + return pickle.loads(tr0_out), pickle.loads( + tr1_out), tr0_proc.pid, tr1_proc.pid + + def check_with_place(self, + model_file, + col_type, + check_error_log=False, + need_envs={}): + required_envs = { + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_eager_delete_tensor_gb": "0.0", + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "LD_PRELOAD": os.getenv("LD_PRELOAD", ""), + "FLAGS_call_stack_level": "2", + "GLOG_v": "3", + "PADDLE_WITH_GLOO": '0', + "BACKEND": "cncl" + } + required_envs.update(need_envs) + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + required_envs["GLOO_LOG_LEVEL"] = "TRACE" + tr0_out, tr1_out, pid0, pid1 = self._run_cluster( + model_file, required_envs) + self.assertEqual( + tr0_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed') + self.assertEqual( + tr1_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed') diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py new file mode 100644 index 0000000000000..ac3f686cb8fe2 
--- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle +import os +import sys + +sys.path.append("..") +from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator + +from test_sync_batch_norm_base_mlu import TestDistBase + +_set_use_system_allocator(False) +paddle.enable_static() + + +class TestSyncBatchNormOp(TestDistBase): + + def _setup_config(self): + pass + + def test_identity(self, col_type="identity"): + self.check_with_place("sync_batch_norm_op_mlu.py", + col_type, + check_error_log=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py new file mode 100644 index 0000000000000..955d9a122a292 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +test for sync bachnorm op. +for both FP32 and FP16 input. 
+""" + +from __future__ import print_function + +import unittest +import numpy as np +import os +import sys +import six +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.nn as nn +from paddle.fluid import Program, program_guard + +from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator +from paddle.fluid.tests.unittests.test_dist_base import TestDistBase + +paddle.enable_static() + + +class TestDygraphSyncBatchNormAPIError(unittest.TestCase): + + def test_errors(self): + if not core.is_compiled_with_mlu(): + return + + with program_guard(Program(), Program()): + my_sync_batch_norm = paddle.nn.SyncBatchNorm(10) + x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]), + [[1, 1, 1, 1]], fluid.MLUPlace(0)) + self.assertRaises(TypeError, my_sync_batch_norm, x1) + + # the input dtype of SyncBatchNorm must be float16 or float32 + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") + self.assertRaises(TypeError, my_sync_batch_norm, x2) + + +class TestConvertSyncBatchNorm(unittest.TestCase): + + def test_convert(self): + if not core.is_compiled_with_mlu(): + return + + with program_guard(Program(), Program()): + compare_model = paddle.nn.Sequential(paddle.nn.Conv2D(3, 5, 3), + paddle.nn.BatchNorm2D(5), + paddle.nn.BatchNorm2D(5)) + model = paddle.nn.Sequential( + paddle.nn.Conv2D(3, 5, 3), paddle.nn.BatchNorm2D(5), + paddle.nn.BatchNorm2D( + 5, + weight_attr=fluid.ParamAttr(name='bn.scale'), + bias_attr=fluid.ParamAttr(name='bn.bias'))) + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(compare_model.sublayers()): + if isinstance(sublayer, paddle.nn.BatchNorm2D): + self.assertEqual( + isinstance(model[idx], paddle.nn.SyncBatchNorm), True) + + +class TestConvertSyncBatchNormCast1(unittest.TestCase): + + def test_convert(self): + if not core.is_compiled_with_mlu(): + return + + class Net(nn.Layer): + + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2D(3, 5, 3) + self.bn = [] + bn = self.add_sublayer('bn', nn.BatchNorm2D(5)) + self.bn.append(bn) + + def forward(self, x): + x = self.conv1(x) + for bn in self.bn: + x = bn(x) + return x + + model = nn.Sequential() + model.add_sublayer('net1', Net()) + model.add_sublayer('net2', Net()) + compare_model = nn.Sequential() + compare_model.add_sublayer('net1', Net()) + compare_model.add_sublayer('net2', Net()) + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + self.assertEqual(len(compare_model.sublayers()), len(model.sublayers())) + + +class TestConvertSyncBatchNormCase2(unittest.TestCase): + + def test_convert(self): + if not core.is_compiled_with_mlu(): + return + + with fluid.dygraph.guard(fluid.MLUPlace(0)): + + class SyBNNet(paddle.nn.Layer): + + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(SyBNNet, self).__init__() + self.bn_s1 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D( + out_ch, + weight_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(0.)))) + self.bn_s2 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D(out_ch, data_format='NDHWC')) + + def forward(self, x): + x = self.bn_s1(x) + out = paddle.sum(paddle.abs(self.bn_s2(x))) + return out + + class BNNet(paddle.nn.Layer): + + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(BNNet, self).__init__() + self.bn_s1 = paddle.nn.BatchNorm3D( + out_ch, + weight_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(0.))) + self.bn_s2 = 
paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D(out_ch, data_format='NDHWC')) + + def forward(self, x): + x = self.bn_s1(x) + out = paddle.sum(paddle.abs(self.bn_s2(x))) + return out + + bn_model = BNNet() + sybn_model = SyBNNet() + np.random.seed(10) + data = np.random.random([3, 3, 3, 3, 3]).astype('float32') + x = paddle.to_tensor(data) + bn_out = bn_model(x) + sybn_out = sybn_model(x) + self.assertTrue( + np.allclose(bn_out.numpy(), sybn_out.numpy()), + "Output has diff. \n" + "\nBN " + str(bn_out.numpy()) + + "\n" + "Sync BN " + str(sybn_out.numpy())) + + +class TestDygraphSyncBatchNormDataFormatError(unittest.TestCase): + + def test_errors(self): + if not core.is_compiled_with_mlu(): + return + + with fluid.dygraph.guard(fluid.MLUPlace(0)): + my_sync_batch_norm = paddle.nn.SyncBatchNorm(10, data_format='CN') + data = np.random.random([3, 3, 3]).astype('float32') + x = paddle.to_tensor(data) + self.assertRaises(ValueError, my_sync_batch_norm, x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py index 1a2f5dbd40eb6..7c1e227ba2c07 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py @@ -24,6 +24,8 @@ import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard +paddle.enable_static() + #Situation 1: repeat_times is a list (without tensor) class TestTileOpRank1(OpTest): @@ -277,5 +279,4 @@ def test_api(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py index 38d5e6e94c066..3f1d553f7386e 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py @@ -29,6 +29,8 @@ from paddle.fluid.backward import append_backward from paddle.fluid.framework import _test_eager_guard +paddle.enable_static() + class TestWhereOp(OpTest): @@ -107,7 +109,7 @@ def test_api(self, use_mlu=False): x.stop_gradient = x_stop_gradient y.stop_gradient = y_stop_gradient result = paddle.where(cond, x, y) - append_backward(layers.mean(result)) + append_backward(paddle.mean(result)) for use_mlu in [False, True]: place = (paddle.device.MLUPlace(0) if use_mlu else fluid.CPUPlace()) @@ -396,5 +398,4 @@ def test_value_error(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py index 6c2e24bb16382..93b1f06598de9 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py @@ -117,7 +117,7 @@ def test_momentum(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) @@ -243,7 +243,7 @@ def test_momentum_static(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = 
fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( learning_rate=0.1, momentum=0.9) diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py index 9d734eac48be0..ada6e0f5f5384 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py @@ -86,7 +86,7 @@ def _test(self, run_npu=True): prob = fluid.layers.softmax(prediction, axis=1) cost = fluid.layers.cross_entropy(input=prob, label=label) - loss = fluid.layers.mean(cost) + loss = paddle.mean(cost) sgd = fluid.optimizer.SGD(learning_rate=0.01) sgd.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py index c90bf0cb49398..2c3b07c5bb2b9 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py @@ -105,7 +105,7 @@ def test_api(self): y.stop_gradient = y_stop_gradient result = paddle.where(cond, x, y) - append_backward(fluid.layers.mean(result)) + append_backward(paddle.mean(result)) exe = fluid.Executor(self.place) exe.run(startup) diff --git a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py index 22918347a2de3..b3d7f41ec9d4b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py @@ -90,7 +90,7 @@ def simple_net(self): layers.array_write(result2, i=j, array=mem_array) layers.less_than(x=j, y=array_len2, cond=cond2) sum_result = layers.array_read(array=mem_array, i=j) - loss = layers.mean(sum_result) + loss = paddle.mean(sum_result) return loss, sum_result def test_simple_net(self): diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py index 93ca1fa5b56a0..4a6905ca66b89 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -112,7 +112,7 @@ def forward(self, inputs, label): x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) cost = self._fc(x) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py index 6ee04dd342b81..460e6110aba73 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py @@ -335,7 +335,7 @@ def run_one_loop(self, model, opt, data): out = model(img) softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index a8e099137a349..c35e6d37b43e0 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -94,7 +94,7 @@ def run_one_loop(self, model, opt, data): 
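Note on the TestConvertSyncBatchNorm* cases above: they all go through paddle.nn.SyncBatchNorm.convert_sync_batchnorm, which walks the layer tree and swaps every BatchNorm*D sublayer for a SyncBatchNorm while leaving other layers untouched. A condensed sketch of that behaviour outside the test harness:

import paddle

model = paddle.nn.Sequential(paddle.nn.Conv2D(3, 5, 3), paddle.nn.BatchNorm2D(5))
model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
# the Conv2D is kept as-is, the BatchNorm2D sublayer is now a SyncBatchNorm
print([type(layer).__name__ for layer in model.sublayers()])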
out = model(img) - out = fluid.layers.mean(out) + out = paddle.mean(out) return out diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py index 90238f56eea24..928d5e4b83f3d 100644 --- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py @@ -104,7 +104,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): predict = cnn_model(images) with fluid.device_guard("gpu:1"): cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator with fluid.device_guard("gpu:1"): diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py index 3ec8dfb44850e..e0323092164ca 100644 --- a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py @@ -104,7 +104,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): predict = cnn_model(images) with fluid.device_guard("gpu:1"): cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator with fluid.device_guard("gpu:1"): diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py index cfc5a4904ac3e..61d7208617dee 100644 --- a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py @@ -98,7 +98,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): predict = cnn_model(images) with fluid.device_guard("gpu:0"): cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator with fluid.device_guard("gpu:0"): diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py index b014a079b80e3..68af1bdcc9385 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_net.py +++ b/python/paddle/fluid/tests/unittests/seresnext_net.py @@ -17,6 +17,7 @@ fluid.core._set_eager_deletion_mode(-1, -1, False) +import paddle import paddle.fluid.layers.ops as ops from paddle.fluid.layers.learning_rate_scheduler import cosine_decay from simple_nets import init_data @@ -172,7 +173,7 @@ def SE_ResNeXt50Small(use_feed): # Classifier layer: prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py index b9e38d21da831..9326d51591576 100644 --- a/python/paddle/fluid/tests/unittests/simple_nets.py +++ b/python/paddle/fluid/tests/unittests/simple_nets.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
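Note on the test_tile_op_mlu.py and test_where_op_mlu.py hunks above: moving paddle.enable_static() from the __main__ guard to module scope keeps those files in static mode even when they are imported by another test module or collected by a discovery-based runner, not only when executed directly as a script. The pattern in outline (class and test names below are placeholders):

import unittest

import paddle

paddle.enable_static()  # takes effect at import time, for every test in the module


class TestStaticModeIsOn(unittest.TestCase):  # placeholder for the OpTest subclasses

    def test_mode(self):
        self.assertFalse(paddle.in_dynamic_mode())


if __name__ == "__main__":
    unittest.main()  # the __main__ guard no longer needs to call enable_static()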
+import paddle import paddle.fluid as fluid import numpy as np @@ -27,7 +28,7 @@ def simple_fc_net_with_inputs(img, label, class_num=10): value=1.0))) prediction = fluid.layers.fc(hidden, size=class_num, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -51,7 +52,7 @@ def batchnorm_fc_with_inputs(img, label, class_num=10): prediction = fluid.layers.fc(hidden, size=class_num, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -87,7 +88,7 @@ def bow_net(use_feed, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py index 5d96dc38a7103..b6c2a47ac38ae 100644 --- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py +++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py @@ -136,7 +136,7 @@ def test_adadelta(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) rms_optimizer = paddle.optimizer.Adadelta(learning_rate=0.1) rms_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 61597562a4ab0..428d0e7c21026 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -932,7 +932,7 @@ def test_adam_flatten_param_grads_with_regularizer(self): act=None, param_attr=weight_attr) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) adam = fluid.optimizer.AdamOptimizer(0.01, flatten_param_grads=True, @@ -1149,7 +1149,7 @@ def _adam_optimize_static(self, name='X', dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer.minimize(loss) exe.run(startup_program) if use_amp: diff --git a/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py b/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py index cc57293a7fa04..1f08eb085a3c5 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py +++ b/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py @@ -33,7 +33,7 @@ def main_test_func(place, dtype): y = fluid.data(name='y', shape=[None, 1], dtype=dtype) y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) adam_optimizer = fluid.optimizer.AdamOptimizer(0.01) adam_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 2ece3d2d8ddf0..e39638d86555e 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -584,7 +584,7 @@ def test_adamw_op(self): 
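Note: the recurring replacement of fluid.layers.mean with paddle.mean in the hunks above and below is a mechanical migration to the 2.x API; without an axis argument both reduce over all elements, so the loss definitions are unchanged, and paddle.mean additionally accepts axis and keepdim. In static graphs the new call is lowered to the reduce_mean operator rather than the old mean operator, which is why the expected op list in test_backward.py (further below) changes from u'mean' to u'reduce_mean'. A quick check of both points:

import numpy as np
import paddle

# dygraph: same scalar as the old wrapper; axis/keepdim are new capabilities
x = paddle.to_tensor(np.arange(6, dtype="float32").reshape(2, 3))
print(paddle.mean(x))
print(paddle.mean(x, axis=1))

# static graph: paddle.mean appends a 'reduce_mean' op to the program
paddle.enable_static()
main = paddle.static.Program()
with paddle.static.program_guard(main):
    data = paddle.static.data(name="x", shape=[4, 5], dtype="float32")
    loss = paddle.mean(data)
print([op.type for op in main.global_block().ops])  # ['reduce_mean'], not ['mean']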
fc2_b_mon2 = np.zeros((linear2.bias.shape)).astype("float32") cost = fluid.layers.square_error_cost(input=out, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) simple_lr_fun = partial(simple_lr_setting, decay_rate=0.8, diff --git a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py index 8ed220daf035a..a8d630278f735 100644 --- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py +++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py @@ -44,15 +44,15 @@ def _test_read_write(x): i = layers.increment(x=i) a2 = layers.array_read(array=arr, i=i) - mean_a0 = layers.mean(a0) - mean_a1 = layers.mean(a1) - mean_a2 = layers.mean(a2) + mean_a0 = paddle.mean(a0) + mean_a1 = paddle.mean(a1) + mean_a2 = paddle.mean(a2) a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2]) - mean_x0 = layers.mean(x[0]) - mean_x1 = layers.mean(x[1]) - mean_x2 = layers.mean(x[2]) + mean_x0 = paddle.mean(x[0]) + mean_x1 = paddle.mean(x[1]) + mean_x2 = paddle.mean(x[2]) x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2]) diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index 31afb85750e8c..147633a62afd1 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -82,7 +82,7 @@ def test_assign_LoDTensorArray(self): init_array = fluid.layers.array_write(x=z, i=i) array = fluid.layers.assign(init_array) sums = fluid.layers.array_read(array=init_array, i=i) - mean = fluid.layers.mean(sums) + mean = paddle.mean(sums) append_backward(mean) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) @@ -128,7 +128,7 @@ def test_assign_LoDTensorArray(self): init_array = fluid.layers.array_write(x=z, i=i) array = paddle.assign(init_array) sums = fluid.layers.array_read(array=init_array, i=i) - mean = fluid.layers.mean(sums) + mean = paddle.mean(sums) append_backward(mean) place = fluid.CUDAPlace( diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 9dee8088ecd96..497457197dcc5 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -54,7 +54,7 @@ def convolutional_neural_network(use_py_reader): prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) acc = fluid.layers.accuracy(input=prediction, label=label) i = fluid.layers.zeros(shape=[1], dtype='int64') array = fluid.layers.array_write(x=prediction, i=i) diff --git a/python/paddle/fluid/tests/unittests/test_backward.py b/python/paddle/fluid/tests/unittests/test_backward.py index a6c9caacc7806..b64c9e98654e1 100644 --- a/python/paddle/fluid/tests/unittests/test_backward.py +++ b/python/paddle/fluid/tests/unittests/test_backward.py @@ -180,7 +180,7 @@ def __init__(self): u'softmax', # fc u'elementwise_sub', u'square', - u'mean' + u'reduce_mean' ] # loss self.shape = [16, 50] @@ -235,7 +235,7 @@ def build_model(self): name='fc_no_use') # loss cost = fluid.layers.square_error_cost(input=predict, label=label) - loss = fluid.layers.mean(cost, name='mean_loss') + loss = paddle.mean(cost, name='mean_loss') return loss @@ -308,7 +308,7 
@@ def build_net(self): x_emb = fluid.embedding(x, size=[100, 256]) y_predict = fluid.layers.fc(input=x_emb, size=1, name='my_fc') loss = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) param_names = [ param.name for param in fluid.default_main_program().block(0).all_parameters() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index cfd5d5f7c9bd0..7aa3b8cddf80c 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -82,50 +82,58 @@ def error3d(): self.assertRaises(ValueError, error2d_dataformat) self.assertRaises(ValueError, error3d_dataformat) - def test_eager_api(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - shape = [4, 10, 4, 4] + def test_large_batch(self): - def compute_v1(x): - with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm(shape[1]) - #bn = paddle.nn.BatchNorm2D(shape[1]) + def compute_baseline(x): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm(shape[1]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = bn(x1) + y.backward() + return y.numpy(), x1.gradient() + + def compute_1d(x): + with fluid.dygraph.guard(p): + with _test_eager_guard(): + bn = paddle.nn.BatchNorm1D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) y.backward() return y.numpy(), x1.gradient() - def compute_v2(x): - with fluid.dygraph.guard(p): - with _test_eager_guard(): - print("v2") - bn = paddle.nn.BatchNorm2D(shape[1]) - x1 = paddle.to_tensor(x) - x1.stop_gradient = False - y = bn(x1) - y.backward() - return y.numpy(), x1.gradient() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + # [N, C] + shape = [200000, 4] + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_baseline(x) + y2, g2 = compute_1d(x) + self.assertTrue(np.allclose(g1, g2)) + self.assertTrue(np.allclose(y1, y2)) + # [N, C, L] + shape = [1000000, 4, 4] x = np.random.randn(*shape).astype("float32") - y1, g1 = compute_v1(x) - y2, g2 = compute_v2(x) + y1, g1 = compute_baseline(x) + y2, g2 = compute_1d(x) self.assertTrue(np.allclose(g1, g2)) self.assertTrue(np.allclose(y1, y2)) - def test_eager_api_1d(self): + def test_eager_api(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: - shape = [200000, 4] + shape = [4, 10, 4, 4] def compute_v1(x): with fluid.dygraph.guard(p): bn = fluid.dygraph.BatchNorm(shape[1]) + #bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) @@ -135,7 +143,8 @@ def compute_v1(x): def compute_v2(x): with fluid.dygraph.guard(p): with _test_eager_guard(): - bn = paddle.nn.BatchNorm1D(shape[1]) + print("v2") + bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index 53c578fc6c1e8..92eb35896255d 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -31,7 +31,7 @@ def test_calc_gradient(self): x = layers.create_parameter(dtype="float32", shape=[5, 10]) y = 
layers.create_parameter(dtype="float32", shape=[10, 8]) mul_out = layers.mul(x=x, y=y) - mean_out = layers.mean(mul_out) + mean_out = paddle.mean(mul_out) a = calc_gradient(mean_out, mul_out) b = calc_gradient(mean_out, x) place = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_case.py b/python/paddle/fluid/tests/unittests/test_case.py index ed633c758b540..79bb1e0bffd44 100644 --- a/python/paddle/fluid/tests/unittests/test_case.py +++ b/python/paddle/fluid/tests/unittests/test_case.py @@ -17,6 +17,7 @@ import numpy as np import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers @@ -266,12 +267,12 @@ def test_optimizer_in_case(self): def fn_1(): sum = layers.elementwise_mul(x, y) - loss = layers.mean(sum, name="f_1_loss") + loss = paddle.mean(sum, name="f_1_loss") adam.minimize(loss) def fn_2(): sum = layers.elementwise_mul(x, y) - loss = layers.mean(sum, name="f_2_loss") + loss = paddle.mean(sum, name="f_2_loss") adagrad.minimize(loss) layers.case(pred_fn_pairs=[(switch_id == one, fn_1)], default=fn_2) diff --git a/python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py new file mode 100644 index 0000000000000..e848404850d9e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestCollectiveAllToAllSingle(TestMultipleGpus): + + def test_collective_alltoall_single(self): + self.run_mnist_2gpu('collective_alltoall_single.py', eager_mode=True) + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py b/python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py new file mode 100644 index 0000000000000..a93c417b99c65 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestCollectiveBatchIsendIrecv(TestMultipleGpus): + + def test_collective_batch_isend_irecv(self): + self.run_mnist_2gpu('collective_batch_isend_irecv.py', eager_mode=True) + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py new file mode 100644 index 0000000000000..93d181243b1fa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestCollectiveReduceScatter(TestMultipleGpus): + + def test_collective_reduce_scatter(self): + self.run_mnist_2gpu('collective_reduce_scatter.py', eager_mode=True) + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_communicator_async.py b/python/paddle/fluid/tests/unittests/test_communicator_async.py index f6fd89dc37dae..a6ec339c9b9c3 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_async.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py @@ -36,7 +36,7 @@ def net(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=x, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost def test_communicator_async(self): diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index c3f2566d6f7f4..f7593f8bb31fe 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -53,7 +53,7 @@ def net(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost, x, x1, y def fake_reader(self): diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py index c4a7edc21f92b..4c06e80547f1f 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py @@ -38,7 +38,7 @@ def net(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = 
paddle.mean(cost) return avg_cost, x, y def fake_reader(self): diff --git a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py index 5726372e40f97..0846eb4dbdb1e 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py @@ -59,7 +59,7 @@ def test_communicator_ps_gpu(self): slots_vars = [x, y] cost = fluid.layers.square_error_cost(input=x, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.Adam(0.01) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py index f13cfd885765a..1380866652f2e 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py @@ -34,7 +34,7 @@ def net(self): x = fluid.layers.data(name='x', shape=[1], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=x, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost def test_communicator_sync(self): diff --git a/python/paddle/fluid/tests/unittests/test_compiled_program.py b/python/paddle/fluid/tests/unittests/test_compiled_program.py index e16ac4881c761..fab70b2c6ada4 100644 --- a/python/paddle/fluid/tests/unittests/test_compiled_program.py +++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py @@ -105,7 +105,7 @@ def build_simple_model(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') prediction = fluid.layers.fc(input=img, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) def compile_program_not_compiled(self): with fluid.program_guard(fluid.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py index 1faef17a2ade3..49ad644b0ab75 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_op.py +++ b/python/paddle/fluid/tests/unittests/test_complex_op.py @@ -58,6 +58,7 @@ def init_spec(self): def setUp(self): self.op_type = "complex" + self.python_api = paddle.complex self.init_spec() x = np.random.randn(*self.x_shape).astype(self.dtype) y = np.random.randn(*self.y_shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_complex_view_op.py b/python/paddle/fluid/tests/unittests/test_complex_view_op.py index 6b224209edcc5..a2fd77bcabf9e 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_view_op.py +++ b/python/paddle/fluid/tests/unittests/test_complex_view_op.py @@ -67,6 +67,7 @@ def setUp(self): out_ref = ref_view_as_real(x) self.inputs = {'X': x} self.outputs = {'Out': out_ref} + self.python_api = paddle.as_real self.out_grad = np.ones([10, 10, 2], dtype="float64") def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index 1680461305188..61043cab36a68 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -17,7 +17,7 @@ import numpy as np import os import unittest - +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers @@ -25,6 +25,7 @@ from paddle.fluid.backward import 
append_backward from paddle.fluid.framework import Program, program_guard from simple_nets import simple_fc_net_with_inputs, batchnorm_fc_with_inputs +import paddle np.random.seed(123) @@ -41,6 +42,8 @@ def test_return_single_var(self): return -1 """ + paddle.enable_static() + def true_func(): return layers.fill_constant(shape=[2, 3], dtype='int32', value=2) @@ -73,6 +76,8 @@ def test_return_var_tuple(self): return 3, 2 """ + paddle.enable_static() + def true_func(): return layers.fill_constant(shape=[1, 2], dtype='int32', value=1), layers.fill_constant( @@ -114,6 +119,8 @@ def test_pass_and_modify_var(self): a = a - (i - 1) """ + paddle.enable_static() + def true_func(a, i): a = a * (i + 1) return a @@ -152,6 +159,8 @@ def test_return_none(self): pass """ + paddle.enable_static() + def true_func(): pass @@ -181,6 +190,8 @@ def test_wrong_structure_exception(self): test returning different number of tensors cannot merge into output """ + paddle.enable_static() + def func_return_none(): return None @@ -223,10 +234,11 @@ def func_return_two_tensors(): out = layers.cond(pred, func_return_one_tensor, func_return_two_tensors) self.assertTrue( - "Incompatible return values of true_fn and false_fn in cond" in - str(e.exception)) + "true fn returns 1 vars, but false fn returns 2 vars, which is not equals" + in str(e.exception)) def test_extremely_simple_net_with_op_in_condition(self): + paddle.enable_static() main_program = fluid.Program() startup_program = fluid.Program() with fluid.program_guard(main_program, startup_program): @@ -272,6 +284,8 @@ def test_cond_inside_cond(self): return a / a """ + paddle.enable_static() + def less_than_branch(i, a): return layers.cond(i >= 3.0, lambda: layers.elementwise_add(a, a), lambda: layers.elementwise_sub(a, a)) @@ -287,7 +301,7 @@ def greater_equal_branch(i, a): a = 2.0 * i out = layers.cond(i < 5.0, lambda: less_than_branch(i, a), lambda: greater_equal_branch(i, a)) - mean = layers.mean(out) + mean = paddle.mean(out) append_backward(mean) place = fluid.CUDAPlace( @@ -308,6 +322,7 @@ def greater_equal_branch(i, a): self.assertEqual(ret[1][0], expected_a_grad) def test_cond_op_in_condition(self): + paddle.enable_static() main_program = fluid.Program() startup_program = fluid.Program() @@ -344,6 +359,7 @@ def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe): """ Helper function that compares calculated backward value is close to dy/dx """ + paddle.enable_static() main_program = Program() main_program.random_seed = 123 startup_program = Program() @@ -474,6 +490,8 @@ def add_optimizer_helper(self, cond_func, use_cuda, use_parallel_exe): def test_cond_backward(self): + paddle.enable_static() + def cond_func(i, img, label): predicate = ((i % 2) == 0) return layers.cond( @@ -494,6 +512,7 @@ def cond_func(i, img, label): use_parallel_exe) def test_half_nested_cond_backward(self): + paddle.enable_static() def branch(i, img, label): return layers.cond( @@ -503,10 +522,10 @@ def branch(i, img, label): def cond_func_simple_net_at_true(i, img, label): return layers.cond(i < 5, lambda: branch(i, img, label), - lambda: layers.mean(img)) + lambda: paddle.mean(img)) def cond_func_simple_net_at_false(i, img, label): - return layers.cond(i < 5, lambda: layers.mean(img), + return layers.cond(i < 5, lambda: paddle.mean(img), lambda: branch(i, img, label)) for use_parallel_exe in [False, True]: @@ -530,6 +549,7 @@ def cond_func_simple_net_at_false(i, img, label): use_parallel_exe) def test_nested_cond_backward(self): + paddle.enable_static() def branch(i, 
img, label, mod_two): if mod_two: @@ -560,6 +580,7 @@ def cond_func(i, img, label): class TestCondWithError(unittest.TestCase): def test_input_type_error(self): + paddle.enable_static() main_program = framework.Program() startup_program = framework.Program() with framework.program_guard(main_program, startup_program): diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py index 64980115d9ea6..dc59246faa343 100644 --- a/python/paddle/fluid/tests/unittests/test_conditional_block.py +++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py @@ -16,6 +16,7 @@ import numpy as np import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core @@ -46,7 +47,7 @@ def test_forward(self): outs = exe.run(main_program, feed={'X': x}, fetch_list=[out])[0] print(outs) - loss = layers.mean(out) + loss = paddle.mean(out) append_backward(loss=loss) outs = exe.run( main_program, diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index fda3fa79ef664..446a5500bc30b 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -236,6 +236,16 @@ def __getitem__(self, idx): self.assertTrue(np.array_equal(actual_x, x.numpy())) self.assertTrue(np.array_equal(actual_y, y.numpy())) + def test_dev_ctx_alloc(self): + if not can_use_cuda_graph(): + return + + x = paddle.to_tensor([2], dtype='float32') + graph = CUDAGraph() + graph.capture_begin() + y = paddle.cast(x, dtype='float16') + graph.capture_end() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 86e23c79d07a4..ed01e7e06f6a9 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -943,7 +943,7 @@ def test_dataset_fleet(self): slots_vars.append(var) fake_cost = \ fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1]) - fake_cost = fluid.layers.mean(fake_cost) + fake_cost = paddle.mean(fake_cost) with fluid.scope_guard(scope): place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -1008,7 +1008,7 @@ def test_dataset_fleet2(self): slots_vars.append(var) fake_cost = \ fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1]) - fake_cost = fluid.layers.mean(fake_cost) + fake_cost = paddle.mean(fake_cost) with fluid.scope_guard(scope): place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -1136,7 +1136,7 @@ def test_bosps_dataset_fleet2(self): slots_vars.append(var) fake_cost = \ fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1]) - fake_cost = fluid.layers.mean(fake_cost) + fake_cost = paddle.mean(fake_cost) with fluid.scope_guard(scope): place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index 75dc36f9bb938..ba89f623b2cef 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -61,7 +61,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): predict_label = fluid.layers.fc(hidden, size=CLASS_NUM, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, 
label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py index c82ba2bc8cb8e..33ae3f0c6d024 100644 --- a/python/paddle/fluid/tests/unittests/test_desc_clone.py +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -77,7 +77,7 @@ def get_model(batch_size): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') @@ -181,7 +181,7 @@ def test_clone_with_stop_gradient(self): loss = fluid.layers.cross_entropy( input=fluid.layers.fc(hidden2, size=10, act='softmax'), label=fluid.layers.data(name='label', shape=[1], dtype='int64')) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) test_program = train_program.clone(for_test=False) self.assertEqual( @@ -217,7 +217,7 @@ def false_fn(): loss = fluid.layers.cross_entropy( input=fluid.layers.fc(hidden2, size=10, act='softmax'), label=fluid.layers.data(name='label', shape=[1], dtype='int64')) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) test_program = train_program.clone(for_test=False) self.assertEqual( @@ -256,7 +256,7 @@ def false_fn(): loss = fluid.layers.cross_entropy( input=fluid.layers.fc(hidden2, size=10, act='softmax'), label=fluid.layers.data(name='label', shape=[1], dtype='int64')) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) test_program = train_program.clone(for_test=False) self.assertRaises(ValueError, train_program._copy_data_info_from, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py index 38fea7f2413c7..8f66b9098d23f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py @@ -50,7 +50,7 @@ def test_a_sync_optimizer_trainer(self): x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32') y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') cost = paddle.fluid.layers.square_error_cost(input=x, label=y) - avg_cost = paddle.fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False @@ -88,7 +88,7 @@ def test_a_sync_optimizer_pserver(self): x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32') y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') cost = paddle.fluid.layers.square_error_cost(input=x, label=y) - avg_cost = paddle.fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py index 3e683b0d693c0..64ee376c176ce 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py @@ -54,7 +54,7 @@ def test_a_sync_optimizer1(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = 
paddle.mean(x=cost) os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py index d2ed6ad7ff1de..07dffa9efb14d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -66,7 +66,7 @@ def test_a_sync_optimizer3(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py index 707f072060a80..d73e5ab16fd88 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py @@ -58,7 +58,7 @@ def test_a_sync_optimizer2(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py index 51eb9b81619b7..c96d6768155fd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py @@ -55,7 +55,7 @@ def test_a_sync_optimizer_trainer(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True @@ -88,7 +88,7 @@ def test_a_sync_optimizer_pserver(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py index 3d7aa1b3fee0d..50b4e867678d4 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -41,7 +41,7 @@ def test_gradient_merge_optimizer(self): x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32') y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') cost = paddle.fluid.layers.square_error_cost(input=x, label=y) - avg_cost = paddle.fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) strategy = 
paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py index 7e3e5258aed60..67e2c3ffb85ca 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py @@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt): value=0.0, dtype='float32'), loss_op2) - avg_cost = fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost is_distributed = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py index ba70a3d1def7f..01560951c0c5d 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py @@ -67,7 +67,7 @@ def get_loss(cos_q_pt, cos_q_nt): value=0.0, dtype='float32'), loss_op2) - avg_cost = fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost is_distributed = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py index af61dc7fa3cf9..45d74260b09f6 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py @@ -70,7 +70,7 @@ def get_loss(cos_q_pt, cos_q_nt): value=0.0, dtype='float32'), loss_op2) - avg_cost = fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost is_distributed = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index 243023b4fe1c6..216ea4c2926fd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -70,7 +70,7 @@ def get_loss(cos_q_pt, cos_q_nt): value=0.0, dtype='float32'), loss_op2) - avg_cost = fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost is_distributed = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py index b8ff052c192cd..d6fe562dc93b9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py @@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt): value=0.0, dtype='float32'), loss_op2) - avg_cost = fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost is_distributed = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py index 32af1959f25db..338ef3af6621e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py @@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt): value=0.0, dtype='float32'), loss_op2) - avg_cost = fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost is_distributed = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py index 63ea8f639aae4..12a65a01cfe6d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py @@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt): value=0.0, dtype='float32'), loss_op2) - avg_cost = 
fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost is_distributed = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py index 692f586a43546..31f3c8d6d8a4f 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py @@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt): value=0.0, dtype='float32'), loss_op2) - avg_cost = fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost is_distributed = False diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py index d692528f5bb34..d929edfabd467 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py @@ -44,7 +44,7 @@ def test_trainer_desc_config(self): x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32') y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') cost = paddle.fluid.layers.square_error_cost(input=x, label=y) - avg_cost = paddle.fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py index 265e59ff94919..ce5e79fcd9b31 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py @@ -53,7 +53,7 @@ def test_open_sync_batch_norm(self): data = fluid.layers.data(name='X', shape=[1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer = fluid.optimizer.AdamOptimizer() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 5905b682d8941..e04638a1d80e6 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -26,6 +26,7 @@ gc.set_debug(gc.DEBUG_COLLECTABLE) +import paddle import paddle.fluid as fluid @@ -51,7 +52,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) @@ -257,7 +258,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay(learning_rate=1.0, decay_steps=2100, @@ -402,7 +403,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1) opt.minimize(avg_cost) @@ -422,7 +423,7 @@ def net_conf(self): 
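Note on the test_cuda_graph.py hunk several files above: the new test_dev_ctx_alloc case only captures a cast between capture_begin and capture_end (and is gated by can_use_cuda_graph()); in normal use the captured graph is then replayed. A hedged sketch of the full cycle, assuming the CUDAGraph class comes from paddle.device.cuda.graphs as in that test module:

import paddle
from paddle.device.cuda.graphs import CUDAGraph  # import path assumed from the test module

if paddle.is_compiled_with_cuda():
    x = paddle.to_tensor([2.0])
    graph = CUDAGraph()
    graph.capture_begin()
    y = paddle.cast(x, dtype='float16')  # allocations here come from the capture-time context
    graph.capture_end()
    graph.replay()  # re-executes the captured kernels
    graph.reset()   # releases the graph's resources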
bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) opt = fluid.optimizer.Ftrl(learning_rate=0.1) opt.minimize(avg_cost) @@ -442,7 +443,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.piecewise_decay([10000, 20000], [1.0, 0.5, 1.0])) @@ -491,7 +492,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) def filter(param): @@ -523,7 +524,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) base_lr = 1.0 bd = [1, 10, 20, 30] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] @@ -568,7 +569,7 @@ def net_conf(self): bias_attr=False) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0) sgd_optimizer.minimize(avg_cost) @@ -624,7 +625,7 @@ def emb_pool(ids, table_name, is_distributed): label = fluid.layers.data(name='label', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.Adam(learning_rate=0.003) optimizer.minimize(avg_cost) @@ -852,7 +853,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) optimizer.minimize(avg_cost) @@ -882,7 +883,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) optimizer.minimize(avg_cost) @@ -1027,7 +1028,7 @@ def network_with_table(self, is_sparse, is_distributed): seed=1, num_neg_samples=5, is_sparse=is_sparse) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer optimizer = fluid.optimizer.Adam(learning_rate=0.003) optimizer.minimize(avg_cost) @@ -1096,7 +1097,7 @@ def network_with_table(self, is_sparse, is_distributed): path_code=path_code, is_custom=True, is_sparse=is_sparse) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer optimizer = fluid.optimizer.SGD(learning_rate=0.003) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py index 
06cdaed1988cc..7950c2784221f 100644 --- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ import numpy as np import math import sys +import paddle + from op_test import OpTest @@ -164,5 +166,62 @@ def init_test_case(self): self.pixel_offset = False +class TestDistributeFpnProposalsAPI(unittest.TestCase): + + def setUp(self): + np.random.seed(678) + self.rois_np = np.random.rand(10, 4).astype('float32') + self.rois_num_np = np.array([4, 6]).astype('int32') + + def test_dygraph_with_static(self): + paddle.enable_static() + rois = paddle.static.data(name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data(name='rois_num', + shape=[None], + dtype='int32') + multi_rois, restore_ind, rois_num_per_level = paddle.vision.ops.distribute_fpn_proposals( + fpn_rois=rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num) + fetch_list = multi_rois + [restore_ind] + rois_num_per_level + + exe = paddle.static.Executor() + output_stat = exe.run(paddle.static.default_main_program(), + feed={ + 'rois': self.rois_np, + 'rois_num': self.rois_num_np + }, + fetch_list=fetch_list, + return_numpy=False) + output_stat_np = [] + for output in output_stat: + output_np = np.array(output) + if len(output_np) > 0: + output_stat_np.append(output_np) + + paddle.disable_static() + rois_dy = paddle.to_tensor(self.rois_np) + rois_num_dy = paddle.to_tensor(self.rois_num_np) + multi_rois_dy, restore_ind_dy, rois_num_per_level_dy = paddle.vision.ops.distribute_fpn_proposals( + fpn_rois=rois_dy, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num_dy) + output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy + output_dy_np = [] + for output in output_dy: + output_np = output.numpy() + if len(output_np) > 0: + output_dy_np.append(output_np) + + for res_stat, res_dy in zip(output_stat_np, output_dy_np): + self.assertTrue(np.allclose(res_stat, res_dy)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_distributed_strategy.py index 491555907ec40..92a6715a6424e 100644 --- a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_distributed_strategy.py @@ -13,6 +13,7 @@ # limitations under the License. 
import unittest +import paddle import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig, ServerRuntimeConfig from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory @@ -238,7 +239,7 @@ def test_debug_info(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) role = role_maker.UserDefinedRoleMaker( current_id=0, diff --git a/python/paddle/fluid/tests/unittests/test_dot_op.py b/python/paddle/fluid/tests/unittests/test_dot_op.py index 536f8fd8d8af7..ffdc90dd986ad 100644 --- a/python/paddle/fluid/tests/unittests/test_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_dot_op.py @@ -27,6 +27,7 @@ class DotOp(OpTest): def setUp(self): self.op_type = "dot" + self.python_api = paddle.dot self.init_dtype() self.init_input_output() @@ -38,34 +39,43 @@ def setUp(self): self.attrs = {} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): if core.is_compiled_with_rocm(): self.check_grad( ['X', 'Y'], 'Out', - user_defined_grads=[self.inputs['Y'], self.inputs['X']]) + user_defined_grads=[self.inputs['Y'], self.inputs['X']], + check_eager=True) else: - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_eager=True) def test_check_grad_ingore_x(self): if core.is_compiled_with_rocm(): self.check_grad(['Y'], 'Out', no_grad_set=set("X"), - user_defined_grads=[self.inputs['X']]) + user_defined_grads=[self.inputs['X']], + check_eager=True) else: - self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + self.check_grad(['Y'], + 'Out', + no_grad_set=set("X"), + check_eager=True) def test_check_grad_ingore_y(self): if core.is_compiled_with_rocm(): self.check_grad(['X'], 'Out', no_grad_set=set('Y'), - user_defined_grads=[self.inputs['Y']]) + user_defined_grads=[self.inputs['Y']], + check_eager=True) else: - self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + self.check_grad(['X'], + 'Out', + no_grad_set=set('Y'), + check_eager=True) def init_input_output(self): self.x = np.random.uniform(0.1, 1, [121]).astype(self.dtype) @@ -137,6 +147,7 @@ class TestComplexDotOp(OpTest): def setUp(self): self.op_type = "dot" + self.python_api = paddle.dot self.init_base_dtype() self.init_input_output() self.init_grad_input_output() @@ -164,27 +175,30 @@ def init_grad_input_output(self): self.grad_y = self.grad_out * np.conj(self.x) def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', user_defined_grads=[self.grad_x, self.grad_y], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) def test_check_grad_ingore_x(self): self.check_grad(['Y'], 'Out', no_grad_set=set("X"), user_defined_grads=[self.grad_y], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) def test_check_grad_ingore_y(self): self.check_grad(['X'], 'Out', no_grad_set=set('Y'), user_defined_grads=[self.grad_x], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) class TestComplexDotOp2D(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_downpoursgd.py 
b/python/paddle/fluid/tests/unittests/test_downpoursgd.py index 030af8f809e3e..16e9948a7e6d4 100644 --- a/python/paddle/fluid/tests/unittests/test_downpoursgd.py +++ b/python/paddle/fluid/tests/unittests/test_downpoursgd.py @@ -60,7 +60,7 @@ def test_device_work_use_cvm(self): y_predict = fluid.layers.fc(input=x_emb, size=1, act=None) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) ps_param = pslib.PSParameter() with open("{}/fleet_desc.prototxt".format(cache_path)) as f: @@ -120,7 +120,7 @@ def test_device_work(self): y_predict = fluid.layers.fc(input=x_emb, size=1, act=None) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) ps_param = pslib.PSParameter() with open("{}/fleet_desc.prototxt".format(cache_path)) as f: @@ -178,7 +178,7 @@ def test_downpour_opt_work(self): y_predict = fluid.layers.fc(input=x_emb, size=1, act=None) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) ps_param = pslib.PSParameter() with open("{}/fleet_desc.prototxt".format(cache_path)) as f: diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py index f77f54a636ee7..c0ff50d58cfa7 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py @@ -17,6 +17,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear from paddle.fluid.framework import _test_eager_guard @@ -111,7 +112,7 @@ def forward(self, inputs, label): x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) cost = self._linear(x) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index 2487bc15660e2..814ef31102fc2 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -143,7 +143,7 @@ def test_mnist_forward_float32(self): cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) dy_out = avg_loss.numpy() @@ -169,7 +169,7 @@ def test_mnist_forward_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) # initialize params and fetch them static_param_init_value = {} diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 0698a8b40df59..e811fe481f9fc 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -131,7 +131,7 @@ def test_plain_while_op(self): label = fluid.layers.data(name='label', shape=[1], dtype='float32') loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits, label=label) - 
loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(1e-4) sgd.minimize(loss=loss) @@ -174,7 +174,7 @@ def test_train_dynamic_rnn(self): label = fluid.layers.data(name='label', shape=[1], dtype='float32') loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.Adam(1e-3) sgd.minimize(loss=loss) @@ -242,7 +242,7 @@ def test_train_nested_dynamic_rnn(self): logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(1e-3) sgd.minimize(loss=loss) @@ -303,7 +303,7 @@ def test_train_nested_dynamic_rnn2(self): logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(1e-3) sgd.minimize(loss=loss) diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 0d6fa635a8fd4..e81da693b7f0b 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -17,6 +17,7 @@ import numpy import random import collections +import paddle import paddle.fluid as fluid import unittest from decorator_helper import * @@ -276,7 +277,7 @@ def test_forward_backward(self): out = rnn() out = fluid.layers.sequence_pool(out, pool_type='last') - loss = fluid.layers.mean(out) + loss = paddle.mean(out) fluid.backward.append_backward(loss) cpu = fluid.CPUPlace() @@ -357,7 +358,7 @@ def test_forward_backward(self): out = rnn() last = fluid.layers.sequence_pool(input=out, pool_type='last') - loss = fluid.layers.mean(last) + loss = paddle.mean(last) fluid.backward.append_backward(loss) cpu = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index 07f7fa818aa0e..1daa68aa01599 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -130,7 +130,7 @@ def build_graph(self, only_forward=False): return static_input_step_outs last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') - loss = fluid.layers.mean(last) + loss = paddle.mean(last) append_backward(loss) static_input_grad = self._program.global_block().var( framework.grad_var_name('static_input_tensor')) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py index 4bf8faf25ef44..ed7a3c0f0fe81 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py @@ -44,7 +44,7 @@ def simple_fc_net(): value=1.0))) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) optimizer = fluid.optimizer.Adam(learning_rate=1e-3) optimizer.minimize(loss) return image, label, loss diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py index 
39dc0caefd335..ac501a43ca75e 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -14,6 +14,7 @@ import unittest from test_eager_deletion_dynamic_rnn_base import TestBase +import paddle import paddle.fluid as fluid fluid.core._set_eager_deletion_mode(0.0, 1.0, True) @@ -38,7 +39,7 @@ def gru_net(data, fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py index 07f78d3b84568..bb6f608201573 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -13,6 +13,7 @@ # limitations under the License. from test_eager_deletion_dynamic_rnn_base import TestBase +import paddle import paddle.fluid as fluid import unittest @@ -40,7 +41,7 @@ def lstm_net(data, fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index 907e167b5f1d4..195278253e8a3 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -140,7 +140,7 @@ def setUp(self): self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -274,7 +274,7 @@ def setUp(self): self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -375,7 +375,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -456,7 +456,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -533,7 +533,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def 
create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -654,7 +654,7 @@ def setUp(self): forward_only_rnn.output(h) forward_only_output = forward_only_rnn() forward_only_output.stop_gradient = True - self.forward_only_output = layers.mean(forward_only_output) + self.forward_only_output = paddle.mean(forward_only_output) rnn = layers.StaticRNN() with rnn.step(): @@ -667,7 +667,7 @@ def setUp(self): rnn.update_memory(h_pre, h) rnn.output(h) - self.output = layers.mean(rnn()) + self.output = paddle.mean(rnn()) def forward_two_rnn(self): self.feed_map = { diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py index 41685fa4254bf..52048d798ba02 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -133,7 +133,7 @@ def run_main(self, place, with_data_parallel): tmp = layers.unsqueeze(sum_result, axes=[0]) tmp = layers.expand(tmp, expand_times=[10, 1]) fc = layers.fc(tmp, size=256) - loss = layers.mean(sum_result) + loss = paddle.mean(sum_result) optim = fluid.optimizer.Adam(learning_rate=1e-3) optim.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index 97f3eef51a5bf..224f44d74864b 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -541,5 +541,19 @@ def test_shape(self): self.assertEqual(C.item(), 8.0) +class TestComplex(unittest.TestCase): + """ + EinsumOp support Complex type + """ + + def test_shape(self): + a = paddle.rand([4, 4]) + b = paddle.rand([4, 4]) + c = paddle.einsum('xy,yz->xz', a, b) + a = paddle.cast(a, 'complex64') + b = paddle.cast(b, 'complex64') + c = paddle.einsum('xy,yz->xz', a, b) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_ema.py b/python/paddle/fluid/tests/unittests/test_ema.py index ae0dff4edf9e3..dd3472d31c928 100644 --- a/python/paddle/fluid/tests/unittests/test_ema.py +++ b/python/paddle/fluid/tests/unittests/test_ema.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid @@ -36,7 +37,7 @@ def setUp(self): hidden = fluid.layers.fc(input=data, size=10, param_attr=self._param_name) - cost = fluid.layers.mean(hidden) + cost = paddle.mean(hidden) self._test_program = fluid.default_main_program().clone( for_test=True) diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py index a1a4a263d936a..74d101497b8ed 100644 --- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py @@ -59,7 +59,7 @@ def run_program(self, place, stop_gradient=False): x.stop_gradient = stop_gradient emb = fluid.embedding(x, size=[10, 32], dtype='float32') - avg_cost = fluid.layers.mean(emb, name='mean_loss') + avg_cost = paddle.mean(emb, name='mean_loss') optim = fluid.optimizer.SGD(learning_rate=0.001) optim.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py index 6e826dacf7ca5..900dbb4b1909f 100644 --- a/python/paddle/fluid/tests/unittests/test_exception.py +++ b/python/paddle/fluid/tests/unittests/test_exception.py 
@@ -49,7 +49,7 @@ def test_exception_in_static_mode(self): y = fluid.layers.data(name='Y', shape=[-1, 1], dtype='float32') predict = fluid.layers.fc(input=x, size=1, act=None) loss = fluid.layers.square_error_cost(input=predict, label=y) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py index a35ebfbab173e..120788ac50e93 100644 --- a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py +++ b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py @@ -17,6 +17,7 @@ import unittest import numpy +import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -30,7 +31,7 @@ def net(self): y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) opt = fluid.optimizer.Adam(learning_rate=lr) opt.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py index 05676c34e6def..1362065f81981 100644 --- a/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py @@ -17,6 +17,7 @@ import unittest import numpy +import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -30,7 +31,7 @@ def net(self): y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) opt = fluid.optimizer.Adam(learning_rate=lr) opt.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py index f3fe43e315212..8519beb9615ad 100644 --- a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py +++ b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py @@ -61,7 +61,7 @@ def _simple_fc_net(self, in_size, label_size, class_num, hidden_sizes): hidden = fluid.layers.fc(hidden, size=hidden_size) predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py index 2e48157f950f8..ab8ea8d3e4bd3 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py @@ -44,7 +44,7 @@ def conv_net(self, img, label): hidden = fluid.layers.fc(input=conv_pool_2, size=32, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, prediction def build_program(self, main, startup, is_test): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py index 139ce121ad587..edcaa54a7f884 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py +++ 
b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker @@ -56,7 +57,7 @@ def testInvalidInputs(self): data = fluid.layers.data(name='X', shape=[1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) adam = fluid.optimizer.Adam() adam.minimize(loss) place = fluid.CPUPlace() @@ -156,7 +157,7 @@ def testInvalidInputs(self): self.assertRaises(Exception, transpiler.minimize, loss=[]) data = fluid.layers.data(name='X', shape=[1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) self.assertRaises(Exception, transpiler.minimize, loss=loss.name, diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py index 460ef27f63c18..7f37e7b2a44ac 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_auto.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py @@ -42,7 +42,7 @@ def test_distributed_strategy_auto(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py index 5b87f215feff7..ed914d2866510 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py @@ -55,7 +55,7 @@ def test_ps_minimize(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) role = fleet.PaddleCloudRoleMaker(is_collective=False) fleet.init(role) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py index 5e6aabe308ec1..c81d96bafd332 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py @@ -42,7 +42,7 @@ def test_collective_minimize(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) @@ -72,7 +72,7 @@ def test_fleet_get_applied_optimizer(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) fleet.init(is_collective=True) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py index f48b166f97035..fb78f1f1add0c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py +++ 
b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py @@ -13,6 +13,7 @@ # limitations under the License. import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet @@ -43,7 +44,7 @@ def _test_checkpoint(self, fs, dir_path): place=fluid.CPUPlace()) predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001) dist_optimizer = fleet.distributed_optimizer(optimizer) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py index d7de5ef3d40eb..363843dd5e839 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py @@ -44,7 +44,7 @@ def net(self, main_prog, startup_prog, dtype='float32'): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.fp16_allreduce = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 6ca078cdde7f5..bc6a554f84d8f 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -80,7 +80,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) @@ -148,7 +148,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 @@ -228,7 +228,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) @@ -295,7 +295,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py index 2afe4af3645f2..af2a8a1465c3c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py @@ -58,7 +58,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + 
avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py index 928ea06a611d4..3062812223d64 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py @@ -79,9 +79,9 @@ def test_opt_sharding_with_pp(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add', 'softmax', 'cross_entropy2', 'reduce_mean', + 'fill_constant', 'reduce_mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', @@ -161,9 +161,9 @@ def test_opt_sharding_with_pp_with_allreduce_fuse(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add', 'softmax', 'cross_entropy2', 'reduce_mean', + 'fill_constant', 'reduce_mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', @@ -228,8 +228,8 @@ def test_opt_sharding_with_pp_amp_gclip(self): 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax', - 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', - 'fill_constant', 'elementwise_mul_grad', 'mean_grad', + 'cast', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', + 'fill_constant', 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', @@ -305,10 +305,10 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax', - 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', + 'cast', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', @@ -386,22 +386,22 @@ def 
test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self): 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'softmax', 'cast', 'cross_entropy2', - 'mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', + 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', - 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'cast', - 'elementwise_add_grad', 'cast', 'mul_grad', 'cast', 'tanh_grad', - 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', - 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', - 'elementwise_add', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream', - 'send_v2', 'cast', 'sum', 'sum', 'cast', 'sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'check_finite_and_unscale', 'cast', 'c_allreduce_max', - 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', - 'cast', 'momentum', 'cast', 'momentum', 'cast', 'momentum', - 'momentum', 'cast', 'coalesce_tensor', 'c_broadcast', 'c_broadcast', - 'coalesce_tensor', 'c_broadcast' + 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'cast', 'elementwise_add_grad', 'cast', + 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', + 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', + 'mul_grad', 'cast', 'cast', 'mul', 'elementwise_add', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'c_sync_calc_stream', 'send_v2', 'cast', 'sum', 'sum', 'cast', + 'sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'c_allreduce_max', 'cast', 'update_loss_scaling', + 'momentum', 'cast', 'momentum', 'cast', 'momentum', 'cast', + 'momentum', 'momentum', 'cast', 'coalesce_tensor', 'c_broadcast', + 'c_broadcast', 'coalesce_tensor', 'c_broadcast' ]) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index f6f3f50be0dee..1c20d2e45be03 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -48,7 +48,7 @@ def net(self, main_prog, startup_prog): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.lamb = True @@ -120,7 +120,7 @@ def test_lamb_apply_with_amp(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.amp = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index b4f0c93d09ccc..b560cdaa66ef4 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -48,7 +48,7 @@ 
def net(self, main_prog, startup_prog): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.lars = True @@ -121,7 +121,7 @@ def test_lars_apply_with_amp(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.amp = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py index f39f916dbbe64..21246cb74c442 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py @@ -44,7 +44,7 @@ def net(main_prog, startup_prog): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) opt = MetaOptimizerBase(optimizer) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py index d9bc0c7a5f39c..279a2e21f70ef 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py @@ -57,7 +57,7 @@ def net(self): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost def test_pipeline_optimizer(self): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py index 5c086a5994f0b..c45c81c35b42b 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py @@ -52,7 +52,7 @@ def test_pipeline_optimizer(self): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.pipeline = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py index 05c3391565ea2..3fde52958d353 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py @@ -41,7 +41,7 @@ def test_pipeline_optimizer(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.without_graph_optimization = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py 
b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index 7fc68ec15636a..8952f01dd6df5 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -446,7 +446,7 @@ def net(): y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') cost = paddle.fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost from paddle.distributed import fleet diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 20eace7cce3c0..68ad29880b372 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -52,14 +52,14 @@ def test_sharding_optimizer(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'momentum', - 'momentum', 'momentum' + 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' ]) def test_sharding_amp_optimizer(self): @@ -92,16 +92,16 @@ def test_sharding_amp_optimizer(self): 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', - 'mean', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'cast', - 'cast', 'cast', 'check_finite_and_unscale', 'cast', - 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', - 'momentum', 'momentum' + 'c_sync_comm_stream', 'cast', 'cast', 'cast', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', 'cast', + 'update_loss_scaling', 'momentum', 'momentum', 'momentum' ]) def test_sharding_recompute_optimizer(self): @@ -132,11 +132,12 @@ def 
test_sharding_recompute_optimizer(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'mul', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'mul', 'elementwise_add', - 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'mul', - 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' @@ -177,9 +178,9 @@ def test_sharding_amp_recompute_optimizer(self): 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', - 'softmax', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', - 'fill_constant', 'elementwise_mul_grad', 'mean_grad', - 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'softmax', 'cast', 'cross_entropy2', 'reduce_mean', + 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'mul', @@ -227,16 +228,17 @@ def test_sharding_amp_asp_optimizer(self): 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', - 'mean', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'cast', - 'cast', 'cast', 'check_finite_and_unscale', 'cast', - 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', - 'momentum', 'momentum', 'elementwise_mul' + 'c_sync_comm_stream', 'cast', 'cast', 'cast', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', 'cast', + 'update_loss_scaling', 'momentum', 'momentum', 'momentum', + 'elementwise_mul' ]) def test_sharding_weight_decay(self): @@ -268,15 +270,15 @@ def test_sharding_weight_decay(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 
'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'scale', - 'sum', 'scale', 'sum', 'scale', 'sum', 'momentum', 'momentum', - 'momentum' + 'c_sync_comm_stream', 'scale', 'sum', 'scale', 'sum', 'scale', + 'sum', 'momentum', 'momentum', 'momentum' ]) def test_sharding_gradient_clip(self): @@ -308,17 +310,18 @@ def test_sharding_gradient_clip(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum', - 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', - 'elementwise_div', 'elementwise_mul', 'elementwise_mul', - 'elementwise_mul', 'momentum', 'momentum', 'momentum' + 'c_sync_comm_stream', 'squared_l2_norm', 'squared_l2_norm', + 'squared_l2_norm', 'sum', 'c_allreduce_sum', 'sqrt', + 'fill_constant', 'elementwise_max', 'elementwise_div', + 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'momentum', + 'momentum', 'momentum' ]) def test_sharding_clone_for_test(self): @@ -338,7 +341,7 @@ def test_sharding_clone_for_test(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean' + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'reduce_mean' ]) @@ -464,15 +467,16 @@ def test_sharding_hybrid_dp(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 
'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' + 'c_sync_comm_stream', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_sync_comm_stream', 'momentum', 'momentum', + 'momentum' ]) def test_sharding_hybrid_dp_gm(self): @@ -527,15 +531,16 @@ def test_sharding_hybrid_dp_gm(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'elementwise_add', 'elementwise_add', 'elementwise_add', - 'increment', 'elementwise_mod', 'equal', 'conditional_block' + 'c_sync_comm_stream', 'elementwise_add', 'elementwise_add', + 'elementwise_add', 'increment', 'elementwise_mod', 'equal', + 'conditional_block' ]) self.assertEqual(opt_ops, [ 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', 'scale', @@ -597,10 +602,11 @@ def test_sharding_with_pp(self): 'c_broadcast', 'c_sync_comm_stream', 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'softmax', - 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad', - 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', - 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', - 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'cross_entropy2', 'reduce_mean', 'fill_constant', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', @@ -720,26 +726,26 @@ def test_hybrid_with_mp_pp_amp_gclip(self): 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', - 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', 'mean', - 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', - 
'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream', - 'partial_send', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'c_sync_comm_stream', 'check_finite_and_unscale', - 'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast', - 'update_loss_scaling', 'fill_constant', 'c_allreduce_sum', - 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', - 'elementwise_div', 'elementwise_mul', 'elementwise_mul', + 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'c_sync_calc_stream', 'partial_send', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'c_sync_comm_stream', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'fill_constant', + 'c_allreduce_sum', 'c_allreduce_sum', 'sqrt', 'fill_constant', + 'elementwise_max', 'elementwise_div', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', - 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'momentum', - 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', - 'momentum', 'momentum' + 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', + 'elementwise_mul', 'momentum', 'momentum', 'momentum', 'momentum', + 'momentum', 'momentum', 'momentum', 'momentum' ]) # pp + mp, partial send recv @@ -839,25 +845,26 @@ def test_hybrid_with_mp_pp_amp_gclip_for_optimizer(self): 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', - 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', 'mean', - 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream', - 'partial_send', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'c_sync_comm_stream', 'check_finite_and_unscale', - 'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast', - 'update_loss_scaling', 'memcpy', 'fill_constant', 'c_allreduce_sum', - 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', - 
'elementwise_div', 'elementwise_mul', 'elementwise_mul', + 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'c_sync_calc_stream', 'partial_send', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'c_sync_comm_stream', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'memcpy', + 'fill_constant', 'c_allreduce_sum', 'c_allreduce_sum', 'sqrt', + 'fill_constant', 'elementwise_max', 'elementwise_div', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', - 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'adamw', - 'adamw', 'adamw', 'adamw', 'adamw', 'adamw', 'adamw', 'adamw' + 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', + 'elementwise_mul', 'elementwise_mul', 'adamw', 'adamw', 'adamw', + 'adamw', 'adamw', 'adamw', 'adamw', 'adamw' ]) # pp + mp, partial send recv @@ -948,8 +955,8 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce(self): 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', - 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', - 'fill_constant', 'elementwise_mul_grad', 'mean_grad', + 'softmax', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', + 'fill_constant', 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', @@ -1119,11 +1126,11 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', @@ -1218,11 +1225,11 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'elementwise_mul', 
'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', @@ -1320,13 +1327,13 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse( self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', - 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', - 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', - 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', + 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', + 'fill_constant', 'elementwise_mul_grad', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'sum', 'cast', 'sum', 'c_allreduce_sum', 'c_allreduce_sum', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', @@ -1415,10 +1422,10 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self): 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', - 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'softmax', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', @@ -1511,10 +1518,10 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse_and_avg_after_sum(self): 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', - 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'softmax', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', @@ -1574,11 
+1581,12 @@ def test_hybrid_with_pp_dp_with_gradient_fuse_and_avg_after_sum(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'mean_grad', - 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', - 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', - 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add', 'softmax', 'cross_entropy2', 'reduce_mean', + 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'sum', 'c_allreduce_sum', 'c_sync_comm_stream', 'scale', 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', @@ -1640,10 +1648,10 @@ def test_hybrid_with_pp_dp_with_amp_no_dynamic_gradient_fuse_and_avg_after_sum( 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', - 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'softmax', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py index d1b9c68925747..dce6a37c6bbb8 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py @@ -39,6 +39,7 @@ def setUp(self): self.groups = 1 self.no_bias = False self.data_format = "NHWC" + np.random.seed(2022) def prepare(self): if isinstance(self.filter_shape, int): @@ -188,6 +189,7 @@ def setUp(self): self.groups = 1 self.no_bias = False self.data_format = "NHWC" + np.random.seed(2022) def test_exception(self): self.prepare() diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index c8106db13300f..d3a18ad28ca54 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -48,7 +48,7 @@ def build_program(self, main_program, startup_program, use_cuda, seed=1): data_layout='NHWC') prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=y) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(learning_rate=0.001) if use_cuda: sgd = fluid.contrib.mixed_precision.decorate( diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py index 59b85530f10da..08141c44395e3 100644 --- 
a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py @@ -94,7 +94,7 @@ def build_fused_program(self, act='softmax', param_attr=self.fc_param_attr) loss = fluid.layers.cross_entropy(input=prediction, label=y) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(learning_rate=0.001) sgd = fluid.contrib.mixed_precision.decorate( sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) @@ -144,7 +144,7 @@ def build_origin_program(self, act='softmax', param_attr=self.fc_param_attr) loss = fluid.layers.cross_entropy(input=prediction, label=y) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(learning_rate=0.001) sgd = fluid.contrib.mixed_precision.decorate( sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index cddc05f591444..a9e9a588e857d 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -62,7 +62,7 @@ def simple_depthwise_net(use_feed): hidden = fluid.layers.relu(hidden) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index 6507cc1ee3258..1ad29ecadd7bf 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -26,9 +26,7 @@ from paddle.fluid import layers import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph - -_enable_legacy_dygraph() +from paddle.fluid.framework import default_main_program default_main_program().random_seed = 42 diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index 43d39224287e6..8d2873276033a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -23,9 +23,7 @@ from paddle.nn.layer.common import Linear, Dropout import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph - -_enable_legacy_dygraph() +from paddle.fluid.framework import default_main_program class TestFusedFFNOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py index 0aad7ec7581e9..8b8d378e5c8fa 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -26,11 +26,9 @@ from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float from test_sparse_attention_op import get_cuda_version from paddle import _C_ops -from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph +from paddle.fluid.framework import default_main_program from paddle.fluid import core -_enable_legacy_dygraph() - @unittest.skipIf(not core.is_compiled_with_cuda(), 
"Paddle is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_fused_token_prune_op.py b/python/paddle/fluid/tests/unittests/test_fused_token_prune_op.py new file mode 100644 index 0000000000000..9425283f078c0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_token_prune_op.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +from op_test import OpTest +from paddle.framework import core + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFusedTokenPruneOp(OpTest): + + def setDtype(self): + self.dtype = np.float32 + + def setInouts(self): + attn = [[1, 2], [3, 4]] + attn = np.array(attn, dtype=self.dtype) + attn = np.expand_dims(attn, axis=0) + self.attn = np.expand_dims( + attn, axis=0) # [1,1,2,2] bsz = 1, nd_head=1, max_seq_len=2 + mask = [[1, 1], [-1, -1]] + mask = np.array(mask, dtype=self.dtype) + mask = np.expand_dims(mask, axis=0) + self.mask = np.expand_dims(mask, axis=0) # same as attn + x = [[1, 2, 3], [4, 5, 6]] + x = np.array(x, dtype=self.dtype) + self.x = np.expand_dims(x, + axis=0) # [1, 2, 3] bsz = 1, max_seq_len=2, c=3 + new_mask = [[1]] + new_mask = np.array(new_mask, dtype=self.dtype) + new_mask = np.expand_dims(new_mask, axis=0) + self.new_mask = np.expand_dims(new_mask, axis=0) #[1, 1, 1, 1] + + out_slimmedx_py = [[[1, 2, 3]]] + self.out_slimmedx_py = np.array(out_slimmedx_py, dtype=self.dtype) + + out_cls_inds_py = [[0]] + self.out_cls_inds_py = np.array(out_cls_inds_py, dtype='int64') + + def setUp(self): + self.op_type = 'fused_token_prune' + self.setDtype() + self.setInouts() + self.inputs = { + 'Attn': self.attn, + 'Mask': self.mask, + 'X': self.x, + 'NewMask': self.new_mask + } + + self.outputs = { + 'SlimmedX': self.out_slimmedx_py, + 'CLSInds': self.out_cls_inds_py + } + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFusedTokenPruneOpFloat64(TestFusedTokenPruneOp): + + def setDtype(self): + self.dtype = np.float64 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFusedTokenPruneOp2(TestFusedTokenPruneOp): + + def setInouts(self): + attn = [[[[1, 2, 3, 4], [4, 3, 2, 1], [5, 9, 5, 4], [9, 6, 5, 4]], + [[8, 5, 2, 0], [1, 0, 2, 3], [2, 2, 3, 2], [7, 4, 1, 8]]]] + self.attn = np.array( + attn, + dtype=self.dtype) # [1,2,4,4] bsz = 1, nd_head=2, max_seq_len=4 + mask = [[[[-1, -1, -1, 1], [-1, -1, 1, 1], [-1, -1, 1, 1], + [-1, -1, 1, 1]], + [[-1, -1, 1, 1], [-1, -1, 1, 1], [-1, -1, 1, 1], + [-1, -1, 1, 1]]]] + self.mask = np.array(mask, dtype=self.dtype) # same as attn + x = [[[1.1, 1.1, 1.1], [2.2, 2.2, 2.2], [3.3, 3.3, 3.3], + [4.4, 4.4, 4.4]]] + self.x = np.array( + x, dtype=self.dtype) # [1, 4, 3] bsz = 1, max_seq_len=4, c=3 + self.new_mask = 
np.random.rand(1, 2, 2, + 2).astype(self.dtype) #[1, 2, 2, 2] + + out_slimmedx_py = [[[1.1, 1.1, 1.1], [4.4, 4.4, 4.4]]] #[1, 2, 3] + self.out_slimmedx_py = np.array(out_slimmedx_py, dtype=self.dtype) + + out_cls_inds_py = [[0, 3]] + self.out_cls_inds_py = np.array(out_cls_inds_py, dtype='int64') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py index 674c0b4d12fe4..7c1a80d229626 100644 --- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py @@ -63,7 +63,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): predict_label = fluid.layers.fc(hidden, size=CLASS_NUM, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index dfdb3c32dc232..e84a3f0329623 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -47,7 +47,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost @@ -89,7 +89,7 @@ def check_gradient_clip(self, place, dtype='float32'): predict = fluid.layers.fc(input=hidden, size=10, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) prog_clip = prog.clone() avg_cost_clip = prog_clip.block(0).var(avg_cost.name) diff --git a/python/paddle/fluid/tests/unittests/test_identity_loss_op.py b/python/paddle/fluid/tests/unittests/test_identity_loss_op.py new file mode 100644 index 0000000000000..3912fcafd52d0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_identity_loss_op.py @@ -0,0 +1,188 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard + + +class TestIdentityLossOp(OpTest): + + def setUp(self): + self.max_relative_error = 0.006 + self.python_api = paddle.incubate.identity_loss + + self.inputs = {} + self.initTestCase() + self.dtype = np.float64 + + self.op_type = "identity_loss" + self.attrs = {} + self.attrs['reduction'] = self.reduction + + input = np.random.random(self.shape).astype(self.dtype) + + self.inputs['X'] = input + if self.reduction == 0: + output = input.sum() + elif self.reduction == 1: + output = input.mean() + else: + output = input + self.outputs = {'Out': output} + + def test_check_output(self): + paddle.enable_static() + self.check_output(check_eager=True) + paddle.disable_static() + + def test_check_grad_normal(self): + paddle.enable_static() + self.check_grad(['X'], 'Out', check_eager=True) + paddle.disable_static() + + def initTestCase(self): + self.shape = (4, 10, 10) + self.reduction = 0 + + +class TestCase1(TestIdentityLossOp): + + def initTestCase(self): + self.shape = (8, 16, 8) + self.reduction = 0 + + +class TestCase2(TestIdentityLossOp): + + def initTestCase(self): + self.shape = (8, 16) + self.reduction = 1 + + +class TestCase3(TestIdentityLossOp): + + def initTestCase(self): + self.shape = (4, 8, 16) + self.reduction = 2 + + +class TestIdentityLossFloat32(TestIdentityLossOp): + + def set_attrs(self): + self.dtype = 'float32' + + +class TestIdentityLossOpError(unittest.TestCase): + + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + input_data = np.random.random((2, 4)).astype("float32") + + def test_int(): + paddle.incubate.identity_loss(x=input_data, reduction=3) + + self.assertRaises(Exception, test_int) + + def test_string(): + paddle.incubate.identity_loss(x=input_data, + reduction="wrongkey") + + self.assertRaises(Exception, test_string) + + def test_dtype(): + x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') + paddle.incubate.identity_loss(x=x2, reduction=1) + + self.assertRaises(TypeError, test_dtype) + paddle.disable_static() + + +class TestIdentityLossAPI(unittest.TestCase): + + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + self.place = fluid.CPUPlace() + + def identity_loss_ref(self, input, reduction): + if reduction == 0 or reduction == "sum": + return input.sum() + elif reduction == 1 or reduction == "mean": + return input.mean() + else: + return input + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.x_shape) + out1 = paddle.incubate.identity_loss(x) + out2 = paddle.incubate.identity_loss(x, reduction=0) + out3 = paddle.incubate.identity_loss(x, reduction=1) + out4 = paddle.incubate.identity_loss(x, reduction=2) + + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x}, + fetch_list=[out1, out2, out3, out4]) + ref = [ + self.identity_loss_ref(self.x, 2), + self.identity_loss_ref(self.x, 0), + self.identity_loss_ref(self.x, 1), + self.identity_loss_ref(self.x, 2) + ] + for out, out_ref in zip(res, ref): + self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def test_case(x, reduction): + x_tensor = 
paddle.to_tensor(x) + out = paddle.incubate.identity_loss(x_tensor, reduction) + out_ref = self.identity_loss_ref(x, reduction) + self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-04), + True) + + test_case(self.x, 0) + test_case(self.x, 1) + test_case(self.x, 2) + test_case(self.x, "sum") + test_case(self.x, "mean") + test_case(self.x, "none") + paddle.enable_static() + + def test_errors(self): + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 12]).astype('float32') + x = paddle.to_tensor(x) + self.assertRaises(Exception, paddle.incubate.identity_loss, x, -1) + self.assertRaises(Exception, paddle.incubate.identity_loss, x, 3) + self.assertRaises(Exception, paddle.incubate.identity_loss, x, + "wrongkey") + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', [10, 12], 'int32') + self.assertRaises(TypeError, paddle.incubate.identity_loss, x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 9649e9c68eda2..d59cdc3e328e2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -82,7 +82,7 @@ def amp_guard_black_op(self): with fluid.dygraph.guard(): data = fluid.dygraph.to_variable(data) with fluid.dygraph.amp_guard(True): - out_fp32 = fluid.layers.mean(data) + out_fp32 = paddle.mean(data) self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) @@ -222,7 +222,7 @@ def run_simple_conv(inp_np, use_scaler=True): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) if use_scaler: print('use scaler') scaled_loss = scaler.scale(loss) @@ -273,7 +273,7 @@ def run_simple_conv(inp_np, use_scaler=True): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) if use_scaler: print('use scaler') scaled_loss = scaler.scale(loss) @@ -316,7 +316,7 @@ def nan_inf(self): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) @@ -1215,7 +1215,7 @@ def train_resnet(self, enable_amp=True, level='O1'): out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py index d12b002f04ef8..3b1a0436556b1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py @@ -81,7 +81,7 @@ def amp_guard_black_op(self): with fluid.dygraph.guard(): data = fluid.dygraph.to_variable(data) with fluid.dygraph.amp_guard(True): - out_fp32 = fluid.layers.mean(data) + out_fp32 = paddle.mean(data) self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) @@ -221,7 +221,7 @@ def run_simple_conv(inp_np, 
use_scaler=True): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) if use_scaler: print('use scaler') scaled_loss = scaler.scale(loss) @@ -272,7 +272,7 @@ def run_simple_conv(inp_np, use_scaler=True): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) if use_scaler: print('use scaler') scaled_loss = scaler.scale(loss) @@ -315,7 +315,7 @@ def nan_inf(self): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) @@ -1206,7 +1206,7 @@ def train_resnet(self, enable_amp=True, level='O1'): out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 4dee7cf963348..7a5934b4fdc79 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -13,6 +13,7 @@ # limitations under the License. import unittest +import paddle import paddle.fluid as fluid import numpy as np from paddle.fluid.framework import _test_eager_guard @@ -79,7 +80,7 @@ def forward(self, x, label): label = fluid.layers.cast(label, dtype='int64') # Note that the label is not persistable in fluid.layers.cross_entropy. loss = fluid.layers.cross_entropy(input=feature, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -96,7 +97,7 @@ def forward(self, x, label, test_num): dim=1) # Note that: part2 is not used. 
loss = fluid.layers.cross_entropy(input=part1, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) if test_num == 1: return loss, part2 else: @@ -460,7 +461,7 @@ def func_case3_prune_no_grad_branch2(self): label = fluid.layers.cast(label, dtype="float32") label = fluid.layers.cast(label, dtype='int64') out = fluid.layers.one_hot(input=label, depth=100) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) loss.backward() self.assertTrue(linear.weight._grad_ivar() is None) @@ -472,7 +473,7 @@ def test_case3_prune_no_grad_branch2(self): def func_case4_with_no_grad_op_maker(self): with fluid.dygraph.guard(): out = fluid.layers.gaussian_random(shape=[20, 30]) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) loss.backward() self.assertTrue(out._grad_ivar() is None) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index aeead6ff74745..a365b00e9129c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -178,7 +178,7 @@ def func_test_mnist_float32(self): helper.assertEachVar(cost, cost_static) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) dy_out = avg_loss.numpy() @@ -213,7 +213,7 @@ def func_test_mnist_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) sgd.minimize(avg_loss) # initialize params and fetch them diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py index 23af23a4286ea..18094024b4a10 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py @@ -63,7 +63,7 @@ def func_test_mnist_sort_gradient_float32(self): cost2 = mnist2(img2) loss2 = fluid.layers.cross_entropy(cost2, label2) - avg_loss2 = fluid.layers.mean(loss2) + avg_loss2 = paddle.mean(loss2) dy_out2 = avg_loss2.numpy() @@ -100,7 +100,7 @@ def func_test_mnist_sort_gradient_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) sgd.minimize(avg_loss) # initialize params and fetch them diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 69ebf875b3d0b..0371176d7824f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -309,7 +309,7 @@ def func_test_resnet_float32(self): resnet.train() loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() @@ -356,7 +356,7 @@ def func_test_resnet_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) optimizer.minimize(avg_loss) # initialize params and fetch them diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py index 0a1d1c0cfb315..4942e1db76968 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py @@ -112,7 +112,7 @@ def func_test_resnet_sort_gradient_float32(self): out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() @@ -161,7 +161,7 @@ def func_test_resnet_sort_gradient_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) optimizer.minimize(avg_loss) # initialize params and fetch them diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 245982c71ccc2..fa2d470fc5e79 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -351,7 +351,7 @@ def run_dygraph(): softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() @@ -410,7 +410,7 @@ def run_dygraph(): out = se_resnext(img) softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) optimizer.minimize(avg_loss) # initialize params and fetch them diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 092478bbf2ae1..b8ea449c2b254 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -427,7 +427,7 @@ def get_generator_loss(image_real, label_org, label_trg, generator, pred_fake, cls_fake = discriminator(fake_img) - g_loss_fake = -fluid.layers.mean(pred_fake) + g_loss_fake = -paddle.mean(pred_fake) g_loss_cls = loss_cls(cls_fake, label_trg, cfg) g_loss = g_loss_fake + cfg.lambda_rec * g_loss_rec + g_loss_cls return g_loss @@ -439,8 +439,8 @@ def get_discriminator_loss(image_real, label_org, label_trg, generator, pred_real, cls_real = discriminator(image_real) pred_fake, _ = discriminator(fake_img) d_loss_cls = loss_cls(cls_real, label_org, cfg) - d_loss_fake = fluid.layers.mean(pred_fake) - d_loss_real = -fluid.layers.mean(pred_real) + d_loss_fake = paddle.mean(pred_fake) + d_loss_real = -paddle.mean(pred_real) d_loss = d_loss_real + d_loss_fake + d_loss_cls d_loss_gp = gradient_penalty(discriminator, image_real, fake_img, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py index 619e1ba37d60c..d031cd84683da 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py @@ -52,7 +52,7 @@ def 
static_train_net(img, label): prediction = convolutional_neural_network(img) loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) @@ -159,7 +159,7 @@ def load_and_train_dygraph(self): cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() sgd.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py index 6c90b8348714c..0c4dad64adaea 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py @@ -86,7 +86,7 @@ def train_and_save_model(self): pred = while_softmax_regression(img) loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) @@ -144,7 +144,7 @@ def load_and_train_dygraph(self): cost = while_net(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() sgd.minimize(avg_loss) @@ -169,7 +169,7 @@ def load_and_train_static(self): pred = while_softmax_regression(img) loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index 89b7771700f57..431b8be2a779e 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -61,7 +61,7 @@ def test_fit_line_inference_model(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) @@ -142,7 +142,7 @@ def test_save_inference_model(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) place = core.CPUPlace() exe = executor.Executor(place) @@ -166,7 +166,7 @@ def test_save_inference_model_with_auc(self): auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, label=y) cost = fluid.layers.cross_entropy(input=predict, label=y) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) place = core.CPUPlace() exe = executor.Executor(place) @@ -197,7 +197,7 @@ def test_save_inference_model(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) place = core.CPUPlace() exe = executor.Executor(place) @@ -230,7 +230,7 @@ def test_save_and_load_inference_model(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = 
optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) @@ -350,7 +350,7 @@ def test_serialize_program_and_persistables(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) @@ -393,7 +393,7 @@ def test_normalize_program(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py index f45ada0a52980..6b4e3602fe641 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -17,6 +17,7 @@ import os import unittest import numpy as np +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from parallel_executor_test_base import TestParallelExecutorBase, DeviceType @@ -38,7 +39,7 @@ def fc_with_batchnorm(use_feed): hidden = fluid.layers.batch_norm(input=hidden) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py index cd34e9070213a..fa3adfb9e99f5 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py @@ -65,7 +65,7 @@ def check_network_convergence(self, prob = ie() loss = layers.cross_entropy(input=prob[0], label=label) - avg_loss = layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer.minimize(avg_loss, startup_prog) diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py index 360457000befd..ed4dd80885d31 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py @@ -15,7 +15,7 @@ # nlp model stack of op operate on lod. It's a classical test case in optimize pass. 
from __future__ import print_function - +import paddle import paddle.fluid as fluid import unittest from ir_memory_optimize_net_base import TestIrMemOptBase @@ -43,7 +43,7 @@ def lstm_net(data, fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py index 4b775197aaea1..ac57e1b92243f 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -36,7 +36,7 @@ def simple_fc_net(use_feed): x = fluid.layers.fc(input=x, size=20, act='relu') y_predict = fluid.layers.fc(input=x, size=10, act='softmax') cost = fluid.layers.cross_entropy(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost @@ -49,7 +49,7 @@ def fc_with_inplace_net(use_feed): reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2]) y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax') cost = fluid.layers.cross_entropy(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_jit_layer.py b/python/paddle/fluid/tests/unittests/test_jit_layer.py new file mode 100644 index 0000000000000..fd77aa599889f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_jit_layer.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import paddle +import unittest +import tempfile +import numpy as np +from paddle.static import InputSpec +from paddle.fluid.framework import _enable_legacy_dygraph +from paddle.jit.layer import Layer +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator + +paddle.seed(1) + + +class Net(paddle.nn.Layer): + + def __init__(self): + super(Net, self).__init__() + self.fc1 = paddle.nn.Linear(4, 4) + self.fc2 = paddle.nn.Linear(4, 4) + self._bias = 0.4 + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def forward(self, x): + out = self.fc1(x) + out = self.fc2(out) + out = paddle.nn.functional.relu(out) + out = paddle.mean(out) + return out + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def infer(self, input): + out = self.fc2(input) + out = out + self._bias + out = paddle.mean(out) + return out + + +class TestMultiLoad(unittest.TestCase): + + def test_multi_load(self): + self.temp_dir = tempfile.TemporaryDirectory() + + x = paddle.full([2, 4], 2) + model = Net() + program_translator = ProgramTranslator() + program_translator.enable(False) + forward_out1 = model.forward(x) + infer_out1 = model.infer(x) + program_translator.enable(True) + + model_path = os.path.join(self.temp_dir.name, 'multi_program') + paddle.jit.save(model, model_path, combine_params=True) + place = paddle.CPUPlace() + if paddle.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + jit_layer = Layer() + jit_layer.load(model_path, place) + forward_out2 = jit_layer.forward(x) + infer_out2 = jit_layer.infer(x) + self.assertEqual(np.allclose(forward_out1, forward_out2[0]), True) + self.assertEqual(np.allclose(infer_out1, infer_out2[0]), True) + + self.temp_dir.cleanup() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index bf5ccf1a854ff..eab86141ba6b1 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -96,7 +96,7 @@ def __init__(self, in_size, out_size): def forward(self, x, label): out = self._linear(x) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return out, avg_loss @@ -113,7 +113,7 @@ def __init__(self, in_size, out_size): def forward(self, x, label): out = self._linear(x) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return out @@ -142,7 +142,7 @@ def __init__(self, in_size, out_size): def forward(self, x): y = self._linear(x) z = self._linear(y) - loss = fluid.layers.mean(z) + loss = paddle.mean(z) return z, loss @@ -160,7 +160,7 @@ def __init__(self, in_size, out_size): def forward(self, x, y): x_out = self._linear1(x) y_out = self._linear2(y) - loss = fluid.layers.mean(x_out + y_out) + loss = paddle.mean(x_out + y_out) return x_out, y_out, loss @@ -176,7 +176,7 @@ def __init__(self, in_size, out_size): def forward(self, x, y): x_out = self._linear1(x) y_out = self._linear2(y) - loss = fluid.layers.mean(x_out + y_out) + loss = paddle.mean(x_out + y_out) return x_out, y_out, loss @@ -208,7 +208,7 @@ def __init__(self, in_size, out_size): def forward(self, x): y = self._linear_1(x) z = self._linear_2(y) - loss = fluid.layers.mean(z) + loss = paddle.mean(z) return y, loss @@ -224,7 +224,7 @@ def forward(self, x): y = self._linear_1(x) z = self._linear_2(y) out = y + 
z - loss = fluid.layers.mean(out) + loss = paddle.mean(out) return y, [(z, loss), out] @@ -316,7 +316,7 @@ def train(layer, input_size=784, label_size=1): cost = layer(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() sgd.minimize(avg_loss) @@ -1153,6 +1153,65 @@ def forward(self, x): return self._linear_2(y) +class Net(paddle.nn.Layer): + + def __init__(self): + super(Net, self).__init__() + self.fc1 = paddle.nn.Linear(4, 4) + self.fc2 = paddle.nn.Linear(4, 4) + self.bias = 0.4 + self.flag = paddle.ones([2], dtype="int32") + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def log_softmax(self, input): + return paddle.nn.functional.log_softmax(input, axis=-1) + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def forward(self, x): + out = self.fc1(x) + out = paddle.nn.functional.relu(out) + out = paddle.mean(out) + return out + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def infer(self, input): + out = self.fc2(input) + out = out + self.bias + out = paddle.mean(out) + return out + + # For extra Python float + @paddle.jit.to_static(property=True) + def fbias(self): + return self.bias + 1 + + # For extra Tensor + @paddle.jit.to_static(property=True) + def fflag(self): + return self.flag + + +class TestJitSaveCombine(unittest.TestCase): + + def setUp(self): + # enable dygraph mode + paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + + def test_save_load_finetune_load(self): + model_path = os.path.join(self.temp_dir.name, + "test_jit_save_combine/model") + + # Use new namespace + with unique_name.guard(): + net = Net() + #save + paddle.jit.save(net, model_path, combine_params=True) + + class LayerLoadFinetune(paddle.nn.Layer): def __init__(self, in_size, out_size, load_path): diff --git a/python/paddle/fluid/tests/unittests/test_lambv2_op.py b/python/paddle/fluid/tests/unittests/test_lambv2_op.py index cde23216c1093..6ae2dbfb590bd 100644 --- a/python/paddle/fluid/tests/unittests/test_lambv2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lambv2_op.py @@ -125,7 +125,7 @@ def _build_static_model(main, startup, seed=100): y = fluid.layers.data(name='Y', shape=[1], dtype='float32') prediction = fluid.layers.fc(input=x, size=1, act=None) loss = fluid.layers.square_error_cost(input=prediction, label=y) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss place = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index aead014e7abb1..551ba3ffb542b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2970,7 +2970,7 @@ def make_fit_a_line(self): y_predict = layers.fc(input=x, size=1, act=None) y = self._get_data(name='y', shape=[1], dtype='float32') cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) return (avg_cost) def make_recognize_digits_mlp(self): @@ -2986,7 +2986,7 @@ def make_recognize_digits_mlp(self): act='softmax', param_attr=["sftmax.w1", "sftmax.w2"]) cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) return (avg_cost) def make_conv2d_transpose(self): @@ -3019,7 +3019,7 @@ def make_recognize_digits_conv(self): predict 
= layers.fc(input=conv_pool_2, size=10, act="softmax") cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost def make_word_embedding(self): @@ -3062,7 +3062,7 @@ def make_word_embedding(self): size=dict_size, act='softmax') cost = layers.cross_entropy(input=predict_word, label=next_word) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) return (avg_cost) def make_sigmoid_cross_entropy(self): @@ -3235,7 +3235,7 @@ def make_nce(self): num_total_classes=dict_size, param_attr='nce.w', bias_attr='nce.b') - avg_loss = layers.mean(loss) + avg_loss = paddle.mean(loss) return (avg_loss) def make_multiplex(self): diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index 6e25e3719d3cd..5cb53437fe9cd 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -146,7 +146,7 @@ def test_argmax_op_transposer_keep_dims(self): out = paddle.argmax(conv_out, axis=1, keepdim=True) self.assertEqual(conv_out.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [1, 14, 1, 8]) + self.assertEqual(out.shape, [1, 14, 12, 1]) def test_argmax_op_transposer(self): if not self.use_autoune(): @@ -161,6 +161,35 @@ def test_argmax_op_transposer(self): self.assertEqual(conv_out.shape, [1, 14, 12, 8]) self.assertEqual(out.shape, [1]) + def test_concat_op_transposer(self): + if not self.use_autoune(): + return + in1 = paddle.rand([1, 8, 14, 12]) + conv = paddle.nn.Conv2D(3, 8, (3, 3)) + data = paddle.rand([1, 3, 16, 14]) + with paddle.amp.auto_cast(level="O2"): + conv_out = conv(data) + # conv_out.shape = [1, 14, 12, 8] with NHWC + out = paddle.concat(x=[conv_out, in1], axis=0) + + self.assertEqual(conv_out.shape, [1, 14, 12, 8]) + self.assertEqual(out.shape, [2, 8, 14, 12]) + + def test_concat_op_no_transposer(self): + if not self.use_autoune(): + return + conv = paddle.nn.Conv2D(3, 8, (3, 3)) + data1 = paddle.rand([1, 3, 16, 14]) + data2 = paddle.rand([1, 3, 16, 14]) + with paddle.amp.auto_cast(level="O2"): + conv_out1 = conv(data1) + conv_out2 = conv(data2) + # conv_out.shape = [1, 14, 12, 8] with NHWC + out = paddle.concat(x=[conv_out1, conv_out2], axis=0) + + self.assertEqual(conv_out1.shape, [1, 14, 12, 8]) + self.assertEqual(out.shape, [2, 14, 12, 8]) + class TestAutoTuneAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py index 07729ae4e79cf..b283c80adfd9a 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py @@ -30,6 +30,7 @@ def setUp(self): self.devices.append("gpu:0") self.generate_input() self.generate_output() + np.random.seed(2022) def init_config(self): self.dtype = 'float64' @@ -175,6 +176,16 @@ def init_config(self): self._input_shape_2 = (5, 8) +class LinalgLstsqTestCase3(LinalgLstsqTestCase): + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.driver = "gels" + self._input_shape_1 = (10, 7, 3) + self._input_shape_2 = (10, 7, 6) + + class LinalgLstsqTestCaseRcond(LinalgLstsqTestCase): def init_config(self): @@ -192,7 +203,17 @@ def init_config(self): self.rcond = None self.driver = "gels" self._input_shape_1 = (10, 5) - self._input_shape_2 = (10, 2) + self._input_shape_2 = (10, 8) + + +class 
LinalgLstsqTestCaseGelsFloat64(LinalgLstsqTestCase): + + def init_config(self): + self.dtype = 'float32' + self.rcond = None + self.driver = "gels" + self._input_shape_1 = (3, 2, 8) + self._input_shape_2 = (3, 2, 15) class LinalgLstsqTestCaseGelssFloat64(LinalgLstsqTestCase): @@ -230,9 +251,9 @@ class LinalgLstsqTestCaseBatch2(LinalgLstsqTestCase): def init_config(self): self.dtype = 'float64' self.rcond = 1e-15 - self.driver = "gelss" + self.driver = "gels" self._input_shape_1 = (10, 8, 6) - self._input_shape_2 = (10, 8, 2) + self._input_shape_2 = (10, 8, 10) class LinalgLstsqTestCaseLarge1(LinalgLstsqTestCase): diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 0d328034ab7ea..15e3d806bb5f0 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -39,7 +39,7 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id): # loss function cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) @@ -73,7 +73,7 @@ def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers, # loss function cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index 32029e561d0ba..0005ccb4ab6a6 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -48,7 +48,7 @@ def static_train_net(img, label): prediction = convolutional_neural_network(img) loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py index 2911e7a6b71af..9843410bf7679 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid.core as core import numpy import paddle.fluid.layers as layers @@ -191,7 +192,7 @@ def test_grad(self): array = lod_tensor_to_array(x, table) result = array_to_lod_tensor(array, table) - mean = layers.mean(result) + mean = paddle.mean(result) append_backward(mean) diff --git a/python/paddle/fluid/tests/unittests/test_lookahead.py b/python/paddle/fluid/tests/unittests/test_lookahead.py index efbc28cfa6cea..cc6977bb28420 100644 --- a/python/paddle/fluid/tests/unittests/test_lookahead.py +++ b/python/paddle/fluid/tests/unittests/test_lookahead.py @@ -42,7 +42,7 @@ def test_lookahead_static(self): with fluid.unique_name.guard(): data = fluid.data(name='X', shape=[None, 1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer = paddle.optimizer.SGD(learning_rate=SGD_LR) lookahead = 
paddle.incubate.optimizer.LookAhead( diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index eed0530e76113..5da93ebb798ee 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -222,7 +222,7 @@ def get_w_grad(self, is_sparse): y = fluid.layers.reduce_sum(emb, dim=-1) loss = fluid.layers.square_error_cost(input=y, label=y_) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-4) sgd_optimizer.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py index 1757adef8e36f..97773c70e177a 100644 --- a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py +++ b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py @@ -190,6 +190,9 @@ def config(self): class TestLU_UnpackAPI(unittest.TestCase): + def setUp(self): + np.random.seed(2022) + def test_dygraph(self): def run_lu_unpack_dygraph(shape, dtype): diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index af15f271b4a70..6b7a47febb835 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -42,7 +42,7 @@ class TestMeanOp(OpTest): def setUp(self): self.op_type = "mean" - self.python_api = fluid.layers.mean + self.python_api = paddle.mean self.dtype = np.float64 self.init_dtype_type() self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)} @@ -64,12 +64,12 @@ def test_errors(self): with program_guard(Program(), Program()): # The input type of mean_op must be Variable. input1 = 12 - self.assertRaises(TypeError, fluid.layers.mean, input1) + self.assertRaises(TypeError, paddle.mean, input1) # The input dtype of mean_op must be float16, float32, float64. 
input2 = fluid.layers.data(name='input2', shape=[12, 10], dtype="int32") - self.assertRaises(TypeError, fluid.layers.mean, input2) + self.assertRaises(TypeError, paddle.mean, input2) input3 = fluid.layers.data(name='input3', shape=[4], dtype="float16") @@ -96,7 +96,7 @@ def test_checkout_grad(self): x_np = np.random.random((10, 10)).astype(self.dtype) x = paddle.to_tensor(x_np) x.stop_gradient = False - y = fluid.layers.mean(x) + y = paddle.mean(x) dx = paddle.grad(y, x)[0].numpy() dx_expected = self.dtype(1.0 / np.prod(x_np.shape)) * np.ones( x_np.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py index adc3cd0a8442e..bdce84cfcb658 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_usage.py +++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py @@ -29,7 +29,7 @@ def train_simulator(test_batch_size=10): y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py index 650b6a9a247d5..4b2a849b8b035 100644 --- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py +++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py @@ -33,7 +33,7 @@ def loss_net(hidden, label): prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_modelaverage.py b/python/paddle/fluid/tests/unittests/test_modelaverage.py index 7bb1e7d2e7a27..73a8bf0247c55 100644 --- a/python/paddle/fluid/tests/unittests/test_modelaverage.py +++ b/python/paddle/fluid/tests/unittests/test_modelaverage.py @@ -38,7 +38,7 @@ def test_model_average_static(self): with fluid.unique_name.guard(): data = fluid.data(name='X', shape=[None, 1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) test_program = train_program.clone() optimizer = paddle.optimizer.Momentum(learning_rate=0.2, momentum=0.1) diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 0b6bd99e6592f..949bb2fb3250d 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -517,7 +517,7 @@ def test_momentum(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) @@ -658,7 +658,7 @@ def test_momentum_static(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( learning_rate=0.1, 
momentum=0.9) @@ -987,7 +987,7 @@ def _momentum_optimize_static(self, name='X', dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer.minimize(loss) exe.run(startup_program) if use_amp: diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index bbeec5ce62111..7418714b23b8d 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -16,7 +16,7 @@ import numpy as np import unittest - +import paddle import paddle.fluid as fluid import paddle.fluid.initializer as initializer from paddle.fluid import Program, program_guard @@ -192,7 +192,7 @@ def train_network(self, num_total_classes, num_neg_samples, sampler, seed=1, num_neg_samples=num_neg_samples, is_sparse=is_sparse) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer optimizer = self.get_optimizer() optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py index 7f230164d6027..64f9f14f94ed0 100644 --- a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py +++ b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py @@ -39,7 +39,7 @@ def run_net_on_place(self, place): y = fluid.layers.data(name='y', shape=[1], dtype=self.dtype) y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py index 170b916941d36..c43fcc51a816f 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid.core as core from op_test import OpTest from scipy.special import expit, erf diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py index 30cfa9f17ebcc..65d09ebff51d1 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py @@ -122,7 +122,7 @@ def cond_false(): cond_i = fluid.layers.assign(np.array([cond_i], dtype='float32')) sum_cond = fluid.layers.cond(cond_i > 1.0, cond_true, cond_false) sum_all = fluid.layers.sum([sum_xy, sub_yz, sum_cond]) - mean_out = fluid.layers.mean(sum_all) + mean_out = paddle.mean(sum_all) if use_bf16: import paddle.static.amp as amp self.optimizer = amp.bf16.decorate_bf16( diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py index 40afe9248bf9b..4331ea8ff3136 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -72,7 +72,7 @@ def double_fc_net(image): def fn_1(opt, avg_loss=None, pred=None, label=None): if avg_loss is None: loss = layers.cross_entropy(input=pred, label=label) - avg_loss = layers.mean(loss, name='mean_cross_entropy_loss') + avg_loss = paddle.mean(loss, name='mean_cross_entropy_loss') 
opt.minimize(avg_loss) return avg_loss @@ -80,7 +80,7 @@ def fn_2(opt, avg_loss=None, pred=None, label=None): if avg_loss is None: loss = layers.softmax_with_cross_entropy(logits=pred, label=label) - avg_loss = layers.mean(loss, name='mean_softmax_loss') + avg_loss = paddle.mean(loss, name='mean_softmax_loss') opt.minimize(avg_loss) return avg_loss @@ -101,10 +101,10 @@ def fn_2(opt, avg_loss=None, pred=None, label=None): lambda: fn_2(sgd, None, prediction, label)) else: loss_1 = layers.cross_entropy(input=prediction, label=label) - avg_loss_1 = layers.mean(loss_1) + avg_loss_1 = paddle.mean(loss_1) loss_2 = layers.softmax_with_cross_entropy(logits=prediction, label=label) - avg_loss_2 = layers.mean(loss_2) + avg_loss_2 = paddle.mean(loss_2) avg_loss = layers.case([(mod_two, lambda: fn_1(adam, avg_loss_1))], lambda: fn_2(sgd, avg_loss_2)) @@ -174,13 +174,13 @@ def dynamic(train_data, use_cuda=False, use_parallel_exe=False): if epoch % 2 == 0: cross_entropy_loss = layers.cross_entropy(prediction, var_label) - loss = layers.mean(cross_entropy_loss) + loss = paddle.mean(cross_entropy_loss) loss.backward() adam.minimize(loss) else: softmax_loss = layers.softmax_with_cross_entropy( prediction, var_label) - loss = layers.mean(softmax_loss) + loss = paddle.mean(softmax_loss) loss.backward() sgd.minimize(loss) @@ -247,7 +247,7 @@ def fn_2(opt, avg_loss): x = fluid.layers.data("X", [10], 'float32') hidden = layers.fc(x, 5) - avg_loss = layers.mean(hidden) + avg_loss = paddle.mean(hidden) adam = optimizer.Adam(learning_rate=LR) sgd = optimizer.SGD(learning_rate=LR) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index c81a38019956f..83017f49e505c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -160,7 +160,7 @@ def check_network_convergence(self, input=feature_out, label=target, param_attr=fluid.ParamAttr(name='crfw', learning_rate=1e-1)) - avg_cost = fluid.layers.mean(crf_cost) + avg_cost = paddle.mean(crf_cost) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py index 7618371036b12..2716a38d89399 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import numpy import os @@ -33,7 +34,7 @@ def check_drop_scope(self, use_cuda=True): with fluid.program_guard(train_program, startup_program): data = fluid.layers.data(name='X', shape=[1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) test_program = fluid.default_main_program().clone(for_test=True) fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index aefa635508db0..9cac242a7ba7b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -12,6 +12,7 @@ # See the License for the specific language 
governing permissions and # limitations under the License. +import paddle import paddle.fluid as fluid from paddle.fluid import compiler import unittest @@ -74,7 +75,7 @@ def network_func(): hidden = fluid.layers.fc(input=img, size=200, act='tanh') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) fluid.optimizer.Adam().minimize(avg_loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 0c3c293f7b9c3..7321327372d87 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -15,6 +15,7 @@ from __future__ import print_function import math +import paddle import paddle.fluid as fluid from paddle.fluid import compiler import paddle.fluid.core as core @@ -59,7 +60,7 @@ def parallel_exe(self, label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = Lenet(data, class_dim=102) loss = fluid.layers.cross_entropy(input=out, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) opt = fluid.optimizer.Momentum( learning_rate=0.1, momentum=0.9, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 2e2791351bfec..f2a753a9874ed 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -38,7 +38,7 @@ def simple_fc_net(use_feed): value=1.0))) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -61,7 +61,7 @@ def fc_with_batchnorm(use_feed): prediction = fluid.layers.fc(hidden, size=10, act='softmax') with fluid.name_scope("loss"): loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 4f5cfba0c1ab3..e6b334600bc82 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -59,7 +59,7 @@ def build_program(self, compile_program=True): predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') label = fluid.layers.data(name='y', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) batch_size = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=predict, label=label, diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index e5463b1a90d59..6481e0f825df1 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -16,6 +16,7 @@ import paddle import paddle.profiler as profiler +import paddle.profiler.profiler_statistic as profiler_statistic class HostPythonNode: @@ -30,6 +31,7 @@ def __init__(self, name, type, start_ns, end_ns, process_id, thread_id): self.children_node 
= [] self.runtime_node = [] self.device_node = [] + self.mem_node = [] class DevicePythonNode: @@ -45,6 +47,22 @@ def __init__(self, name, type, start_ns, end_ns, device_id, context_id, self.stream_id = stream_id +class MemPythonNode: + def __init__(self, timestamp_ns, addr, type, process_id, thread_id, increase_bytes, place, current_allocated, \ + current_reserved, peak_allocated, peak_reserved): + self.timestamp_ns = timestamp_ns + self.addr = addr + self.type = type + self.process_id = process_id + self.thread_id = thread_id + self.increase_bytes = increase_bytes + self.place = place + self.current_allocated = current_allocated + self.current_reserved = current_reserved + self.peak_allocated = peak_allocated + self.peak_reserved = peak_reserved + + class TestProfilerStatistic(unittest.TestCase): def test_statistic_case1(self): @@ -89,6 +107,9 @@ def test_statistic_case1(self): conv2d_compute = HostPythonNode('conv2d::compute', profiler.TracerEventType.OperatorInner, 30, 40, 1000, 1001) + conv2d_compute.mem_node.append( + MemPythonNode(33, 0, profiler_statistic.TracerMemEventType.Allocate, + 1000, 1001, 20, 'place(gpu:0)', 200, 200, 800, 800)) conv2d_launchkernel = HostPythonNode( 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35, 1000, 1001) @@ -211,6 +232,24 @@ def test_statistic_case1(self): self.assertEqual( event_summary.memory_manipulation_items['AsyncMemcpy']. general_gpu_time, 60) + self.assertEqual( + statistic_data.memory_summary.allocated_items['place(gpu:0)'] + ['conv2d'].allocation_count, 1) + self.assertEqual( + statistic_data.memory_summary.allocated_items['place(gpu:0)'] + ['conv2d'].allocation_size, 20) + self.assertEqual( + statistic_data.memory_summary.allocated_items['place(gpu:0)'] + ['conv2d'].increase_size, 20) + self.assertEqual( + statistic_data.memory_summary.allocated_items['place(gpu:0)'] + ['conv2d'].increase_size, 20) + self.assertEqual( + statistic_data.memory_summary. 
+ peak_allocation_values['place(gpu:0)'], 800) + self.assertEqual( + statistic_data.memory_summary.peak_reserved_values['place(gpu:0)'], + 800) print( profiler.profiler_statistic._build_table( statistic_data, diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py index b768aa7305158..cd00af1ed96da 100644 --- a/python/paddle/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -16,6 +16,7 @@ import unittest from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name +import paddle import paddle.fluid.layers as layers import paddle.fluid as fluid @@ -120,7 +121,7 @@ def net(): use_double_buffer=True) in_data, label = fluid.layers.read_file(reader) predict_label = fluid.layers.fc(in_data, size=2, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() @@ -146,7 +147,7 @@ def test_program_all_parameters(self): program = fluid.default_main_program() data = fluid.data(name='x', shape=[None, 13], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) # NOTE: here the parameters are fc_0.w_0 and fc_0.b_0 @@ -182,7 +183,7 @@ def net(): use_double_buffer=True) in_data, label = fluid.layers.read_file(reader) predict_label = fluid.layers.fc(in_data, size=2, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index c602cfb4ad0b3..0f4543bc934a4 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -52,7 +52,7 @@ def lstm_net(use_feed): fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost @@ -70,7 +70,7 @@ def simple_fc_net_with_accuracy(use_feed): value=1.0))) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) accuracy_out = fluid.layers.accuracy(input=prediction, label=label, k=5) return loss @@ -83,12 +83,12 @@ def cond_net(use_feed=None): def loss1(pred, label): x = fluid.layers.data(name="x", shape=[4], dtype='float32') loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss, name='mean_cross_entropy_loss') + avg_loss = paddle.mean(loss, name='mean_cross_entropy_loss') return avg_loss def loss2(pred, label): loss = fluid.layers.softmax_with_cross_entropy(logits=pred, label=label) - avg_loss = fluid.layers.mean(loss, name='mean_softmax_loss') + avg_loss = paddle.mean(loss, name='mean_softmax_loss') return avg_loss two = fluid.layers.fill_constant([1], 'int32', 2) @@ -106,14 +106,14 @@ def optimization_in_cond_net(with_optimize=False): def loss1(opt, pred, label, with_optimize): x = fluid.layers.data(name="x", shape=[4], dtype='float32') loss = 
fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss, name='mean_cross_entropy_loss') + avg_loss = paddle.mean(loss, name='mean_cross_entropy_loss') if with_optimize: opt.minimize(avg_loss) return avg_loss def loss2(opt, pred, label, with_optimize): loss = fluid.layers.softmax_with_cross_entropy(logits=pred, label=label) - avg_loss = fluid.layers.mean(loss, name='mean_softmax_loss') + avg_loss = paddle.mean(loss, name='mean_softmax_loss') if with_optimize: opt.minimize(avg_loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py index 71b07155f4015..c9a7317bfff3b 100644 --- a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py @@ -18,6 +18,7 @@ import unittest from py_precise_roi_pool import PyPrRoIPool from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import compiler, Program, program_guard @@ -103,7 +104,7 @@ def run_net(self, place): dtype="float32", lod_level=1) output = fluid.layers.prroi_pool(x, rois, 0.25, 2, 2) - loss = fluid.layers.mean(output) + loss = paddle.mean(output) optimizer = fluid.optimizer.SGD(learning_rate=1e-3) optimizer.minimize(loss) input_x = fluid.create_lod_tensor(self.x, [], place) @@ -234,7 +235,7 @@ def run_net(self, place): 2, 2, batch_roi_nums=rois_index) - loss = fluid.layers.mean(output) + loss = paddle.mean(output) optimizer = fluid.optimizer.SGD(learning_rate=1e-3) optimizer.minimize(loss) exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_prune.py b/python/paddle/fluid/tests/unittests/test_prune.py index c320e3fbf58b2..730a6c1b8a8ff 100644 --- a/python/paddle/fluid/tests/unittests/test_prune.py +++ b/python/paddle/fluid/tests/unittests/test_prune.py @@ -16,6 +16,7 @@ import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework import paddle.compat as cpt @@ -31,7 +32,7 @@ def net(self): label = fluid.layers.data(name="label", shape=[1], dtype="int64") y = fluid.layers.fc(input=[x], size=2, act="softmax") loss = fluid.layers.cross_entropy(input=y, label=label) - loss = fluid.layers.mean(x=loss) + loss = paddle.mean(x=loss) return x, y, label, loss def test_prune_with_input(self): @@ -41,14 +42,14 @@ def test_prune_with_input(self): with fluid.program_guard(program, startup_program): (x, y, label, loss) = self.net() self.assertEqual(len(block.ops), 5) - self.assertEqual( - [op.type for op in block.ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in block.ops], [ + "mul", "elementwise_add", "softmax", "cross_entropy2", "reduce_mean" + ]) pruned_program = program._prune_with_input( feeded_var_names=[y.name, label.name], targets=[loss]) self.assertEqual(len(pruned_program.global_block().ops), 2) self.assertEqual([op.type for op in pruned_program.global_block().ops], - ["cross_entropy2", "mean"]) + ["cross_entropy2", "reduce_mean"]) def test_prune(self): program = framework.Program() @@ -57,14 +58,16 @@ def test_prune(self): with fluid.program_guard(program, startup_program): (x, y, label, loss) = self.net() self.assertEqual(len(block.ops), 5) - self.assertEqual( - [op.type for op in block.ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in block.ops], [ + "mul", "elementwise_add", "softmax", 
"cross_entropy2", "reduce_mean" + ]) pruned_program = program._prune(targets=[loss]) self.assertEqual(len(pruned_program.global_block().ops), 5) - self.assertEqual( - [op.type for op in pruned_program.global_block().ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in pruned_program.global_block().ops], + [ + "mul", "elementwise_add", "softmax", + "cross_entropy2", "reduce_mean" + ]) def test_prune_target_not_list(self): program = framework.Program() @@ -73,14 +76,16 @@ def test_prune_target_not_list(self): with fluid.program_guard(program, startup_program): (x, y, label, loss) = self.net() self.assertEqual(len(block.ops), 5) - self.assertEqual( - [op.type for op in block.ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in block.ops], [ + "mul", "elementwise_add", "softmax", "cross_entropy2", "reduce_mean" + ]) pruned_program = program._prune(targets=loss) self.assertEqual(len(pruned_program.global_block().ops), 5) - self.assertEqual( - [op.type for op in pruned_program.global_block().ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in pruned_program.global_block().ops], + [ + "mul", "elementwise_add", "softmax", + "cross_entropy2", "reduce_mean" + ]) def test_prune_target_none(self): program = framework.Program() @@ -89,9 +94,9 @@ def test_prune_target_none(self): with fluid.program_guard(program, startup_program): (x, y, label, loss) = self.net() self.assertEqual(len(block.ops), 5) - self.assertEqual( - [op.type for op in block.ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in block.ops], [ + "mul", "elementwise_add", "softmax", "cross_entropy2", "reduce_mean" + ]) try: pruned_program = program._prune(targets=None) except ValueError as e: @@ -128,9 +133,9 @@ def net1(self): act="softmax", param_attr=w_param_attrs) loss1 = fluid.layers.cross_entropy(input=y, label=label) - loss1 = fluid.layers.mean(x=loss1) + loss1 = paddle.mean(x=loss1) loss2 = fluid.layers.cross_entropy(input=y, label=label) - loss2 = fluid.layers.mean(x=loss2) + loss2 = paddle.mean(x=loss2) loss1.persistable = True loss2.persistable = True return x, y, label, loss1, loss2, w_param_attrs @@ -158,9 +163,9 @@ def net2(self): act="softmax", param_attr=w2_param_attrs) loss1 = fluid.layers.cross_entropy(input=y1, label=label) - loss1 = fluid.layers.mean(x=loss1) + loss1 = paddle.mean(x=loss1) loss2 = fluid.layers.cross_entropy(input=y2, label=label) - loss2 = fluid.layers.mean(x=loss2) + loss2 = paddle.mean(x=loss2) return x1, x2, y1, y2, label, loss1, loss2, w1_param_attrs, w2_param_attrs def test_not_prune(self): diff --git a/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py b/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py index b15edb44d57a8..d700966126a2e 100644 --- a/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py +++ b/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py @@ -43,7 +43,7 @@ def test_static_graph(self): size=[11], is_distributed=True, is_sparse=True) - cost = paddle.fluid.layers.mean(output) + cost = paddle.mean(output) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(cost, train_program) block = train_program.global_block() diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index f0f791d62a7a2..0eaf4b453bdaa 
100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -127,7 +127,7 @@ def simple_fc_net(img, label, use_py_func_op): assert loss == loss_out and dummy_var == dummy_var_out, \ "py_func failed with multi input and output" - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index 4be5a4ae94860..830ade004d3a6 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -119,7 +119,7 @@ def simple_fc_net(in_size, value=1.0))) predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 568d57c09355f..0acd0ac398e34 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import numpy as np @@ -134,7 +135,7 @@ def setUp(self): self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -262,7 +263,7 @@ def setUp(self): self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -364,7 +365,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -446,7 +447,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -553,7 +554,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): rnn_out = self.create_rnn_op() - self.output = layers.mean(rnn_out) + self.output = paddle.mean(rnn_out) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -637,7 +638,7 @@ def setUp(self): self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], diff --git 
a/python/paddle/fluid/tests/unittests/test_registry.py b/python/paddle/fluid/tests/unittests/test_registry.py index e9f847185fc76..8d803635aa35c 100644 --- a/python/paddle/fluid/tests/unittests/test_registry.py +++ b/python/paddle/fluid/tests/unittests/test_registry.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import numpy as np from decorator_helper import prog_scope @@ -25,7 +26,7 @@ class TestRegistry(unittest.TestCase): @prog_scope() def test_registry_layer(self): x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32') - output = fluid.layers.mean(x) + output = paddle.mean(x) place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 304e47da9a61a..4a48b6fb1f838 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -147,7 +147,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py index da2643cc64726..fc46c9c93c37e 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer_api.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py @@ -50,7 +50,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py index 829960250d05d..4aebad4e87cb6 100644 --- a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py +++ b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py @@ -108,6 +108,10 @@ def test_check_resnet50_accuracy(self): loss_c = self.train(place, loop_num, feed, use_cinn=True) loss_p = self.train(place, loop_num, feed, use_cinn=False) + print("Losses of CINN:") + print(loss_c) + print("Losses of Paddle") + print(loss_p) self.assertTrue(np.allclose(loss_c, loss_p, atol=1e-5)) diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 42f32f2e75bd8..eb192fcde6fac 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -246,7 +246,7 @@ def test_rmsprop(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) rms_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_select_input_output_op.py b/python/paddle/fluid/tests/unittests/test_select_input_output_op.py index 8a41e05d1d52a..c809c973438eb 100644 --- 
a/python/paddle/fluid/tests/unittests/test_select_input_output_op.py +++ b/python/paddle/fluid/tests/unittests/test_select_input_output_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers @@ -43,7 +44,7 @@ def test_forward_backward_list_output(self): select_output(x, outputs, mask) y = select_input(outputs, mask) - mean = layers.mean(y) + mean = paddle.mean(y) append_backward(mean) place = fluid.CUDAPlace( diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index 8e00d905a3520..8f4f5dad074f0 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -206,7 +206,7 @@ def runTest(self): out = fluid.layers.l2_normalize(x=emb, axis=-1) cost = fluid.layers.square_error_cost(input=out, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) @@ -368,7 +368,7 @@ def static_sgd_mp(self, mp): name='X', dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer.minimize(loss) exe.run(startup_program) @@ -470,7 +470,7 @@ def static_sgd_mp(self, mp): name='X', dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer.minimize(loss) exe.run(startup_program) diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py index daa3f191ccd72..c3cb57f9438f1 100644 --- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py +++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid.core as core from paddle.fluid.executor import Executor import paddle.fluid.layers as layers @@ -47,7 +48,7 @@ def setUp(self): i = layers.increment(x=i) i.stop_gradient = True self.mem3 = shrink_memory(x=self.mem2, i=i, table=table) - mem3_mean = layers.mean(self.mem3) + mem3_mean = paddle.mean(self.mem3) append_backward(loss=mem3_mean) self.x_grad = self.main_program.global_block().var('x@GRAD') diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index e1a9b2428babc..9501b2c89531f 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -53,6 +53,7 @@ def test_conv3d(self): groups=1, data_format="NDHWC") out.backward(out) + out = paddle.incubate.sparse.coalesce(out) assert np.array_equal(correct_out_values, out.values().numpy()) def test_subm_conv3d(self): diff --git a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py index 61932cf4a7b0a..12546ea463a84 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py @@ -125,16 +125,14 @@ def func_test_coo(self, op): def test_support_dtypes_csr(self): paddle.device.set_device('cpu') if paddle.device.get_device() == "cpu": - with _test_eager_guard(): - for op in op_list: - self.func_test_csr(op) + for op in op_list: + 
self.func_test_csr(op) def test_support_dtypes_coo(self): paddle.device.set_device('cpu') if paddle.device.get_device() == "cpu": - with _test_eager_guard(): - for op in op_list: - self.func_test_coo(op) + for op in op_list: + self.func_test_coo(op) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py new file mode 100644 index 0000000000000..0383247886ff2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import math +import re +import copy +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard + + +def get_cuda_version(): + result = os.popen("nvcc --version").read() + regex = r'release (\S+),' + match = re.search(regex, result) + if match: + num = str(match.group(1)) + integer, decimal = num.split('.') + return int(integer) * 1000 + int(float(decimal) * 10) + else: + return -1 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11070, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.3" +) +class TestSparseAttentionAPI1(unittest.TestCase): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 128 + self.head_dim = 16 + self.dtype = 'float64' + self.use_mask = True + + def test_dygraph(self): + with _test_eager_guard(): + self.shape = [ + self.batch_size, self.num_heads, self.seq_len, self.head_dim + ] + query = paddle.rand(self.shape, self.dtype) + key = paddle.rand(self.shape, self.dtype) + value = paddle.rand(self.shape, self.dtype) + + query.stop_gradient = False + key.stop_gradient = False + value.stop_gradient = False + + mask = paddle.nn.functional.dropout(paddle.ones( + [self.seq_len, self.seq_len]), + mode='downscale_in_infer') + mask = mask.expand( + [self.batch_size, self.num_heads, self.seq_len, self.seq_len]) + sp_mask = mask.reshape([-1, self.seq_len, + self.seq_len]).to_sparse_csr() + + query_sp = copy.deepcopy(query) + key_sp = copy.deepcopy(key) + value_sp = copy.deepcopy(value) + + query_sp.stop_gradient = False + key_sp.stop_gradient = False + value_sp.stop_gradient = False + + if self.use_mask: + kp_mask = paddle.randint( + 0, 2, [self.batch_size, self.seq_len]).astype(self.dtype) + attn_mask = paddle.randint( + 0, 2, [self.seq_len, self.seq_len]).astype(self.dtype) + + sdd = paddle.matmul(query, key, False, True) / math.sqrt( + float(self.head_dim)) + sdd = sdd + ( + (mask * kp_mask.unsqueeze([1, 2]) * attn_mask) - 1.0) * 1e9 + softmax = paddle.nn.functional.softmax(sdd) + output = paddle.matmul(softmax, value) + output.backward() + + output_sp = paddle.incubate.sparse.nn.functional.attention( + query_sp, key_sp, value_sp, sp_mask, kp_mask, attn_mask) + output_sp.backward() + else: + sdd = 
paddle.matmul(query, key, False, True) / math.sqrt( + float(self.head_dim)) + sdd = sdd + (mask - 1.0) * 1e9 + softmax = paddle.nn.functional.softmax(sdd) + output = paddle.matmul(softmax, value) + output.backward() + + output_sp = paddle.incubate.sparse.nn.functional.attention( + query_sp, key_sp, value_sp, sp_mask) + output_sp.backward() + + self.assertTrue(np.allclose(output_sp.numpy(), output.numpy())) + self.assertTrue( + np.allclose(query_sp.grad.numpy(), query.grad.numpy())) + self.assertTrue(np.allclose(key_sp.grad.numpy(), key.grad.numpy())) + self.assertTrue( + np.allclose(value_sp.grad.numpy(), value.grad.numpy())) + + +class TestSparseAttentionAPI2(TestSparseAttentionAPI1): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 128 + self.head_dim = 32 + self.dtype = 'float64' + self.use_mask = False + + +class TestSparseAttentionAPI3(TestSparseAttentionAPI1): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 512 + self.head_dim = 16 + self.dtype = 'float64' + self.use_mask = True + + +class TestSparseAttentionAPI4(TestSparseAttentionAPI1): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 512 + self.head_dim = 32 + self.dtype = 'float64' + self.use_mask = False + + +class TestSparseAttentionAPI5(TestSparseAttentionAPI1): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 512 + self.head_dim = 64 + self.dtype = 'float64' + self.use_mask = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py b/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py index 96adf959b2b6e..8986d4a7ef5d2 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py @@ -13,8 +13,6 @@ # limitations under the License. 
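The test_sparse_matmul_op.py diff below folds the old TestCsrDenseMatmul*/TestCsrMaskedMatmul* classes into TestMatmul and TestMaskedMatmul, driving both COO and CSR inputs through one check_result helper. For orientation only, a minimal sketch of the call pattern under test (not part of this patch; it assumes a CUDA 11.0+ build, the same requirement the skipIf decorators below impose):

    import numpy as np
    import paddle

    x = paddle.rand([16, 12])                # dense operand to be sparsified
    y = paddle.rand([12, 10])
    sp_x = x.to_sparse_csr()                 # the COO path uses x.to_sparse_coo(2) instead
    sp_out = paddle.incubate.sparse.matmul(sp_x, y)  # sparse x dense -> dense
    assert np.allclose(sp_out.numpy(), paddle.matmul(x, y).numpy())
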
import paddle -from paddle.fluid.framework import _test_eager_guard - import numpy as np import scipy import scipy.sparse as sp @@ -22,7 +20,7 @@ import os import re -np.random.seed(2022) +paddle.set_default_dtype('float64') def get_cuda_version(): @@ -37,153 +35,115 @@ def get_cuda_version(): return -1 -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, - "paddle is not compiled with CUDA and cuda version need to >= 11.0") -class TestCsrDenseMatmul2D(unittest.TestCase): - # x: csr, y: dense, out: dense - def test_matmul(self): - with _test_eager_guard(): - mask = np.random.rand(10, 12) < 0.2 - np_x = np.random.rand(10, 12) * mask - - np_csr = sp.csr_matrix(np_x) - np_dense = np.random.rand(12, 6) - np_out = np_csr @ np_dense - - np_out_grad = np.ones([10, 6]) - - # dx(csr) = dout(dense) * y'(dense) * mask - np_csr_grad = sp.csr_matrix( - np.matmul(np_out_grad, np_dense.transpose(1, 0)) * mask) - # dy(dense) = x'(csr) * dout(dense) - np_dense_grad = np_csr.transpose() @ np_out_grad - - csr = paddle.to_tensor(np_x, stop_gradient=False).to_sparse_csr() - dense = paddle.to_tensor(np_dense, stop_gradient=False) - out = paddle.incubate.sparse.matmul(csr, dense) - - self.assertTrue(np.allclose(np_out, out.numpy())) - - if get_cuda_version() >= 11030: - out.backward() - self.assertTrue( - np.allclose(np_csr_grad.indptr, - csr.grad.crows().numpy())) - self.assertTrue( - np.allclose(np_csr_grad.indices, - csr.grad.cols().numpy())) - self.assertTrue( - np.allclose(np_csr_grad.data, - csr.grad.values().numpy())) - - self.assertTrue(np.allclose(np_dense_grad, dense.grad.numpy())) - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11030, - "paddle is not compiled with CUDA and cuda version need to >= 11.3") -class TestCsrMaskedMatmul2D(unittest.TestCase): - # x: dense, y: dense, out: csr - def test_matmul(self): - with _test_eager_guard(): - np_mask = np.random.rand(10, 6) < 0.2 - - np_x = np.random.rand(10, 12) - np_y = np.random.rand(12, 6) - np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask) - - np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask) - # dx(dense) = dout(csr) * y'(dense) - np_x_grad = np_out_grad @ np_y.transpose(1, 0) - # dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense) - np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0) - - x = paddle.to_tensor(np_x, stop_gradient=False) - y = paddle.to_tensor(np_y, stop_gradient=False) - mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() - out = paddle.incubate.sparse.masked_matmul(x, y, mask) - - self.assertTrue(np.allclose(np_out.indptr, out.crows().numpy())) - self.assertTrue(np.allclose(np_out.indices, out.cols().numpy())) - self.assertTrue(np.allclose(np_out.data, out.values().numpy())) - - out.backward() - self.assertTrue(np.allclose(out.is_sparse_csr(), True)) - self.assertTrue(np.allclose(np_x_grad, x.grad.numpy())) - self.assertTrue(np.allclose(np_y_grad, y.grad.numpy())) - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070, - "paddle is not compiled with CUDA and cuda version need to >= 11.7") -class TestCsrDenseMatmul3D(unittest.TestCase): - # x: csr, y: dense, out: dense - def test_matmul(self): - with _test_eager_guard(): - paddle.set_default_dtype('float32') - origin_x = paddle.rand([16, 16, 12]) - mask = paddle.randint(0, 2, [16, 12]) - origin_x = origin_x * mask - origin_y = paddle.rand([16, 12, 10]) - - dense_x = origin_x.detach() - dense_x.stop_gradient = False - dense_y 
= origin_y.detach() - dense_y.stop_gradient = False - dense_out = paddle.matmul(dense_x, dense_y) - dense_out.backward() - +class TestMatmul(unittest.TestCase): + # x: sparse, y: dense, out: dense + def check_result(self, x_shape, y_shape, format): + if len(x_shape) == 3: + mask = paddle.randint(0, 2, [x_shape[-2], x_shape[-1]]) + else: + mask = paddle.randint(0, 2, x_shape) + origin_x = paddle.rand(x_shape) * mask + origin_y = paddle.rand(y_shape) + + dense_x = origin_x.detach() + dense_x.stop_gradient = False + dense_y = origin_y.detach() + dense_y.stop_gradient = False + dense_out = paddle.matmul(dense_x, dense_y) + + if format == "coo": + sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) + else: sp_x = origin_x.detach().to_sparse_csr() - sp_x.stop_gradient = False - sp_y = origin_y.detach() - sp_y.stop_gradient = False - sp_out = paddle.incubate.sparse.matmul(sp_x, sp_y) - sp_out.backward() + sp_x.stop_gradient = False + sp_y = origin_y.detach() + sp_y.stop_gradient = False + sp_out = paddle.incubate.sparse.matmul(sp_x, sp_y) - self.assertTrue(np.allclose(sp_out.numpy(), dense_out.numpy())) + self.assertTrue(np.allclose(sp_out.numpy(), dense_out.numpy())) + if get_cuda_version() >= 11030: + dense_out.backward() + sp_out.backward() self.assertTrue( np.allclose(sp_x.grad.to_dense().numpy(), (dense_x.grad * mask).numpy())) self.assertTrue(np.allclose(sp_y.grad.numpy(), dense_y.grad.numpy())) - -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070, - "paddle is not compiled with CUDA and cuda version need to >= 11.7") -class TestCsrMaskedMatmul3D(unittest.TestCase): - # x: dense, y: dense, out: csr - def test_matmul(self): - with _test_eager_guard(): - paddle.set_default_dtype('float64') - origin_x = paddle.rand([16, 16, 12]) - origin_y = paddle.rand([16, 12, 10]) - - mask = paddle.randint(0, 2, [16, 10]) - - dense_x = origin_x.detach() - dense_x.stop_gradient = False - dense_y = origin_y.detach() - dense_y.stop_gradient = False - dense_out = paddle.matmul(dense_x, dense_y) - dense_out = dense_out * mask - dense_out.backward() - - sp_x = origin_x.detach() - sp_x.stop_gradient = False - sp_y = origin_y.detach() - sp_y.stop_gradient = False - sp_out = paddle.incubate.sparse.masked_matmul( - sp_x, sp_y, dense_out.to_sparse_csr()) - sp_out.backward() - - self.assertTrue( - np.allclose(sp_out.to_dense().numpy(), dense_out.numpy())) - self.assertTrue(np.allclose(sp_x.grad.numpy(), - dense_x.grad.numpy())) - self.assertTrue(np.allclose(sp_y.grad.numpy(), - dense_y.grad.numpy())) + @unittest.skipIf(not paddle.is_compiled_with_cuda() + or get_cuda_version() < 11000, "only support cuda>=11.0") + def test_matmul_2d(self): + self.check_result([16, 12], [12, 10], 'coo') + self.check_result([16, 12], [12, 10], 'csr') + + @unittest.skipIf(not paddle.is_compiled_with_cuda() + or get_cuda_version() < 11070, "only support cuda>=11.7") + def test_matmul_3d(self): + self.check_result([8, 16, 12], [8, 12, 10], 'coo') + self.check_result([8, 16, 12], [8, 12, 10], 'csr') + + +class TestMaskedMatmul(unittest.TestCase): + # x: dense, y: dense, out: sparse_`csr + @unittest.skipIf(not paddle.is_compiled_with_cuda() + or get_cuda_version() < 11030, + "only support on cuda>=11.3") + def test_masked_matmul_2d(self): + np_mask = np.random.rand(10, 6) < 0.2 + + np_x = np.random.rand(10, 12) + np_y = np.random.rand(12, 6) + np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask) + + np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask) + # dx(dense) = dout(csr) * y'(dense) + np_x_grad 
= np_out_grad @ np_y.transpose(1, 0) + # dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense) + np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0) + + x = paddle.to_tensor(np_x, stop_gradient=False) + y = paddle.to_tensor(np_y, stop_gradient=False) + mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() + out = paddle.incubate.sparse.masked_matmul(x, y, mask) + + self.assertTrue(np.allclose(np_out.indptr, out.crows().numpy())) + self.assertTrue(np.allclose(np_out.indices, out.cols().numpy())) + self.assertTrue(np.allclose(np_out.data, out.values().numpy())) + + out.backward() + self.assertTrue(np.allclose(out.is_sparse_csr(), True)) + self.assertTrue(np.allclose(np_x_grad, x.grad.numpy())) + self.assertTrue(np.allclose(np_y_grad, y.grad.numpy())) + + @unittest.skipIf(not paddle.is_compiled_with_cuda() + or get_cuda_version() < 11070, + "only support on cuda>=11.7") + def test_masked_matmul_3d(self): + paddle.set_default_dtype('float32') + origin_x = paddle.rand([16, 16, 12]) + mask = paddle.randint(0, 2, [16, 12]) + origin_x = origin_x * mask + origin_y = paddle.rand([16, 12, 10]) + + dense_x = origin_x.detach() + dense_x.stop_gradient = False + dense_y = origin_y.detach() + dense_y.stop_gradient = False + dense_out = paddle.matmul(dense_x, dense_y) + dense_out.backward() + + sp_x = origin_x.detach().to_sparse_csr() + sp_x.stop_gradient = False + sp_y = origin_y.detach() + sp_y.stop_gradient = False + sp_out = paddle.incubate.sparse.matmul(sp_x, sp_y) + sp_out.backward() + + self.assertTrue(np.allclose(sp_out.numpy(), dense_out.numpy())) + self.assertTrue( + np.allclose(sp_x.grad.to_dense().numpy(), + (dense_x.grad * mask).numpy())) + self.assertTrue(np.allclose(sp_y.grad.numpy(), dense_y.grad.numpy())) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sparse_model.py b/python/paddle/fluid/tests/unittests/test_sparse_model.py index 90f30e383174c..c070614fc708b 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_model.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_model.py @@ -62,3 +62,7 @@ def test(self): sparse_loss.backward() assert np.allclose(x.grad.numpy(), sparse_x.grad.to_dense().numpy()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py index 2272022e8d6dc..36d64f5067263 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py @@ -12,137 +12,142 @@ # See the License for the specific language governing permissions and # limitations under the License. 
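The test_sparse_unary_op.py rewrite below replaces the per-op compare_with_dense boilerplate with a single check_result helper that runs each unary op in both COO and CSR formats and compares forward values and masked gradients against the dense op. For orientation only, a minimal sketch of that comparison (not part of this patch; the shape and the choice of sin are illustrative):

    import numpy as np
    import paddle

    mask = paddle.randint(0, 2, [8, 16]).astype('float32')
    dense_x = paddle.rand([8, 16]) * mask            # zeroed entries become the sparse "holes"
    sp_x = dense_x.to_sparse_coo(sparse_dim=2)       # or dense_x.to_sparse_csr()
    sp_out = paddle.incubate.sparse.sin(sp_x)
    assert np.allclose(sp_out.to_dense().numpy(), paddle.sin(dense_x).numpy())
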
-from __future__ import print_function import unittest -from typing import Union, Callable import numpy as np import paddle -import paddle.fluid as fluid -from paddle.fluid.framework import _test_eager_guard -from paddle import _C_ops +from paddle.fluid.framework import convert_np_dtype_to_dtype_ class TestSparseUnary(unittest.TestCase): - def assert_raises_on_dense_tensor(self, sparse_func): - with _test_eager_guard(): - dense_x = paddle.ones((2, 3)) - with self.assertRaises(NotImplementedError): - sparse_func(dense_x) - - def compare_with_dense( - self, - x, - to_sparse: Callable[[paddle.Tensor], paddle.Tensor], - dense_func: Callable[[paddle.Tensor], paddle.Tensor], - sparse_func: Callable[[paddle.Tensor], paddle.Tensor], - test_gradient: bool, - ): - - def tensor_allclose(dense_tensor: paddle.Tensor, - sparse_tensor: paddle.Tensor): - dense_numpy = dense_tensor.numpy() - mask = ~np.isnan(dense_numpy) - return np.allclose(dense_numpy[mask], - sparse_tensor.to_dense().numpy()[mask]) - - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - with _test_eager_guard(): - dense_x = paddle.to_tensor(x, - dtype="float32", - stop_gradient=not test_gradient) - - sparse_x = to_sparse(dense_x) - sparse_out = sparse_func(sparse_x) - - dense_x = paddle.to_tensor(x, - dtype="float32", - stop_gradient=not test_gradient) + def to_sparse(self, x, format): + if format == 'coo': + return x.detach().to_sparse_coo(sparse_dim=x.ndim) + elif format == 'csr': + return x.detach().to_sparse_csr() + + def check_result(self, dense_func, sparse_func, format, *args): + origin_x = paddle.rand([8, 16, 32], dtype='float32') + mask = paddle.randint(0, 2, [8, 16, 32]).astype('float32') + + ### check sparse coo with dense ### + dense_x = origin_x * mask + sp_x = self.to_sparse(dense_x, format) + + sp_x.stop_gradient = False + if len(args) == 0: + sp_out = sparse_func(sp_x) + elif len(args) == 1: + sp_out = sparse_func(sp_x, args[0]) + elif len(args) == 2: + sp_out = sparse_func(sp_x, args[0], args[1]) + sp_out.backward() + + dense_x.stop_gradient = False + if len(args) == 0: dense_out = dense_func(dense_x) + elif len(args) == 1: + dense_out = dense_func(dense_x, args[0]) + elif len(args) == 2: + if dense_func == paddle.cast: + dense_out = dense_func(dense_x, args[1]) + + int_dtype = convert_np_dtype_to_dtype_(args[0]) + if sp_out.is_sparse_csr(): + self.assertEqual(sp_out.crows().dtype, int_dtype) + self.assertEqual(sp_out.cols().dtype, int_dtype) + elif sp_out.is_sparse_coo(): + self.assertEqual(sp_out.indices().dtype, int_dtype) + else: + dense_out = dense_func(dense_x, args[0], args[1]) + dense_out.backward() + + # compare forward + self.assertTrue( + np.allclose(sp_out.to_dense().numpy(), dense_out.numpy())) + + # compare backward + if dense_func == paddle.sqrt: + expect_grad = np.nan_to_num(dense_x.grad.numpy(), 0., 0., 0.) 
+ else: + expect_grad = (dense_x.grad * mask).numpy() + self.assertTrue(np.allclose(sp_x.grad.to_dense().numpy(), expect_grad)) + + def compare_with_dense(self, dense_func, sparse_func): + self.check_result(dense_func, sparse_func, 'coo') + self.check_result(dense_func, sparse_func, 'csr') + + def compare_with_dense_one_attr(self, dense_func, sparse_func, attr1): + self.check_result(dense_func, sparse_func, 'coo', attr1) + self.check_result(dense_func, sparse_func, 'csr', attr1) + + def compare_with_dense_two_attr(self, dense_func, sparse_func, attr1, + attr2): + self.check_result(dense_func, sparse_func, 'coo', attr1, attr2) + self.check_result(dense_func, sparse_func, 'csr', attr1, attr2) - assert tensor_allclose(dense_out, sparse_out) + def test_sparse_sin(self): + self.compare_with_dense(paddle.sin, paddle.incubate.sparse.sin) - if test_gradient: - dense_out.backward(dense_out) - sparse_out.backward(sparse_out) - assert tensor_allclose(dense_x.grad, sparse_x.grad) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + def test_sparse_tan(self): + self.compare_with_dense(paddle.tan, paddle.incubate.sparse.tan) - def test_sparse_relu(self): - x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]] - sparse_dim = 2 - self.compare_with_dense( - x, - lambda x: x.to_sparse_coo(sparse_dim), - paddle.nn.ReLU(), - paddle.incubate.sparse.nn.ReLU(), - True, - ) - self.compare_with_dense( - x, - lambda x: x.to_sparse_csr(), - paddle.nn.ReLU(), - paddle.incubate.sparse.nn.ReLU(), - False, - ) - self.assert_raises_on_dense_tensor(paddle.incubate.sparse.nn.ReLU()) + def test_sparse_asin(self): + self.compare_with_dense(paddle.asin, paddle.incubate.sparse.asin) - def test_sparse_sqrt(self): - x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] - sparse_dim = 2 - self.compare_with_dense( - x, - lambda x: x.to_sparse_coo(sparse_dim), - paddle.sqrt, - paddle.incubate.sparse.sqrt, - True, - ) - self.compare_with_dense( - x, - lambda x: x.to_sparse_csr(), - paddle.sqrt, - paddle.incubate.sparse.sqrt, - False, - ) - self.assert_raises_on_dense_tensor(paddle.incubate.sparse.sqrt) + def test_sparse_atan(self): + self.compare_with_dense(paddle.atan, paddle.incubate.sparse.atan) - def test_sparse_sin(self): - x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] - sparse_dim = 2 - self.compare_with_dense( - x, - lambda x: x.to_sparse_coo(sparse_dim), - paddle.sin, - paddle.incubate.sparse.sin, - True, - ) - self.compare_with_dense( - x, - lambda x: x.to_sparse_csr(), - paddle.sin, - paddle.incubate.sparse.sin, - False, - ) - self.assert_raises_on_dense_tensor(paddle.incubate.sparse.sin) + def test_sparse_sinh(self): + self.compare_with_dense(paddle.sinh, paddle.incubate.sparse.sinh) def test_sparse_tanh(self): - x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, -4, 2, 0]] - sparse_dim = 2 - self.compare_with_dense( - x, - lambda x: x.to_sparse_coo(sparse_dim), - paddle.tanh, - paddle.incubate.sparse.tanh, - True, - ) - self.compare_with_dense( - x, - lambda x: x.to_sparse_csr(), - paddle.tanh, - paddle.incubate.sparse.tanh, - False, - ) - self.assert_raises_on_dense_tensor(paddle.incubate.sparse.tanh) + self.compare_with_dense(paddle.tanh, paddle.incubate.sparse.tanh) + + def test_sparse_asinh(self): + self.compare_with_dense(paddle.asinh, paddle.incubate.sparse.asinh) + + def test_sparse_atanh(self): + self.compare_with_dense(paddle.atanh, paddle.incubate.sparse.atanh) + + def test_sparse_sqrt(self): + self.compare_with_dense(paddle.sqrt, paddle.incubate.sparse.sqrt) + + def test_sparse_square(self): + 
self.compare_with_dense(paddle.square, paddle.incubate.sparse.square) + + def test_sparse_log1p(self): + self.compare_with_dense(paddle.log1p, paddle.incubate.sparse.log1p) + + def test_sparse_relu(self): + self.compare_with_dense(paddle.nn.ReLU(), + paddle.incubate.sparse.nn.ReLU()) + + def test_sparse_abs(self): + self.compare_with_dense(paddle.abs, paddle.incubate.sparse.abs) + + def test_sparse_neg(self): + self.compare_with_dense(paddle.neg, paddle.incubate.sparse.neg) + + def test_sparse_pow(self): + self.compare_with_dense_one_attr(paddle.pow, paddle.incubate.sparse.pow, + 3) + + def test_sparse_mul_scalar(self): + self.compare_with_dense_one_attr(paddle.Tensor.__mul__, + paddle.incubate.sparse.multiply, 3) + + def test_sparse_div_scalar(self): + self.compare_with_dense_one_attr(paddle.Tensor.__div__, + paddle.incubate.sparse.divide, 2) + + def test_sparse_cast(self): + self.compare_with_dense_two_attr(paddle.cast, + paddle.incubate.sparse.cast, 'int16', + 'float32') + self.compare_with_dense_two_attr(paddle.cast, + paddle.incubate.sparse.cast, 'int32', + 'float64') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index a12425b69299e..53c84c9d1f66a 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -38,7 +38,6 @@ def test_create_coo_by_tensor(self): dense_shape, stop_gradient=False) # test the to_string.py - print(coo) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -49,6 +48,7 @@ def test_create_coo_by_np(self): dense_shape = [3, 3] coo = paddle.incubate.sparse.sparse_coo_tensor( indices, values, dense_shape) + assert np.array_equal(3, coo.nnz()) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -78,7 +78,7 @@ def test_create_csr_by_np(self): csr = paddle.incubate.sparse.sparse_csr_tensor( crows, cols, values, dense_shape) # test the to_string.py - print(csr) + assert np.array_equal(5, csr.nnz()) assert np.array_equal(crows, csr.crows().numpy()) assert np.array_equal(cols, csr.cols().numpy()) assert np.array_equal(values, csr.values().numpy()) @@ -298,6 +298,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) + sparse_x = paddle.incubate.sparse.coalesce(sparse_x) indices_sorted = [[0, 1], [1, 0]] values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, @@ -310,6 +311,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) + sparse_x = paddle.incubate.sparse.coalesce(sparse_x) values_sorted = [[5.0, 5.0], [1.0, 1.0]] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py index 8f2380845875a..e027401549a01 100644 --- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py @@ -16,6 +16,7 @@ import unittest from paddle.fluid import Program, program_guard +import paddle import paddle.fluid.core as core import numpy as np import paddle.fluid.layers as layers @@ -195,7 
+196,7 @@ def test_grad(self): mask=y, x=x, level=level) - mean = layers.mean(out) + mean = paddle.mean(out) append_backward(mean) diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 9d1a4cf19eb07..ad226878f7ef1 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -384,6 +384,29 @@ def test_dygraph_final_state_api(self): self.assertEqual( (input1.grad.numpy() == expected_grad_result).all(), True) + def test_add_n_and_add_and_grad(self): + with fluid.dygraph.guard(): + np_x = np.array([[1, 2, 3], [4, 5, 6]]) + np_y = [[7, 8, 9], [10, 11, 12]] + np_z = [[1, 1, 1], [1, 1, 1]] + x = paddle.to_tensor(np_x, dtype='float32', stop_gradient=False) + y = paddle.to_tensor(np_y, dtype='float32', stop_gradient=False) + z = paddle.to_tensor(np_z, dtype='float32') + + out1 = x + z + out2 = y + z + out = paddle.add_n([out1, out2]) + + dx, dy = paddle.grad([out], [x, y], create_graph=True) + + expected_out = np.array([[10., 12., 14.], [16., 18., 20.]]) + expected_dx = np.array([[1, 1, 1], [1, 1, 1]]) + expected_dy = np.array([[1, 1, 1], [1, 1, 1]]) + + self.assertTrue(np.allclose(out, expected_out)) + self.assertTrue(np.allclose(dx, expected_dx)) + self.assertTrue(np.allclose(dy, expected_dy)) + class TestRaiseSumError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py index e5d563455e896..bcba40090b858 100644 --- a/python/paddle/fluid/tests/unittests/test_tensordot.py +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -185,35 +185,35 @@ def set_dtype(self): self.dtype = np.float64 -class TestTensordotAPIBroadcastCase1(TestTensordotAPI): +class TestTensordotAPIBroadcastCase1(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [1, 1, 1, 5] self.y_shape = [1, 5, 1, 1] -class TestTensordotAPIBroadcastCase2(TestTensordotAPI): +class TestTensordotAPIBroadcastCase2(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [1, 5, 5, 5] self.y_shape = [1, 1, 1, 5] -class TestTensordotAPIBroadcastCase3(TestTensordotAPI): +class TestTensordotAPIBroadcastCase3(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [5, 5, 5, 1] self.y_shape = [5, 5, 1, 5] -class TestTensordotAPIBroadcastCase4(TestTensordotAPI): +class TestTensordotAPIBroadcastCase4(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [5, 5, 5, 1] self.y_shape = [1, 1, 1, 1] -class TestTensordotAPIBroadcastCase5(TestTensordotAPI): +class TestTensordotAPIBroadcastCase5(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [1, 1, 5, 5] diff --git a/python/paddle/fluid/tests/unittests/test_trainable.py b/python/paddle/fluid/tests/unittests/test_trainable.py index 72edff9f29b34..546ab11466db7 100644 --- a/python/paddle/fluid/tests/unittests/test_trainable.py +++ b/python/paddle/fluid/tests/unittests/test_trainable.py @@ -16,6 +16,7 @@ from collections import Counter import unittest +import paddle import paddle.fluid as fluid from simple_nets import init_data @@ -27,7 +28,7 @@ def test_trainable(): size=10, param_attr=fluid.ParamAttr(trainable=False)) loss = fluid.layers.cross_entropy(input=feature, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index d9e293ba67159..fb48f63185075 
100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -126,6 +126,41 @@ def initTestCase(self): self.axis = (6, 1, 3, 5, 0, 2, 4, 7) +class TestAutoTuneTransposeOp(OpTest): + + def setUp(self): + self.init_op_type() + self.initTestCase() + self.python_api = paddle.transpose + self.inputs = {'X': np.random.random(self.shape).astype("float64")} + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': self.use_mkldnn, + } + self.outputs = { + 'XShape': np.random.random(self.shape).astype("float64"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + def initTestCase(self): + fluid.core.set_autotune_range(0, 3) + fluid.core.update_autotune_status() + fluid.core.enable_autotune() + self.shape = (1, 12, 256, 1) + self.axis = (0, 3, 2, 1) + + def init_op_type(self): + self.op_type = "transpose2" + self.use_mkldnn = False + + def test_check_output(self): + self.check_output(no_check_set=['XShape'], check_eager=True) + fluid.core.disable_autotune() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', check_eager=True) + + class TestTransposeBF16Op(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py index b4a4eac0ba74f..8ec0bcca4bc4e 100644 --- a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py +++ b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py @@ -72,6 +72,7 @@ def config(self): self.x_range = 20 self.return_inverse = False self.return_counts = False + self.python_api = paddle.unique_consecutive def init_kernel_type(self): self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" @@ -88,13 +89,14 @@ def setUp(self): self.inputs = { 'X': x, } + self.python_out_sig = ["Out"] self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)} self.outputs = { 'Out': out, } def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class TestUniqueConsecutiveOp2(TestUniqueConsecutiveOp): @@ -105,6 +107,7 @@ def config(self): self.x_range = 20 self.return_inverse = True self.return_counts = False + self.python_api = paddle.unique_consecutive def setUp(self): self.init_kernel_type() @@ -122,6 +125,7 @@ def setUp(self): 'return_inverse': self.return_inverse, 'dtype': int(core.VarDesc.VarType.INT32) } + self.python_out_sig = ["Out"] self.outputs = {'Out': result, 'Index': inverse} @@ -133,6 +137,7 @@ def config(self): self.x_range = 20 self.return_inverse = False self.return_counts = True + self.python_api = paddle.unique_consecutive def setUp(self): self.init_kernel_type() @@ -150,6 +155,7 @@ def setUp(self): 'return_counts': self.return_counts, 'dtype': int(core.VarDesc.VarType.INT32) } + self.python_out_sig = ["Out"] self.outputs = {'Out': result, 'Counts': counts} @@ -161,6 +167,7 @@ def config(self): self.x_range = 20 self.return_inverse = True self.return_counts = True + self.python_api = paddle.unique_consecutive def setUp(self): self.init_kernel_type() @@ -180,6 +187,7 @@ def setUp(self): 'return_counts': self.return_counts, 'dtype': int(core.VarDesc.VarType.INT32) } + self.python_out_sig = ["Out"] self.outputs = {'Out': result, 'Index': inverse, 'Counts': counts} diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 87802b83415d6..5fb220da609a4 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ 
b/python/paddle/fluid/tests/unittests/test_variable.py @@ -30,6 +30,9 @@ class TestVariable(unittest.TestCase): + def setUp(self): + np.random.seed(2022) + def test_np_dtype_convert(self): DT = core.VarDesc.VarType convert = convert_np_dtype_to_dtype_ @@ -486,6 +489,9 @@ def test_detach(self): class TestVariableSlice(unittest.TestCase): + def setUp(self): + np.random.seed(2022) + def _test_item_none(self, place): data = np.random.rand(2, 3, 4).astype("float32") prog = paddle.static.Program() @@ -545,6 +551,9 @@ def test_slice(self): class TestListIndex(unittest.TestCase): + def setUp(self): + np.random.seed(2022) + def numel(self, shape): return reduce(lambda x, y: x * y, shape) @@ -723,10 +732,10 @@ def run_getitem_list_index(self, array, index): return getitem_pp = exe.run(prog, feed={x.name: array}, fetch_list=fetch_list) - print(getitem_pp) - self.assertTrue(np.array_equal(value_np, getitem_pp[0]), - msg='\n numpy:{},\n paddle:{}'.format( - value_np, getitem_pp[0])) + np.testing.assert_allclose(value_np, + getitem_pp[0], + rtol=1e-5, + atol=1e-8) def test_static_graph_getitem_bool_index(self): paddle.enable_static() @@ -791,9 +800,7 @@ def run_setitem_list_index(self, array, index, value_np): }, fetch_list=fetch_list) - self.assertTrue(np.allclose(array2, setitem_pp[0]), - msg='\n numpy:{},\n paddle:{}'.format( - array2, setitem_pp[0])) + np.testing.assert_allclose(array2, setitem_pp[0], rtol=1e-5, atol=1e-8) def test_static_graph_setitem_list_index(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py index b42bfb1a684ac..7cdfc3da93048 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_decay.py +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -63,7 +63,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py index 51cb380be8438..967f917fd93b7 100644 --- a/python/paddle/fluid/tests/unittests/test_where_op.py +++ b/python/paddle/fluid/tests/unittests/test_where_op.py @@ -97,7 +97,7 @@ def test_api(self, use_cuda=False): x.stop_gradient = x_stop_gradient y.stop_gradient = y_stop_gradient result = paddle.where(cond, x, y) - append_backward(layers.mean(result)) + append_backward(paddle.mean(result)) for use_cuda in [False, True]: if (use_cuda and (not fluid.core.is_compiled_with_cuda())): diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py index baf111df6335a..92d67406b033a 100644 --- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py @@ -222,7 +222,7 @@ def body(i, x): x.stop_gradient = False out = layers.while_loop(cond, body, [i, x]) - mean = layers.mean(out[1]) + mean = paddle.mean(out[1]) append_backward(mean) place = fluid.CUDAPlace( @@ -264,7 +264,7 @@ def body(i, x): x.stop_gradient = False out = layers.while_loop(cond, body, [i, x]) - mean = layers.mean(out[1]) + mean = paddle.mean(out[1]) append_backward(mean) place = fluid.CUDAPlace( @@ -351,7 +351,7 @@ def internal_body(j, x, mem_array): [i, j, x, mem_array]) sum_result = 
layers.array_read(array=mem_array, i=j) - mean = layers.mean(sum_result) + mean = paddle.mean(sum_result) append_backward(mean) place = fluid.CUDAPlace( diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index dee83692bd324..8e35a57f2426f 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -81,7 +81,7 @@ def simple_net(self): layers.array_write(result2, i=j, array=mem_array) layers.less_than(x=j, y=array_len2, cond=cond2) sum_result = layers.array_read(array=mem_array, i=j) - loss = layers.mean(sum_result) + loss = paddle.mean(sum_result) return loss, sum_result def test_simple_net(self): diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py index ea3264ba0dbb7..fb0cb2d7a5aee 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py @@ -36,4 +36,5 @@ 'eigvalsh', 'class_center_sample', 'einsum', + 'rmsprop', ] diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt index 6267526f33c12..cf70f63580b99 100644 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt @@ -16,9 +16,6 @@ if(WITH_XPU_BKCL) list(APPEND DIST_TEST_OPS test_gen_bkcl_id_op) endif() -list(REMOVE_ITEM TEST_OPS test_concat_op_xpu) -list(REMOVE_ITEM TEST_OPS test_mean_op_xpu) - foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach() @@ -27,5 +24,7 @@ foreach(TEST_OP ${DIST_TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach() -set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_v2_op_xpu PROPERTIES TIMEOUT 900) +set_tests_properties(test_matmul_op_xpu PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index d92378f60f578..bcaa8055b25cd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -84,7 +84,13 @@ xpu_test_op_white_list = [] xpu_test_device_type_white_list = ['xpu1_float64'] -xpu_test_op_type_white_list = ['dropout_float16', 'dropout_grad_float16'] +xpu_test_op_type_white_list = [ + 'dropout_float16', + 'dropout_grad_float16', + "grad_add_float32", # no api for grad_add, skip + "resnet_unit", + "resnet_unit_grad" +] xpu_test_device_op_white_list = [] xpu_test_device_op_type_white_list = [] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py index b79bbafb37554..648e87f8c3174 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py @@ -58,7 +58,7 @@ def test_assign_LoDTensorArray(self): init_array = fluid.layers.array_write(x=z, i=i) array = fluid.layers.assign(init_array) sums = fluid.layers.array_read(array=init_array, i=i) - mean = fluid.layers.mean(sums) + mean = paddle.mean(sums) append_backward(mean) place = fluid.CUDAPlace(0) if 
core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py new file mode 100644 index 0000000000000..6455b157cb2ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py @@ -0,0 +1,135 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy +import sys + +sys.path.append("..") +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper +import paddle + +paddle.enable_static() + + +class XPUTestAssignValueOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'assign_value' + self.use_dynamic_create_class = False + + class TestAssignValueOp(XPUOpTest): + + def init(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = 'assign_value' + + def setUp(self): + self.init() + self.inputs = {} + self.attrs = {} + self.init_data() + self.attrs["shape"] = self.value.shape + self.attrs["dtype"] = framework.convert_np_dtype_to_dtype_( + self.value.dtype) + self.outputs = {"Out": self.value} + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(self.dtype) + self.attrs["fp32_values"] = [float(v) for v in self.value.flat] + + def test_forward(self): + self.check_output_with_place(self.place) + + class TestAssignValueOp2(TestAssignValueOp): + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.attrs["int32_values"] = [int(v) for v in self.value.flat] + + class TestAssignValueOp3(TestAssignValueOp): + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.attrs["int64_values"] = [int(v) for v in self.value.flat] + + class TestAssignValueOp4(TestAssignValueOp): + + def init_data(self): + self.value = numpy.random.choice(a=[False, True], + size=(2, 5)).astype(numpy.bool) + self.attrs["bool_values"] = [int(v) for v in self.value.flat] + + +class TestAssignApi(unittest.TestCase): + + def setUp(self): + self.init_dtype() + self.value = (-100 + 200 * numpy.random.random(size=(2, 5))).astype( + self.dtype) + self.place = fluid.XPUPlace(0) + + def init_dtype(self): + self.dtype = "float32" + + def test_assign(self): + main_program = fluid.Program() + with fluid.program_guard(main_program): + x = layers.create_tensor(dtype=self.dtype) + layers.assign(input=self.value, output=x) + + exe = fluid.Executor(self.place) + [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x]) + self.assertTrue(numpy.array_equal(fetched_x, self.value), + "fetch_x=%s val=%s" % (fetched_x, self.value)) + self.assertEqual(fetched_x.dtype, self.value.dtype) + + +class TestAssignApi2(TestAssignApi): + + def init_dtype(self): 
+ self.dtype = "int32" + + +class TestAssignApi3(TestAssignApi): + + def init_dtype(self): + self.dtype = "int64" + + +class TestAssignApi4(TestAssignApi): + + def setUp(self): + self.init_dtype() + self.value = numpy.random.choice(a=[False, True], + size=(2, 5)).astype(numpy.bool) + self.place = fluid.XPUPlace(0) + + def init_dtype(self): + self.dtype = "bool" + + +support_types = get_xpu_op_support_types('assign_value') +for stype in support_types: + create_test_class(globals(), XPUTestAssignValueOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py index f33da83bae7a1..cdaf767a1de68 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py @@ -12,282 +12,393 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - import sys sys.path.append("..") import unittest import numpy as np -import paddle.fluid.core as core -import paddle.fluid as fluid -from op_test_xpu import OpTest, XPUOpTest +from op_test_xpu import XPUOpTest import paddle -from paddle.fluid import Program, program_guard - - -def create_test_class(op_type, typename, callback): - - class Cls(OpTest): - - def setUp(self): - a = np.random.random(size=(10, 7)).astype(typename) - b = np.random.random(size=(10, 7)).astype(typename) - c = callback(a, b) - self.inputs = {'X': a, 'Y': b} - self.outputs = {'Out': c} - self.op_type = op_type - self.use_xpu = True - self.attrs = {'use_xpu': True} - - def test_check_output(self): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_errors(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = fluid.layers.data(name='x', shape=[2], dtype='int32') - y = fluid.layers.data(name='y', shape=[2], dtype='int32') - a = fluid.layers.data(name='a', shape=[2], dtype='int16') - if self.op_type == "less_than": - self.assertRaises(TypeError, - fluid.layers.less_than, - x=x, - y=y, - force_cpu=1) - op = eval("fluid.layers.%s" % self.op_type) - self.assertRaises(TypeError, op, x=x, y=y, cond=1) - self.assertRaises(TypeError, op, x=x, y=a) - self.assertRaises(TypeError, op, x=a, y=y) - - cls_name = "{0}_{1}".format(op_type, typename) - Cls.__name__ = cls_name - globals()[cls_name] = Cls - - -for _type_name in {'int32'}: - if _type_name == 'float64' and core.is_compiled_with_rocm(): - _type_name = 'float32' - - create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) - create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) - create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) - create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b) - create_test_class('equal', _type_name, lambda _a, _b: _a == _b) - create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) - - -def create_paddle_case(op_type, callback): - - class PaddleCls(unittest.TestCase): - - def setUp(self): - self.op_type = op_type - self.input_x = np.array([1, 2, 3, 4]).astype(np.int64) - self.input_y = np.array([1, 3, 2, 4]).astype(np.int64) - self.real_result = callback(self.input_x, self.input_y) - self.place = fluid.XPUPlace( - 0) if fluid.core.is_compiled_with_xpu() else fluid.CPUPlace() - - def test_api(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = 
fluid.data(name='x', shape=[4], dtype='int64') - y = fluid.data(name='y', shape=[4], dtype='int64') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = fluid.Executor(self.place) - res, = exe.run(feed={ - "x": self.input_x, - "y": self.input_y - }, - fetch_list=[out]) - self.assertEqual((res == self.real_result).all(), True) - - def test_api_float(self): - if self.op_type == "equal": - paddle.enable_static() - with program_guard(Program(), Program()): - x = fluid.data(name='x', shape=[4], dtype='int64') - y = fluid.data(name='y', shape=[1], dtype='int64') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = fluid.Executor(self.place) - res, = exe.run(feed={ - "x": self.input_x, - "y": 1.0 - }, - fetch_list=[out]) - self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) - self.assertEqual((res == self.real_result).all(), True) - - def test_dynamic_api(self): - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - y = paddle.to_tensor(self.input_y) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - self.assertEqual((out.numpy() == self.real_result).all(), True) - paddle.enable_static() - - def test_dynamic_api_int(self): - if self.op_type == "equal": - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, 1) - self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) - self.assertEqual((out.numpy() == self.real_result).all(), True) - paddle.enable_static() - - def test_dynamic_api_float(self): - if self.op_type == "equal": - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, 1.0) - self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) - self.assertEqual((out.numpy() == self.real_result).all(), True) - paddle.enable_static() - - def test_assert(self): - - def test_dynamic_api_string(self): - if self.op_type == "equal": - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, "1.0") - paddle.enable_static() - - self.assertRaises(TypeError, test_dynamic_api_string) - - def test_dynamic_api_bool(self): - if self.op_type == "equal": - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, True) - self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) - self.assertEqual((out.numpy() == self.real_result).all(), True) - paddle.enable_static() - - def test_broadcast_api_1(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', - shape=[1, 2, 1, 3], - dtype='int32') - y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32) - input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_broadcast_api_2(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='int32') - y = paddle.static.data(name='y', - shape=[1, 2, 1, 3], - dtype='int32') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32) - 
input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_broadcast_api_3(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', shape=[5], dtype='int32') - y = paddle.static.data(name='y', shape=[3, 1], dtype='int32') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.arange(0, 5).reshape((5)).astype(np.int32) - input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(np.int32) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_bool_api_4(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', shape=[3, 1], dtype='bool') - y = paddle.static.data(name='y', shape=[3, 1], dtype='bool') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool_) - input_y = np.array([True, True, False]).astype(np.bool_) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_bool_broadcast_api_4(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', shape=[3, 1], dtype='bool') - y = paddle.static.data(name='y', shape=[1], dtype='bool') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool_) - input_y = np.array([True]).astype(np.bool_) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_attr_name(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = fluid.layers.data(name='x', shape=[4], dtype='int32') - y = fluid.layers.data(name='y', shape=[4], dtype='int32') - op = eval("paddle.%s" % (self.op_type)) - out = op(x=x, y=y, name="name_%s" % (self.op_type)) - self.assertEqual("name_%s" % (self.op_type) in out.name, True) - - cls_name = "TestCase_{}".format(op_type) - PaddleCls.__name__ = cls_name - globals()[cls_name] = PaddleCls - - -create_paddle_case('less_than', lambda _a, _b: _a < _b) -create_paddle_case('less_equal', lambda _a, _b: _a <= _b) -create_paddle_case('greater_than', lambda _a, _b: _a > _b) -create_paddle_case('greater_equal', lambda _a, _b: _a >= _b) -create_paddle_case('equal', lambda _a, _b: _a == _b) -create_paddle_case('not_equal', lambda _a, _b: _a != _b) +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types +from xpu.get_test_cover_info import XPUOpTestWrapper + + +class TestCompareOpBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.config() + self.set_case() + self.inputs = {'X': self.x, 'Y': self.y} + self.outputs = {'Out': self.result} + + def set_case(self): + self.x = np.random.uniform(self.lbound, self.hbound, + self.x_shape).astype(self.dtype) + self.y = np.random.uniform(self.lbound, self.hbound, + self.y_shape).astype(self.dtype) + self.result = self.compute(self.x, self.y) + + 
def config(self): + self.dtype = np.float32 + self.op_type = 'less_than' + self.compute = np.less + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + def test_check_output(self): + paddle.enable_static() + self.check_output_with_place(self.place) + + +class XPUTestLessThanOP(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'less_than' + self.use_dynamic_create_class = False + + class LessThanOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'less_than' + self.compute = np.less + self.set_data() + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + class LessThanOpTestCase2(LessThanOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17] + self.y_shape = [1] + + class LessThanOpTestCase3(LessThanOpTestCase1): + + def set_data(self): + self.lbound = -300 + self.hbound = 300 + self.x_shape = [11, 17, 29] + self.y_shape = [1] + + class LessThanOpTestCase4(LessThanOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [128, 128, 512] + self.y_shape = [1] + + class LessThanOpTestCase5(LessThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [128, 128, 512] + self.y_shape = [128, 128, 512] + + +support_types = get_xpu_op_support_types('less_than') +for stype in support_types: + create_test_class(globals(), XPUTestLessThanOP, stype) + + +class XPUTestLessEqualOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'less_equal' + self.use_dynamic_create_class = False + + class LessEqualOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'less_equal' + self.compute = np.less_equal + self.set_data() + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + class LessEqualOpTestCase2(LessEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17, 255] + self.y_shape = [11, 17, 255] + + class LessEqualOpTestCase3(LessEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17, 255] + self.y_shape = [1] + + class LessEqualOpTestCase4(LessEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17] + self.y_shape = [1] + + class LessEqualOpTestCase5(LessEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [128, 128, 512] + self.y_shape = [128, 128, 512] + + +support_types = get_xpu_op_support_types('less_equal') +for stype in support_types: + create_test_class(globals(), XPUTestLessEqualOp, stype) + + +class XPUTestGreaterThanOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'greater_than' + self.use_dynamic_create_class = False + + class GreaterThanOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'greater_than' + self.compute = np.greater + self.set_data() + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [128, 128, 512] + self.y_shape = [128, 128, 512] + + class GreaterThanOpTestCase2(GreaterThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [128, 128, 512] + self.y_shape = [1] + + class GreaterThanOpTestCase3(GreaterThanOpTestCase1): + + def 
set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [1] + + class GreaterThanOpTestCase4(GreaterThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + class GreaterThanOpTestCase5(GreaterThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 10, 20, 20] + self.y_shape = [10, 10, 20, 20] + + +support_types = get_xpu_op_support_types('greater_than') +for stype in support_types: + create_test_class(globals(), XPUTestGreaterThanOp, stype) + + +class XPUTestGreaterEqualOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'greater_equal' + self.use_dynamic_create_class = False + + class GreaterEqualOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'greater_equal' + self.compute = np.greater_equal + self.set_data() + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 10, 20, 20] + self.y_shape = [10, 10, 20, 20] + + class GreaterEqualOpTestCase2(GreaterEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 10] + self.y_shape = [10, 10] + + class GreaterEqualOpTestCase3(GreaterEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [512, 512, 2] + self.y_shape = [1] + + class GreaterEqualOpTestCase4(GreaterEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 10, 20, 20] + self.y_shape = [1] + + class GreaterEqualOpTestCase5(GreaterEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 30, 15] + self.y_shape = [10, 30, 15] + + +support_types = get_xpu_op_support_types('greater_equal') +for stype in support_types: + create_test_class(globals(), XPUTestGreaterEqualOp, stype) + + +class XPUTestEqualOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'equal' + self.use_dynamic_create_class = False + + class EqualOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'equal' + self.compute = np.equal + self.set_data() + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 30, 15] + self.y_shape = [10, 30, 15] + + class EqualOpTestCase2(EqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 30, 15] + self.y_shape = [1] + + class EqualOpTestCase3(EqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [10, 30] + self.y_shape = [10, 30] + + class EqualOpTestCase4(EqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [256, 256, 10] + self.y_shape = [256, 256, 10] + + class EqualOpTestCase5(EqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [1] + + +support_types = get_xpu_op_support_types('equal') +for stype in support_types: + create_test_class(globals(), XPUTestEqualOp, stype) + + +class XPUTestNotEqualOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'not_equal' + self.use_dynamic_create_class = False + + class NotEqualOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'not_equal' + self.compute = np.not_equal + self.set_data() + + def set_data(self): + self.lbound = -100 + 
self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [1] + + class NotEqualOpTestCase2(NotEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + class NotEqualOpTestCase3(NotEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17, 30] + self.y_shape = [1] + + class NotEqualOpTestCase4(NotEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [256, 256, 10] + self.y_shape = [256, 256, 10] + + class NotEqualOpTestCase5(NotEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [512, 128] + self.y_shape = [512, 128] + + +support_types = get_xpu_op_support_types('not_equal') +for stype in support_types: + create_test_class(globals(), XPUTestNotEqualOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py index 387dd88bcd4ea..9dd7247c4a39d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py @@ -498,10 +498,41 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" +class XPUTestConv2DOp_NHWC(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'conv2d' + self.use_dynamic_create_class = False + + class TestConv2DOp_AsyPadding_NHWC( + XPUTestConv2DOp_v2.TestConv2DOp_AsyPadding): + + def init_data_format(self): + self.data_format = "NHWC" + + def init_test_case_2(self): + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + + class TestWithPad_AsyPadding_NHWC(XPUTestConv2DOp_v2.TestWithPad_AsyPadding + ): + + def init_data_format(self): + self.data_format = "NHWC" + + def init_test_case_2(self): + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + + support_types = get_xpu_op_support_types('conv2d') for stype in ['float32']: create_test_class(globals(), XPUTestConv2DOp, stype) create_test_class(globals(), XPUTestConv2DOp_v2, stype) + create_test_class(globals(), + XPUTestConv2DOp_NHWC, + stype, + ignore_deivce_version=[core.XPUVersion.XPU1]) #---------- test SAME VALID ----------- #create_test_padding_SAME_class(TestConv2DOp_AsyPadding) @@ -512,9 +543,5 @@ def init_paddings(self): #create_test_padding_VALID_class(TestWithPad_AsyPadding) #create_test_padding_VALID_class(TestWithStride_AsyPadding) -# ------------ test channel last --------- -#create_test_channel_last_class(TestConv2DOp_AsyPadding) -#create_test_channel_last_class(TestWithPad_AsyPadding) - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py index 4204a73524d27..22bc8fef839b8 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py @@ -22,9 +22,11 @@ import paddle.fluid.core as core import paddle.fluid as fluid from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper import paddle import paddle.nn as nn -from paddle.fluid import Program, program_guard + +paddle.enable_static() def conv2dtranspose_forward_naive(input_, filter_, attrs): @@ -117,166 +119,159 @@ def _get_padding_with_SAME(input_shape, kernel_size, 
kernel_stride): return out -class TestConv2DTransposeOp(XPUOpTest): - - def setUp(self): - # init as conv transpose - self.dtype = np.float32 - self.need_check_grad = True - self.is_test = False - self.use_cudnn = False - self.use_mkldnn = False - self.output_size = None - self.output_padding = [] - self.data_format = "NCHW" - self.pad = [0, 0] - self.padding_algorithm = "EXPLICIT" - self.init_op_type() - self.init_test_case() - self.__class__.op_type = "conv2d_transpose" - - input_ = np.random.random(self.input_size).astype(self.dtype) - filter_ = np.random.random(self.filter_size).astype(self.dtype) - - self.inputs = {'Input': input_, 'Filter': filter_} - self.attrs = { - 'strides': self.stride, - 'paddings': self.pad, - 'padding_algorithm': self.padding_algorithm, - 'groups': self.groups, - 'dilations': self.dilations, - 'use_cudnn': self.use_cudnn, - 'is_test': self.is_test, - 'use_mkldnn': self.use_mkldnn, - 'data_format': self.data_format - } - if self.output_size is not None: - self.attrs['output_size'] = self.output_size - - if len(self.output_padding) > 0: - self.attrs['output_padding'] = self.output_padding - - output = conv2dtranspose_forward_naive(input_, filter_, - self.attrs).astype(self.dtype) - - self.outputs = {'Output': output} - - def test_check_output(self): - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_no_input(self): - if self.need_check_grad: - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Filter'], +class XPUTestConv2DTransposeOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'conv2d_transpose' + self.use_dynamic_create_class = False + + class TestConv2DTransposeOp(XPUOpTest): + + def setUp(self): + # init as conv transpose + self.need_check_grad = True + self.is_test = False + self.use_cudnn = False + self.use_mkldnn = False + self.output_size = None + self.output_padding = [] + self.data_format = "NCHW" + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + self.init_op_type() + self.init_test_case() + self.__class__.op_type = "conv2d_transpose" + + input_ = np.random.random(self.input_size).astype(self.dtype) + filter_ = np.random.random(self.filter_size).astype(self.dtype) + + self.inputs = {'Input': input_, 'Filter': filter_} + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'is_test': self.is_test, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format + } + if self.output_size is not None: + self.attrs['output_size'] = self.output_size + + if len(self.output_padding) > 0: + self.attrs['output_padding'] = self.output_padding + + output = conv2dtranspose_forward_naive( + input_, filter_, self.attrs).astype(self.dtype) + + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_no_input(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, ['Filter'], 'Output', no_grad_set=set(['Input'])) - def test_check_grad_no_filter(self): - if self.need_check_grad: - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], + def test_check_grad_no_filter(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, ['Input'], 
'Output', no_grad_set=set(['Filter'])) - def test_check_grad(self): - if self.need_check_grad: - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, set(['Input', 'Filter']), + def test_check_grad(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, set(['Input', 'Filter']), 'Output') - def init_test_case(self): - self.pad = [0, 0] - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - - def init_op_type(self): - self.op_type = "conv2d_transpose" - - -class TestWithSymmetricPad(TestConv2DTransposeOp): - - def init_test_case(self): - self.pad = [1, 1] - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - - -class TestWithAsymmetricPad(TestConv2DTransposeOp): - - def init_test_case(self): - self.pad = [1, 0, 1, 2] - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - - -class TestWithSAMEPad(TestConv2DTransposeOp): - - def init_test_case(self): - self.stride = [2, 1] - self.dilations = [1, 2] - self.groups = 1 - self.input_size = [2, 3, 6, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 4, 3] - self.padding_algorithm = 'SAME' - - -class TestWithVALIDPad(TestConv2DTransposeOp): - - def init_test_case(self): - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - self.padding_algorithm = 'VALID' - - -class TestWithGroups(TestConv2DTransposeOp): - - def init_test_case(self): - self.pad = [1, 1] - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 2 - self.input_size = [2, 4, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 3, 3, 3] - - -class TestWithStride(TestConv2DTransposeOp): - - def init_test_case(self): - self.pad = [1, 1] - self.stride = [2, 2] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "conv2d_transpose" + + class TestWithSymmetricPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithAsymmetricPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 0, 1, 2] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithSAMEPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.stride = [2, 1] + self.dilations = [1, 2] + self.groups = 1 + self.input_size = [2, 3, 6, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 4, 3] + self.padding_algorithm = 'SAME' + + class 
TestWithVALIDPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + self.padding_algorithm = 'VALID' + + class TestWithGroups(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3] + + class TestWithStride(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + +support_types = get_xpu_op_support_types('conv2d_transpose') +for stype in support_types: + create_test_class(globals(), XPUTestConv2DTransposeOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py index 819fd1248fecf..392eed198ff95 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py @@ -23,67 +23,80 @@ import paddle.fluid as fluid from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestFlatten2Op(XPUOpTest): +class XPUTestFlatten2Op(XPUOpTestWrapper): - def setUp(self): - self.set_xpu() - self.op_type = "flatten2" - self.place = paddle.XPUPlace(0) - self.init_test_case() - self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.in_shape).astype("float32") - } + def __init__(self): + self.op_name = 'flatten2' + self.use_dynamic_create_class = False - def set_xpu(self): - self.__class__.use_xpu = True + class TestFlatten2Op(XPUOpTest): - def test_check_output(self): - self.check_output_with_place(self.place, no_check_set=["XShape"]) + def setUp(self): + self.set_xpu() + self.op_type = "flatten2" + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.init_test_case() + self.inputs = { + "X": np.random.random(self.in_shape).astype(self.dtype) + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype(self.dtype) + } - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") + def set_xpu(self): + self.__class__.use_xpu = True - def init_test_case(self): - self.in_shape = (3, 2, 4, 5) - self.axis = 1 - self.new_shape = (3, 40) + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) - def init_attrs(self): - self.attrs = {"axis": self.axis} + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + def init_test_case(self): + self.in_shape = (3, 2, 4, 5) + self.axis = 1 + self.new_shape = (3, 40) -class TestFlatten2OpWithCornerAxis(TestFlatten2Op): + def init_attrs(self): + self.attrs = {"axis": self.axis} - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.axis = 0 - self.new_shape = (1, 120) + class TestFlatten2OpWithCornerAxis(TestFlatten2Op): + def init_test_case(self): + 
self.in_shape = (3, 2, 5, 4) + self.axis = 0 + self.new_shape = (1, 120) -class TestFlatten2OpWithDefaultAxis(TestFlatten2Op): + class TestFlatten2OpWithDefaultAxis(TestFlatten2Op): - def init_test_case(self): - self.in_shape = (10, 2, 2, 3) - self.new_shape = (10, 12) + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) - def init_attrs(self): - self.attrs = {} + def init_attrs(self): + self.attrs = {} + class TestFlatten2OpSixDims(TestFlatten2Op): -class TestFlatten2OpSixDims(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.axis = 4 - self.new_shape = (36, 16) +support_types = get_xpu_op_support_types('flatten2') +support_types_for_grad = get_xpu_op_support_types('mean') +for stype in support_types: + if stype in support_types_for_grad: + create_test_class(globals(), XPUTestFlatten2Op, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py index 06fc12f510844..c9426f54b1cf6 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py @@ -17,7 +17,6 @@ import sys sys.path.append("..") - import numpy as np import unittest import sys @@ -27,215 +26,214 @@ from op_test_xpu import XPUOpTest import paddle import paddle.fluid as fluid +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestFlattenOp(XPUOpTest): - - def setUp(self): - self.set_xpu() - self.op_type = "flatten_contiguous_range" - self.place = paddle.XPUPlace(0) - self.use_xpu = True - self.use_mkldnn = False - - self.start_axis = 0 - self.stop_axis = -1 - self.dtype = np.float32 - self.init_test_case() - self.inputs = {"X": np.random.random(self.in_shape).astype(self.dtype)} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.in_shape).astype("float32") - } - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place, no_check_set=["XShape"]) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = -1 - self.new_shape = (120) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis, - 'use_xpu': True, - } - - -class TestFlattenOp_1(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 1 - self.stop_axis = 2 - self.new_shape = (3, 10, 4) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_2(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_3(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 2 - self.new_shape = (30, 4) - - def init_attrs(self): - self.attrs = 
{ - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_4(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = -2 - self.stop_axis = -1 - self.new_shape = (3, 2, 20) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_5(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 2 - self.stop_axis = 2 - self.new_shape = (3, 2, 5, 4) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOpSixDims(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.start_axis = 3 - self.stop_axis = 5 - self.new_shape = (3, 2, 3, 32) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_Float32(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - self.dtype = np.float32 - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_int32(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - self.dtype = np.int32 - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis, - 'use_xpu': True - } - - def test_check_grad(self): - pass - - -class TestFlattenOp_int8(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - self.dtype = np.int8 - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - def test_check_grad(self): - pass - - -class TestFlattenOp_int64(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - self.dtype = np.int64 - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - def test_check_grad(self): - pass +class XPUTestFlattenOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'flatten_contiguous_range' + self.use_dynamic_create_class = False + + class TestFlattenOp(XPUOpTest): + + def setUp(self): + self.set_xpu() + self.op_type = "flatten_contiguous_range" + self.place = paddle.XPUPlace(0) + self.use_xpu = True + self.use_mkldnn = False + + self.start_axis = 0 + self.stop_axis = -1 + self.dtype = self.in_type + self.init_test_case() + self.inputs = { + "X": np.random.random(self.in_shape).astype(self.dtype) + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype(self.dtype) + } + + def set_xpu(self): + self.__class__.use_xpu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = -1 + self.new_shape = (120) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis, + 'use_xpu': True, + } + + class 
TestFlattenOp_1(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 1 + self.stop_axis = 2 + self.new_shape = (3, 10, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_2(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_3(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 2 + self.new_shape = (30, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_4(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = -2 + self.stop_axis = -1 + self.new_shape = (3, 2, 20) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_5(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 2 + self.stop_axis = 2 + self.new_shape = (3, 2, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOpSixDims(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.start_axis = 3 + self.stop_axis = 5 + self.new_shape = (3, 2, 3, 32) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_Float32(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.float32 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_int32(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int32 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis, + 'use_xpu': True + } + + def test_check_grad(self): + pass + + class TestFlattenOp_int8(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int8 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + def test_check_grad(self): + pass + + class TestFlattenOp_int64(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int64 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + def test_check_grad(self): + pass class TestFlatten2OpError(unittest.TestCase): @@ -338,5 +336,11 @@ def test_Negative(): self.assertTrue((2, 3, 16) == res_shape) +support_types = get_xpu_op_support_types('flatten_contiguous_range') +support_types_for_grad = get_xpu_op_support_types('mean') +for stype in support_types: + if stype in support_types_for_grad: + create_test_class(globals(), XPUTestFlattenOp, stype) + if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py index 9622fc5bb1a82..c3c732fa77177 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py @@ -23,61 +23,74 @@ import paddle.fluid as fluid from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestFlattenOp(XPUOpTest): +class XPUTestFlattenOp(XPUOpTestWrapper): - def setUp(self): - self.op_type = "flatten" - self.use_xpu = True - self.place = paddle.XPUPlace(0) - self.init_test_case() - self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + def __init__(self): + self.op_name = 'flatten' + self.use_dynamic_create_class = False - def test_check_output(self): - self.check_output_with_place(self.place) + class TestFlattenOp(XPUOpTest): - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") + def setUp(self): + self.op_type = "flatten" + self.use_xpu = True + self.place = paddle.XPUPlace(0) + self.init_test_case() + self.dtype = self.in_type + self.inputs = { + "X": np.random.random(self.in_shape).astype(self.dtype) + } + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 1 - self.new_shape = (3, 40) + def test_check_output(self): + self.check_output_with_place(self.place) - def init_attrs(self): - self.attrs = {"axis": self.axis} + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 1 + self.new_shape = (3, 40) -class TestFlattenOp1(TestFlattenOp): + def init_attrs(self): + self.attrs = {"axis": self.axis} - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 0 - self.new_shape = (1, 120) + class TestFlattenOp1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 0 + self.new_shape = (1, 120) -class TestFlattenOpWithDefaultAxis(TestFlattenOp): + class TestFlattenOpWithDefaultAxis(TestFlattenOp): - def init_test_case(self): - self.in_shape = (10, 2, 2, 3) - self.new_shape = (10, 12) + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) - def init_attrs(self): - self.attrs = {} + def init_attrs(self): + self.attrs = {} + class TestFlattenOpSixDims(TestFlattenOp): -class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.axis = 4 - self.new_shape = (36, 16) +support_types = get_xpu_op_support_types('flatten') +support_types_for_grad = get_xpu_op_support_types('mean') +for stype in support_types: + if stype in support_types_for_grad: + create_test_class(globals(), XPUTestFlattenOp, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py new file mode 100644 index 0000000000000..8fe9769d51925 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py 
@@ -0,0 +1,272 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys + +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +from paddle.fluid import core +from paddle.incubate.xpu.resnet_block import ResNetBasicBlock +from paddle.fluid.framework import default_main_program +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestResNetBasicBlockOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = "resnet_basic_block" + self.use_dynamic_create_class = False + + class TestResNetBasicBlockOp(OpTest): + + def setUp(self): + paddle.disable_static() + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.__class__.op_type = "resnet_basic_block" + self.__class__.no_need_check_grad = True + self.getShape() + self.getDiff() + self.getShortcut() + paddle.set_default_dtype(self.dtype) + + self.src = np.random.random(self.input_size).astype(self.dtype) + self.dout = np.random.random(self.output_size).astype(self.dtype) + + def getShape(self): + self.in_channels = 8 + self.out_channels = 8 + self.stride = 1 + self.input_size = [2, 8, 32, 32] # NCHW + self.output_size = [2, 8, 32, 32] # NCHW + + def getDiff(self): + self.rtol = 1e-3 + self.atol = 1e-3 + + def getShortcut(self): + self.has_shortcut = False + + def Base(self): + paddle.disable_static() + + conv1_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv2_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv3_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + bn1_weight = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.0)) + bn1_bias = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.0)) + bn2_weight = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.0)) + bn2_bias = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.0)) + bn3_weight = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.0)) + bn3_bias = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.0)) + + self.conv1 = nn.Conv2D(in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=self.stride, + padding=1, + weight_attr=conv1_weight, + bias_attr=None, + data_format='NCHW') + self.bn1 = nn.BatchNorm(self.out_channels, + act='relu', + param_attr=bn1_weight, + bias_attr=bn1_bias, + data_layout='NCHW') + self.conv2 = nn.Conv2D(in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + weight_attr=conv2_weight, + bias_attr=None, + data_format='NCHW') + self.bn2 = nn.BatchNorm(self.out_channels, + act=None, + 
param_attr=bn2_weight, + bias_attr=bn2_bias, + data_layout='NCHW') + self.conv3 = nn.Conv2D(in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=1, + stride=self.stride, + padding=0, + weight_attr=conv3_weight, + bias_attr=None, + data_format='NCHW') + self.bn3 = nn.BatchNorm(self.out_channels, + act=None, + param_attr=bn3_weight, + bias_attr=bn3_bias, + data_layout='NCHW') + self.relu = nn.ReLU() + + tensor_src = paddle.to_tensor(self.src, stop_gradient=False) + if self.has_shortcut: + z_out = self.bn3(self.conv3(tensor_src)) + else: + z_out = tensor_src + bn1_out = self.bn1(self.conv1(tensor_src)) + bn2_out = self.bn2(self.conv2(bn1_out)) + result = self.relu(bn2_out + z_out) + paddle.autograd.backward([result], [paddle.to_tensor(self.dout)], + True) + return result, tensor_src.grad + + def FusedResNetBasicBlock(self): + paddle.disable_static() + + fused_conv1_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + fused_conv2_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + fused_conv3_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + fused_bn1_weight = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0)) + fused_bn1_bias = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.0)) + fused_bn2_weight = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0)) + fused_bn2_bias = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.0)) + fused_bn3_weight = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0)) + fused_bn3_bias = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.0)) + + if self.has_shortcut: + self.resnet_basic_block = ResNetBasicBlock( + num_channels1=self.in_channels, + num_filter1=self.out_channels, + filter1_size=3, + num_channels2=self.out_channels, + num_filter2=self.out_channels, + filter2_size=3, + num_channels3=self.in_channels, + num_filter3=self.out_channels, + filter3_size=1, + filter1_attr=fused_conv1_weight, + scale1_attr=fused_bn1_weight, + bias1_attr=fused_bn1_bias, + filter2_attr=fused_conv2_weight, + scale2_attr=fused_bn2_weight, + bias2_attr=fused_bn2_bias, + filter3_attr=fused_conv3_weight, + scale3_attr=fused_bn3_weight, + bias3_attr=fused_bn3_bias, + stride1=self.stride, + stride2=1, + stride3=self.stride, + act='relu', + padding1=1, + padding2=1, + padding3=0, + has_shortcut=True) + else: + self.resnet_basic_block = ResNetBasicBlock( + num_channels1=self.in_channels, + num_filter1=self.out_channels, + filter1_size=3, + num_channels2=self.out_channels, + num_filter2=self.out_channels, + filter2_size=3, + num_channels3=self.in_channels, + num_filter3=self.out_channels, + filter3_size=1, + filter1_attr=fused_conv1_weight, + scale1_attr=fused_bn1_weight, + bias1_attr=fused_bn1_bias, + filter2_attr=fused_conv2_weight, + scale2_attr=fused_bn2_weight, + bias2_attr=fused_bn2_bias, + filter3_attr=fused_conv3_weight, + scale3_attr=fused_bn3_weight, + bias3_attr=fused_bn3_bias, + stride1=self.stride, + stride2=1, + stride3=self.stride, + act='relu', + padding1=1, + padding2=1, + padding3=1, + has_shortcut=False) + + x = paddle.to_tensor(self.src, stop_gradient=False) + out = self.resnet_basic_block.forward(x) + paddle.autograd.backward([out], [paddle.to_tensor(self.dout)]) + return out, x.grad + + def test_out_and_grad_has_shortcut(self): + self.has_shortcut = True + 
default_main_program().random_seed = 1 + base_out, base_grad = self.Base() + fused_out, fused_grad = self.FusedResNetBasicBlock() + np.testing.assert_allclose(base_out.numpy(), + fused_out.numpy(), + rtol=self.rtol, + atol=self.atol) + np.testing.assert_allclose(base_grad.numpy(), + fused_grad.numpy(), + rtol=self.rtol, + atol=self.atol) + + def test_out_and_grad(self): + self.has_shortcut = False + default_main_program().random_seed = 1 + base_out, base_grad = self.Base() + fused_out, fused_grad = self.FusedResNetBasicBlock() + np.testing.assert_allclose(base_out.numpy(), + fused_out.numpy(), + rtol=self.rtol, + atol=self.atol) + np.testing.assert_allclose(base_grad.numpy(), + fused_grad.numpy(), + rtol=self.rtol, + atol=self.atol) + + +support_types = get_xpu_op_support_types('resnet_basic_block') +for stype in support_types: + create_test_class(globals(), + XPUTestResNetBasicBlockOp, + stype, + ignore_deivce_version=[core.XPUVersion.XPU1]) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py index 0a0a9bb3d365d..0b2470228b94a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py @@ -21,25 +21,293 @@ import numpy as np import paddle import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.op import Operator -from paddle.fluid.executor import Executor -from op_test import OpTest -from test_gaussian_random_op import TestGaussianRandomOp +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper +import paddle paddle.enable_static() -class TestXPUGaussianRandomOp(TestGaussianRandomOp): +class XPUTestGaussianRandomOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'gaussian_random' + self.use_dynamic_create_class = False + + class TestGaussianRandomOp(XPUOpTest): + + def init(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = 'gaussian_random' + + def setUp(self): + self.init() + self.python_api = paddle.normal + self.set_attrs() + self.inputs = {} + self.use_mkldnn = False + self.attrs = { + "shape": [123, 92], + "mean": self.mean, + "std": self.std, + "seed": 10, + "use_mkldnn": self.use_mkldnn + } + paddle.seed(10) + + self.outputs = {'Out': np.zeros((123, 92), dtype=self.dtype)} + + def set_attrs(self): + self.mean = 1.0 + self.std = 2. 
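(Illustrative aside, not part of this patch: the get_xpu_op_support_types / create_test_class idiom used throughout these converted files is assumed to expand roughly as sketched below, which is why the inner tests read their dtype from self.in_type instead of hard-coding np.float32. The generated class name and the exact attributes set are assumptions, not the real helper's implementation.)

import numpy as np

# rough hand-written equivalent of create_test_class(globals(), XPUTestGaussianRandomOp, 'float32')
wrapper = XPUTestGaussianRandomOp()                   # wrapper class from this file; __init__ records op_name
base = wrapper.TestGaussianRandomOp                   # inner XPUOpTest subclass from this file
name = 'TestGaussianRandomOp_float32'                 # generated class name is an assumption
globals()[name] = type(name, (base,), {'in_type': np.float32})  # setUp then resolves the dtype via self.in_type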
+ + def test_check_output(self): + self.check_output_with_place_customized(self.verify_output, + self.place) + + def verify_output(self, outs): + self.assertEqual(outs[0].shape, (123, 92)) + hist, _ = np.histogram(outs[0], range=(-3, 5)) + hist = hist.astype("float32") + hist /= float(outs[0].size) + data = np.random.normal(size=(123, 92), loc=1, scale=2) + hist2, _ = np.histogram(data, range=(-3, 5)) + hist2 = hist2.astype("float32") + hist2 /= float(outs[0].size) + self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01), + "hist: " + str(hist) + " hist2: " + str(hist2)) + + class TestMeanStdAreInt(TestGaussianRandomOp): + + def set_attrs(self): + self.mean = 1 + self.std = 2 + + # Situation 2: Attr(shape) is a list(with tensor) + class TestGaussianRandomOp_ShapeTensorList(TestGaussianRandomOp): + + def setUp(self): + '''Test gaussian_random op with specified value + ''' + self.init() + self.init_data() + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.attrs = { + 'shape': self.infer_shape, + 'mean': self.mean, + 'std': self.std, + 'seed': self.seed, + 'use_mkldnn': self.use_mkldnn + } + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.outputs = {'Out': np.zeros(self.shape, dtype=self.dtype)} + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, 92] + self.use_mkldnn = False + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + def test_check_output(self): + self.check_output_with_place_customized(self.verify_output, + self.place) + + class TestGaussianRandomOp2_ShapeTensorList( + TestGaussianRandomOp_ShapeTensorList): + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, -1] + self.use_mkldnn = False + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + class TestGaussianRandomOp3_ShapeTensorList( + TestGaussianRandomOp_ShapeTensorList): + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.use_mkldnn = True + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + class TestGaussianRandomOp4_ShapeTensorList( + TestGaussianRandomOp_ShapeTensorList): + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.use_mkldnn = False + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + # Situation 3: shape is a tensor + class TestGaussianRandomOp1_ShapeTensor(TestGaussianRandomOp): + + def setUp(self): + '''Test gaussian_random op with specified value + ''' + self.init() + self.init_data() + self.use_mkldnn = False + + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + self.attrs = { + 'mean': self.mean, + 'std': self.std, + 'seed': self.seed, + 'use_mkldnn': self.use_mkldnn + } + self.outputs = {'Out': np.zeros((123, 92), dtype=self.dtype)} + + def init_data(self): + self.shape = [123, 92] + self.use_mkldnn = False + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + +# Test python API +class TestGaussianRandomAPI(unittest.TestCase): + + def test_api(self): + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2000) + + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 500) + shape_tensor_int32 = fluid.data(name="shape_tensor_int32", + shape=[2], + dtype="int32") + + shape_tensor_int64 = fluid.data(name="shape_tensor_int64", + shape=[2], + dtype="int64") + + out_1 = fluid.layers.gaussian_random(shape=[2000, 500], + dtype="float32", + mean=0.0, + std=1.0, + seed=10) + + out_2 = fluid.layers.gaussian_random(shape=[2000, 
positive_2_int32], + dtype="float32", + mean=0., + std=1.0, + seed=10) + + out_3 = fluid.layers.gaussian_random(shape=[2000, positive_2_int64], + dtype="float32", + mean=0., + std=1.0, + seed=10) + + out_4 = fluid.layers.gaussian_random(shape=shape_tensor_int32, + dtype="float32", + mean=0., + std=1.0, + seed=10) + + out_5 = fluid.layers.gaussian_random(shape=shape_tensor_int64, + dtype="float32", + mean=0., + std=1.0, + seed=10) + + out_6 = fluid.layers.gaussian_random(shape=shape_tensor_int64, + dtype=np.float32, + mean=0., + std=1.0, + seed=10) + + exe = fluid.Executor(place=fluid.XPUPlace(0)) + res_1, res_2, res_3, res_4, res_5, res_6 = exe.run( + fluid.default_main_program(), + feed={ + "shape_tensor_int32": np.array([2000, 500]).astype("int32"), + "shape_tensor_int64": np.array([2000, 500]).astype("int64"), + }, + fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6]) + + self.assertAlmostEqual(np.mean(res_1), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_1), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_2), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_2), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_3), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_3), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_4), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_5), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_5), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_5), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_6), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_6), 1., delta=0.1) + + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.gaussian([2, 3]) + + self.assertRaises(TypeError, test_default_fp16) + + def test_default_fp32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.gaussian([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.gaussian([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp64() + test_default_fp32() + + paddle.enable_static() + + +class TestStandardNormalDtype(unittest.TestCase): + + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.standard_normal([2, 3]) + + self.assertRaises(TypeError, test_default_fp16) + + def test_default_fp32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp64() + test_default_fp32() + + paddle.enable_static() - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - outs = self.calc_output(place) - outs = [np.array(out) for out in outs] - outs.sort(key=len) - self.verify_output(outs) +support_types = get_xpu_op_support_types('gaussian_random') +for stype in support_types: + create_test_class(globals(), XPUTestGaussianRandomOp, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py new 
file mode 100644 index 0000000000000..967815cc559ee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py @@ -0,0 +1,284 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle + +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def AffineGrid(theta, grid_shape): + n = grid_shape[0] + h = grid_shape[1] + w = grid_shape[2] + h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w, + axis=0).T[:, :, np.newaxis] + w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h, + axis=0)[:, :, np.newaxis] + grid = np.concatenate([w_idx, h_idx, np.ones([h, w, 1])], + axis=2) # h * w * 3 + grid = np.repeat(grid[np.newaxis, :], n, axis=0) # n * h * w *3 + + ret = np.zeros([n, h * w, 2]) + theta = theta.transpose([0, 2, 1]) + for i in range(len(theta)): + ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i]) + + return ret.reshape([n, h, w, 2]).astype("float64") + + +def getGridPointValue(data, x, y): + data_shape = data.shape + N = data_shape[0] + C = data_shape[1] + in_H = data_shape[2] + in_W = data_shape[3] + out_H = x.shape[1] + out_W = x.shape[2] + + #out = np.zeros(data_shape, dtype='float64') + out = np.zeros([N, C, out_H, out_W], dtype='float64') + for i in range(N): + for j in range(out_H): + for k in range(out_W): + if y[i, j, k] < 0 or y[i, j, k] > in_H - 1 or x[ + i, j, k] < 0 or x[i, j, k] > in_W - 1: + out[i, :, j, k] = 0 + else: + out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]] + + return out + + +def clip(x, min_n, max_n): + return np.maximum(np.minimum(x, max_n), min_n) + + +def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode): + if align_corners: + grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * max_val) + else: + grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * + (max_val + 1)) - 0.5 + + if padding_mode == "border": + grid_slice = clip(grid_slice, 0, max_val) + elif padding_mode == "reflection": + double_range = 2 * max_val if align_corners else (max_val + 1) * 2 + grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice + + 0.5) + extra = grid_abs - np.floor(grid_abs / double_range) * double_range + grid_slice = np.minimum(extra, double_range - extra) + grid_slice = grid_slice if align_corners else clip( + grid_slice - 0.5, 0, max_val) + return grid_slice + + +def GridSampler(data, + grid, + align_corners=True, + mode="bilinear", + padding_mode="zeros"): + dims = data.shape + N = dims[0] + in_C = dims[1] + in_H = dims[2] + in_W = dims[3] + + out_H = grid.shape[1] + out_W = grid.shape[2] + + x = grid[:, :, :, 0] + y = grid[:, :, :, 1] + y_max = in_H - 1 + x_max = in_W - 1 + + x = unnormalizeAndClip(x, x_max, align_corners, padding_mode) + y = 
unnormalizeAndClip(y, y_max, align_corners, padding_mode) + + if mode == "bilinear": + x0 = np.floor(x).astype('int32') + x1 = x0 + 1 + y0 = np.floor(y).astype('int32') + y1 = y0 + 1 + + wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + + va = getGridPointValue(data, x0, y0) + vb = getGridPointValue(data, x0, y1) + vc = getGridPointValue(data, x1, y0) + vd = getGridPointValue(data, x1, y1) + + out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64') + elif mode == "nearest": + x = np.round(x).astype('int32') + y = np.round(y).astype('int32') + out = getGridPointValue(data, x, y) + return out + + +class XPUTestGridSamplerOP(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'grid_sampler' + self.use_dynamic_create_class = False + + class TestXPUGridSamplerOp(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_dtype() + self.op_type = 'grid_sampler' + + self.use_cudnn = False + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + + self.initTestCase() + + x = np.random.uniform(-10, 10, self.x_shape).astype(self.dtype) + + theta = np.zeros(self.theta_shape).astype(self.dtype) + for i in range(self.theta_shape[0]): + for j in range(2): + for k in range(3): + theta[i, j, k] = np.random.rand(1)[0] + grid = AffineGrid(theta, self.grid_shape).astype(self.dtype) + + self.inputs = {'X': x, 'Grid': grid} + self.attrs = { + 'use_cudnn': self.use_cudnn, + "align_corners": self.align_corners, + "padding_mode": self.padding_mode, + "mode": self.mode, + } + self.outputs = { + 'Output': + GridSampler(x, grid, self.align_corners, self.mode, + self.padding_mode) + } + + def initTestCase(self): + self.x_shape = (2, 3, 8, 8) + self.grid_shape = (2, 7, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X', 'Grid'], 'Output') + + class TestGridSample1(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "zeros" + self.mode = "bilinear" + + class TestGridSample2(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "border" + self.mode = "bilinear" + + class TestGridSample3(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflection" + self.mode = "bilinear" + + class TestGridSample4(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "reflection" + self.mode = "bilinear" + + class TestGridSample5(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + 
self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflection" + self.mode = "nearest" + + class TestGridSample6(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 128, 128) + self.grid_shape = (2, 130, 130, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflection" + self.mode = "bilinear" + + class TestGridSample7(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 128, 128) + self.grid_shape = (2, 130, 130, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + + +support_types = get_xpu_op_support_types('grid_sampler') +for stype in support_types: + create_test_class(globals(), XPUTestGridSamplerOP, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py index ceb154f1e3520..46e82c68321ee 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py @@ -13,108 +13,120 @@ # limitations under the License. from __future__ import print_function -import unittest -import sys - -sys.path.append("..") - import unittest import numpy as np import numpy.random as random import sys -import math -from op_test import OpTest + +sys.path.append("..") from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper import paddle paddle.enable_static() -class TestXPUIOUSimilarityOp(XPUOpTest): - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def setUp(self): - self.op_type = "iou_similarity" - self.boxes1 = random.rand(2, 4).astype('float32') - self.boxes2 = random.rand(3, 4).astype('float32') - self.output = random.rand(2, 3).astype('float32') - self.box_normalized = False - # run python iou computation - self._compute_iou() - self.inputs = {'X': self.boxes1, 'Y': self.boxes2} - self.attrs = {"box_normalized": self.box_normalized, 'use_xpu': True} - self.outputs = {'Out': self.output} - - def _compute_iou(self, ): - for row in range(self.boxes1.shape[0]): - for col in range(self.boxes2.shape[0]): - xmin1, ymin1, xmax1, ymax1 = self.boxes1[row] - xmin2, ymin2, xmax2, ymax2 = self.boxes2[col] - if not self.box_normalized: - area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1) - area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1) - else: - area1 = (ymax1 - ymin1) * (xmax1 - xmin1) - area2 = (ymax2 - ymin2) * (xmax2 - xmin2) - - inter_xmax = min(xmax1, xmax2) - inter_ymax = min(ymax1, ymax2) - inter_xmin = max(xmin1, xmin2) - inter_ymin = max(ymin1, ymin2) - inter_height = inter_ymax - inter_ymin - inter_width = inter_xmax - inter_xmin - if not self.box_normalized: - inter_height += 1 - inter_width += 1 - inter_height = max(inter_height, 0) - inter_width = max(inter_width, 0) - inter_area = inter_width * inter_height - union_area = area1 + area2 - inter_area - sim_score = inter_area / union_area - self.output[row, col] = sim_score - - -class TestXPUIOUSimilarityOpWithLoD(TestXPUIOUSimilarityOp): - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) - - def setUp(self): - super(TestXPUIOUSimilarityOpWithLoD, self).setUp() - 
self.boxes1_lod = [[1, 1]] - self.output_lod = [[1, 1]] - self.box_normalized = False - # run python iou computation - self._compute_iou() - self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} - self.attrs = {"box_normalized": self.box_normalized} - self.outputs = {'Out': (self.output, self.output_lod)} - - -class TestXPUIOUSimilarityOpWithBoxNormalized(TestXPUIOUSimilarityOp): - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) - - def setUp(self): - super(TestXPUIOUSimilarityOpWithBoxNormalized, self).setUp() - self.boxes1_lod = [[1, 1]] - self.output_lod = [[1, 1]] - self.box_normalized = True - # run python iou computation - self._compute_iou() - self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} - self.attrs = {"box_normalized": self.box_normalized} - self.outputs = {'Out': (self.output, self.output_lod)} - +class XPUTestIOUSimilarityOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'iou_similarity' + self.use_dynamic_create_class = False + + class TestXPUIOUSimilarityOp(XPUOpTest): + + def init(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = 'iou_similarity' + + def test_check_output(self): + self.check_output_with_place(self.place) + + def setUp(self): + self.init() + self.boxes1 = random.rand(2, 4).astype(self.dtype) + self.boxes2 = random.rand(3, 4).astype(self.dtype) + self.output = random.rand(2, 3).astype(self.dtype) + self.box_normalized = False + # run python iou computation + self._compute_iou() + self.inputs = {'X': self.boxes1, 'Y': self.boxes2} + self.attrs = { + "box_normalized": self.box_normalized, + 'use_xpu': True + } + self.outputs = {'Out': self.output} + + def _compute_iou(self, ): + for row in range(self.boxes1.shape[0]): + for col in range(self.boxes2.shape[0]): + xmin1, ymin1, xmax1, ymax1 = self.boxes1[row] + xmin2, ymin2, xmax2, ymax2 = self.boxes2[col] + if not self.box_normalized: + area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1) + area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1) + else: + area1 = (ymax1 - ymin1) * (xmax1 - xmin1) + area2 = (ymax2 - ymin2) * (xmax2 - xmin2) + + inter_xmax = min(xmax1, xmax2) + inter_ymax = min(ymax1, ymax2) + inter_xmin = max(xmin1, xmin2) + inter_ymin = max(ymin1, ymin2) + inter_height = inter_ymax - inter_ymin + inter_width = inter_xmax - inter_xmin + if not self.box_normalized: + inter_height += 1 + inter_width += 1 + inter_height = max(inter_height, 0) + inter_width = max(inter_width, 0) + inter_area = inter_width * inter_height + union_area = area1 + area2 - inter_area + sim_score = inter_area / union_area + self.output[row, col] = sim_score + + class TestXPUIOUSimilarityOpWithLoD(TestXPUIOUSimilarityOp): + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def setUp(self): + super().setUp() + self.boxes1_lod = [[1, 1]] + self.output_lod = [[1, 1]] + self.box_normalized = False + # run python iou computation + self._compute_iou() + self.inputs = { + 'X': (self.boxes1, self.boxes1_lod), + 'Y': self.boxes2 + } + self.attrs = {"box_normalized": self.box_normalized} + self.outputs = {'Out': (self.output, self.output_lod)} + + class TestXPUIOUSimilarityOpWithBoxNormalized(TestXPUIOUSimilarityOp): + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def setUp(self): + super().setUp() + self.boxes1_lod = [[1, 1]] + self.output_lod = [[1, 
1]] + self.box_normalized = True + # run python iou computation + self._compute_iou() + self.inputs = { + 'X': (self.boxes1, self.boxes1_lod), + 'Y': self.boxes2 + } + self.attrs = {"box_normalized": self.box_normalized} + self.outputs = {'Out': (self.output, self.output_lod)} + + +support_types = get_xpu_op_support_types('iou_similarity') +for stype in support_types: + create_test_class(globals(), XPUTestIOUSimilarityOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py index 1f2caa9fbe9d8..8cab945b45978 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py @@ -20,7 +20,9 @@ sys.path.append("..") from op_test import OpTest +from op_test_xpu import XPUOpTest from operator import mul +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() @@ -42,77 +44,77 @@ def ref_layer_norm(x, scale, bias, epsilon, begin_norm_axis=1): return y, mean, variance -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOp(OpTest): +class XPUTestLayerNormOp(XPUOpTestWrapper): - def setUp(self): - self.op_type = "layer_norm" - self.dtype = np.float32 - self.shape = [2, 3, 4, 5] - self.epsilon = 1e-05 - self.begin_norm_axis = 1 - self.set_attrs() + def __init__(self): + self.op_name = 'layer_norm' + self.use_dynamic_create_class = False - right = reduce(mul, self.shape[self.begin_norm_axis:len(self.shape)], 1) - np.random.seed(10) - x_np = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) - scale_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) - bias_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) - ref_y_np, ref_mean_np, ref_variance_np = ref_layer_norm( - x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis) + class TestXPULayerNormOp(XPUOpTest): - self.inputs = {'X': x_np, 'Scale': scale_np, 'Bias': bias_np} - self.outputs = { - 'Y': ref_y_np, - 'Mean': ref_mean_np, - 'Variance': ref_variance_np - } - self.attrs = {'begin_norm_axis': self.begin_norm_axis, 'use_xpu': True} + def setUp(self): + self.op_type = "layer_norm" + self.dtype = self.in_type + self.shape = [2, 3, 4, 5] + self.epsilon = 1e-05 + self.begin_norm_axis = 1 + self.set_attrs() - def set_attrs(self): - pass + right = reduce(mul, + self.shape[self.begin_norm_axis:len(self.shape)], 1) + np.random.seed(10) + x_np = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + scale_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) + bias_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) + ref_y_np, ref_mean_np, ref_variance_np = ref_layer_norm( + x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis) - def test_check_output(self): - self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) + self.inputs = {'X': x_np, 'Scale': scale_np, 'Bias': bias_np} + self.outputs = { + 'Y': ref_y_np, + 'Mean': ref_mean_np, + 'Variance': ref_variance_np + } + self.attrs = { + 'begin_norm_axis': self.begin_norm_axis, + 'use_xpu': True + } - def test_check_grad(self): - self.check_grad_with_place(paddle.XPUPlace(0), ['X'], - 'Y', - max_relative_error=0.02) + def set_attrs(self): + pass + def test_check_output(self): + self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not 
compiled with XPU") -class TestXPULayerNormOpAxis2(TestXPULayerNormOp): + def test_check_grad(self): + self.check_grad_with_place(paddle.XPUPlace(0), ['X'], + 'Y', + max_relative_error=0.02) - def set_attrs(self): - self.begin_norm_axis = 2 + class TestXPULayerNormOpAxis2(TestXPULayerNormOp): + def set_attrs(self): + self.begin_norm_axis = 2 -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOpAxis3(TestXPULayerNormOp): + class TestXPULayerNormOpAxis3(TestXPULayerNormOp): - def set_attrs(self): - self.begin_norm_axis = 3 + def set_attrs(self): + self.begin_norm_axis = 3 + class TestXPULayerNormOp2D(TestXPULayerNormOp): -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOp2D(TestXPULayerNormOp): + def set_attrs(self): + self.shape = [10, 12] - def set_attrs(self): - self.shape = [10, 12] + class TestXPULayerNormOp3D(TestXPULayerNormOp): + def set_attrs(self): + self.shape = [4, 5, 6] -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOp3D(TestXPULayerNormOp): - - def set_attrs(self): - self.shape = [4, 5, 6] +support_types = get_xpu_op_support_types('layer_norm') +for stype in support_types: + create_test_class(globals(), XPUTestLayerNormOp, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py new file mode 100644 index 0000000000000..e7e730d9b2e25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py @@ -0,0 +1,107 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys + +sys.path.append("..") +from op_test import OpTest + +import paddle +import paddle.fluid.core as core +import paddle.nn.functional as F + +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() +np.random.seed(10) + + +def ref_log_softmax(x): + shiftx = (x - np.max(x)) + out = shiftx - np.log(np.exp(shiftx).sum()) + return out + + +def ref_log_softmax_grad(x, axis): + if axis < 0: + axis += len(x.shape) + out = np.apply_along_axis(ref_log_softmax, axis, x) + axis_dim = x.shape[axis] + dout = np.full_like(x, fill_value=1. 
/ x.size) + dx = dout - np.exp(out) * dout.copy().sum(axis=axis, keepdims=True).repeat( + axis_dim, axis=axis) + return dx + + +class XPUTestLogSoftmaxOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'log_softmax' + self.use_dynamic_create_class = True + + def dynamic_create_class(self): + base_class = self.TestXPULogSoftmaxOp + classes = [] + axis_arr = [-1, 1] + shape_arr = [[2, 3, 4, 5], [12, 10], [2, 5], [7, 7], [3, 5, 7]] + for axis in axis_arr: + for shape in shape_arr: + class_name = 'XPUTestLogSoftmax_' + \ + str(axis) + "_" + str(shape) + attr_dict = {'axis': axis, 'shape': shape} + classes.append([class_name, attr_dict]) + return base_class, classes + + class TestXPULogSoftmaxOp(XPUOpTest): + + def setUp(self): + self.op_type = 'log_softmax' + self.python_api = F.log_softmax + self.dtype = 'float32' + self.set_attrs() + self.use_xpu = True + if not hasattr(self, 'axis'): + self.shape = [2, 3, 4, 5] + self.axis = -1 + + x = np.random.uniform(0.1, 1., self.shape).astype(self.dtype) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis} + + def set_attrs(self): + pass + + def test_check_output(self): + self.check_output(check_eager=True) + + def test_check_grad(self): + self.check_grad(['X'], ['Out'], + user_defined_grads=[self.x_grad], + check_eager=True) + + +support_types = get_xpu_op_support_types('log_softmax') +for stype in support_types: + create_test_class(globals(), XPUTestLogSoftmaxOp, stype) + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py index bc6fa19a35444..73f61c2d9d5ba 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py @@ -294,6 +294,10 @@ def setUp(self): self.op_type = "matmul" self.dtype = np.float32 if not hasattr(self, 'in_type') else self.in_type + + self.__class__.no_need_check_grad = False if not hasattr( + self, 'no_need_check_grad') else self.no_need_check_grad + shape_X = [4, 5] if not hasattr(self, 'shape_X') else self.shape_X shape_Y = [5, 6] if not hasattr(self, 'shape_Y') else self.shape_Y transpose_X = False if not hasattr(self, @@ -303,7 +307,8 @@ def setUp(self): X = np.random.random(shape_X).astype(self.dtype) Y = np.random.random(shape_Y).astype(self.dtype) - Out = reference_matmul(X, Y, transpose_X, transpose_Y) + Out = reference_matmul(X, Y, transpose_X, + transpose_Y).astype(self.dtype) self.inputs = {'X': X, 'Y': Y} self.attrs = {'transpose_X': transpose_X, 'transpose_Y': transpose_Y} self.outputs = {'Out': Out} @@ -313,12 +318,20 @@ def test_check_output(self): self.check_output_with_place(place, atol=1e-3) def test_check_grad_normal(self): + if hasattr(self.__class__, "no_need_check_grad" + ) and self.__class__.no_need_check_grad == True: + return + place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['X', 'Y'], 'Out', max_relative_error=5e-2) def test_check_grad_ignore_x(self): + if hasattr(self.__class__, "no_need_check_grad" + ) and self.__class__.no_need_check_grad == True: + return + place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['Y'], 'Out', @@ -326,6 +339,10 @@ def test_check_grad_ignore_x(self): no_grad_set=set("X")) def test_check_grad_ignore_y(self): + if hasattr(self.__class__, "no_need_check_grad" 
+ ) and self.__class__.no_need_check_grad == True: + return + place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['X'], 'Out', @@ -350,6 +367,9 @@ def dynamic_create_class(self): for transose_x in [True, False]: for transose_y in [True, False]: for batch in batch_size: + no_need_check_grad = False + if batch >= 5: + no_need_check_grad = True class_name = ( 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}' .format(dim_X, dim_Y, transose_x, transose_y, @@ -361,6 +381,7 @@ def dynamic_create_class(self): 'shape_Y': shape_y, 'transpose_X': transose_x, 'transpose_Y': transose_y, + 'no_need_check_grad': no_need_check_grad, 'op_type': "matmul" } classes.append([class_name, attr_dict]) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index 8f31981355403..92b9ae3ae8998 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -80,6 +80,8 @@ def setUp(self): self.dtype = self.in_type self.config() self.op_type = "matmul_v2" + if self.dtype == np.float16 or self.dtype == "float16": + self.__class__.no_need_check_grad = True x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 @@ -99,6 +101,9 @@ def test_check_output(self): self.check_output_with_place(place) def test_check_grad(self): + if hasattr(self.__class__, "no_need_check_grad" + ) and self.__class__.no_need_check_grad == True: + return place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['X', 'Y'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py index 0ddc38dbceba6..cd21dcca4c0ab 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py @@ -28,29 +28,66 @@ np.random.seed(10) +import op_test +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestMeanOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'mean' + self.use_dynamic_create_class = False + + class TestMeanOp(XPUOpTest): + + def setUp(self): + self.init_dtype() + self.set_xpu() + self.op_type = "mean" + self.place = paddle.XPUPlace(0) + self.set_shape() + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.outputs = {'Out': np.mean(self.inputs["X"]).astype(np.float16)} + + def init_dtype(self): + self.dtype = self.in_type + + def set_shape(self): + self.shape = (10, 10) + + def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + self.__class__.op_type = self.dtype + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_checkout_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') -class TestMeanOp(XPUOpTest): + class TestMeanOp1(TestMeanOp): - def setUp(self): - self.op_type = "mean" - self.init_dtype_type() - self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)} - self.outputs = {'Out': np.mean(self.inputs["X"]).astype(np.float16)} + def set_shape(self): + self.shape = (5) - def init_dtype_type(self): - self.dtype = np.float32 + class TestMeanOp2(TestMeanOp): - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) 
- self.check_output_with_place(place, atol=2e-3) + def set_shape(self): + self.shape = (5, 7, 8) - def test_checkout_grad(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') + class TestMeanOp3(TestMeanOp): + + def set_shape(self): + self.shape = (10, 5, 7, 8) + + class TestMeanOp4(TestMeanOp): + + def set_shape(self): + self.shape = (2, 2, 3, 3, 3) class TestMeanOpError(unittest.TestCase): @@ -59,55 +96,21 @@ def test_errors(self): with program_guard(Program(), Program()): # The input type of mean_op must be Variable. input1 = 12 - self.assertRaises(TypeError, fluid.layers.mean, input1) + self.assertRaises(TypeError, paddle.mean, input1) # The input dtype of mean_op must be float16, float32, float64. input2 = fluid.layers.data(name='input2', shape=[12, 10], dtype="int32") - self.assertRaises(TypeError, fluid.layers.mean, input2) + self.assertRaises(TypeError, paddle.mean, input2) input3 = fluid.layers.data(name='input3', shape=[4], dtype="float16") fluid.layers.softmax(input3) -class TestXPUMeanOp(TestMeanOp): - - def init_dtype_type(self): - self.dtype = np.float32 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_checkout_grad(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') - - -class TestXPUMeanOpFp16(TestMeanOp): - - def init_dtype_type(self): - self.dtype = np.float16 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_checkout_grad(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], - 'Out', - max_relative_error=1.e1) - +support_types = get_xpu_op_support_types('mean') +for stype in support_types: + create_test_class(globals(), XPUTestMeanOp, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py index afeccd637a265..d45e0ce34d42f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py @@ -18,136 +18,130 @@ import numpy as np import paddle import paddle.fluid.core as core +import paddle.fluid as fluid import sys sys.path.append("..") from op_test_xpu import XPUOpTest -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard -import time +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestOneHotOp(XPUOpTest): +class XPUTestOneHotOp(XPUOpTestWrapper): - def setUp(self): - self.use_xpu = True - self.op_type = 'one_hot_v2' - depth = 10 - depth_np = np.array(10).astype('int32') - # dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + def __init__(self): + self.op_name = 'one_hot_v2' + self.use_dynamic_create_class = False - out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + class TestOneHotOp(XPUOpTest): - for i in range(np.product(x.shape)): - out[i, x[i]] = 1.0 + def init(self): + self.dtype = 
self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = 'one_hot_v2' - self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} - self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} - self.outputs = {'Out': (out, x_lod)} + def setUp(self): + self.init() + depth = 10 + depth_np = np.array(10).astype('int32') + # dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + out = np.zeros(shape=(np.product(x.shape), + depth)).astype(self.dtype) + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 -class TestOneHotOp_attr(XPUOpTest): + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} - def setUp(self): - self.op_type = 'one_hot_v2' - depth = 10 - dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + def test_check_output(self): + self.check_output_with_place(self.place) - out = np.zeros(shape=(np.product(x.shape[:-1]), 1, - depth)).astype('float32') + class TestOneHotOp_attr(TestOneHotOp): - for i in range(np.product(x.shape)): - out[i, 0, x[i]] = 1.0 + def setUp(self): + self.init() + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) - self.inputs = {'X': (x, x_lod)} - self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} - self.outputs = {'Out': (out, x_lod)} + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype(self.dtype) - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + self.inputs = {'X': (x, x_lod)} + self.attrs = { + 'dtype': int(core.VarDesc.VarType.FP32), + 'depth': depth + } + self.outputs = {'Out': (out, x_lod)} -class TestOneHotOp_default_dtype(XPUOpTest): + class TestOneHotOp_default_dtype(TestOneHotOp): - def setUp(self): - self.op_type = 'one_hot_v2' - depth = 10 - depth_np = np.array(10).astype('int32') - dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + def setUp(self): + self.init() + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) - out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + out = np.zeros(shape=(np.product(x.shape), + depth)).astype(self.dtype) - for i in range(np.product(x.shape)): - out[i, x[i]] = 1.0 + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 - self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} - self.attrs = {} - self.outputs = {'Out': (out, x_lod)} + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + class TestOneHotOp_default_dtype_attr(TestOneHotOp): + def setUp(self): + self.init() 
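
The rewritten one_hot_v2 cases build their expected outputs by hand with the loop out[i, x[i]] = 1.0. A small standalone NumPy version of that reference expansion, including the all-zero rows that the out-of-range case further down in this file relies on; the helper name and its allow_out_of_range flag are illustrative, not a Paddle API:

import numpy as np


def one_hot_reference(x, depth, dtype=np.float32, allow_out_of_range=False):
    """Expand a 1-D integer array into a (len(x), depth) one-hot matrix."""
    x = np.asarray(x).reshape(-1)
    out = np.zeros((x.shape[0], depth), dtype=dtype)
    for i, idx in enumerate(x):
        if 0 <= idx < depth:
            out[i, idx] = 1.0
        elif not allow_out_of_range:
            raise ValueError("index %d is outside [0, %d)" % (idx, depth))
        # with allow_out_of_range=True the row stays all-zero
    return out


print(one_hot_reference([1, 0, 3], depth=4))
# [[0. 1. 0. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 1.]]
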
+ depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) -class TestOneHotOp_default_dtype_attr(XPUOpTest): + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype(self.dtype) - def setUp(self): - self.op_type = 'one_hot_v2' - depth = 10 - dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 - out = np.zeros(shape=(np.product(x.shape[:-1]), 1, - depth)).astype('float32') + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} - for i in range(np.product(x.shape)): - out[i, 0, x[i]] = 1.0 + class TestOneHotOp_out_of_range(TestOneHotOp): - self.inputs = {'X': (x, x_lod)} - self.attrs = {'depth': depth} - self.outputs = {'Out': (out, x_lod)} + def setUp(self): + self.init() + depth = 10 + x_lod = [[4, 1, 3, 3]] + x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + out = np.zeros(shape=(np.product(x.shape), + depth)).astype(self.dtype) - -class TestOneHotOp_out_of_range(XPUOpTest): - - def setUp(self): - self.op_type = 'one_hot_v2' - depth = 10 - x_lod = [[4, 1, 3, 3]] - x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) - - out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') - - self.inputs = {'X': (x, x_lod)} - self.attrs = {'depth': depth, 'allow_out_of_range': True} - self.outputs = {'Out': (out, x_lod)} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth, 'allow_out_of_range': True} + self.outputs = {'Out': (out, x_lod)} class TestOneHotOpApi(unittest.TestCase): @@ -200,6 +194,9 @@ def test_bad_x(): self.assertRaises(TypeError, test_bad_x) +support_types = get_xpu_op_support_types('one_hot_v2') +for stype in support_types: + create_test_class(globals(), XPUTestOneHotOp, stype) + if __name__ == '__main__': - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py new file mode 100644 index 0000000000000..049896527b940 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py @@ -0,0 +1,186 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
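
The new test_p_norm_op_xpu.py below checks the XPU kernel against a NumPy reference. As a quick standalone sanity check of the identities that reference relies on (the p-norm as (sum |x_i|^p)^(1/p), porder = inf as max |x_i|, and porder = 0 as the count of non-zero entries), all of which np.linalg.norm reproduces for 1-D input; the values here are arbitrary illustration:

import numpy as np

x = np.array([3.0, -4.0, 0.0, 1.0])

p2 = np.power(np.sum(np.abs(x) ** 2.0), 1.0 / 2.0)      # (sum |x_i|^p)^(1/p), p = 2
assert np.isclose(p2, np.linalg.norm(x, ord=2))

p_inf = np.amax(np.abs(x))                               # porder = inf -> max |x_i|
assert np.isclose(p_inf, np.linalg.norm(x, ord=np.inf))

p0 = np.sum(x.astype(bool))                              # porder = 0 -> number of non-zeros
assert p0 == np.count_nonzero(x)

print(p2, p_inf, p0)  # 5.0990..., 4.0, 3
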
+ +import paddle +import numpy as np +import sys +import unittest +from functools import reduce + +sys.path.append("..") +from op_test import OpTest +from op_test_xpu import XPUOpTest +from operator import mul +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def ref_p_norm(x, axis, porder, keepdims=False, reduce_all=False): + r = [] + if axis is None or reduce_all: + x = x.flatten() + if porder == np.inf: + r = np.amax(np.abs(x), keepdims=keepdims) + elif porder == -np.inf: + r = np.amin(np.abs(x), keepdims=keepdims) + else: + r = np.linalg.norm(x, ord=porder, keepdims=keepdims) + elif isinstance(axis, list or tuple) and len(axis) == 2: + if porder == np.inf: + axis = tuple(axis) + r = np.amax(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == -np.inf: + axis = tuple(axis) + r = np.amin(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == 0: + axis = tuple(axis) + r = x.astype(bool) + r = np.sum(r, axis, keepdims=keepdims) + elif porder == 1: + axis = tuple(axis) + r = np.sum(np.abs(x), axis, keepdims=keepdims) + else: + axis = tuple(axis) + xp = np.power(np.abs(x), porder) + s = np.sum(xp, axis=axis, keepdims=keepdims) + r = np.power(s, 1.0 / porder) + else: + if isinstance(axis, list): + axis = tuple(axis) + r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims) + r = r.astype(x.dtype) + + return r + + +class XPUTestPNormOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'p_norm' + self.use_dynamic_create_class = False + + class TestXPUPNormOp(XPUOpTest): + + def setUp(self): + self.op_type = "p_norm" + self.dtype = self.in_type + self.shape = [2, 3, 4, 5] + self.epsilon = 1e-12 + self.axis = 1 + self.porder = 2.0 + self.asvector = False + self.keepdims = False + self.set_attrs() + np.random.seed(12345) + + x_np = np.random.uniform(-10, 10, self.shape).astype(self.dtype) + + ref_y_np = ref_p_norm(x_np, self.axis, self.porder, self.keepdims, + self.asvector) + self.inputs = {'X': x_np} + self.outputs = {'Out': ref_y_np} + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'porder': float(self.porder), + 'asvector': self.asvector + } + + def set_attrs(self): + pass + + def test_check_output(self): + self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) + + def test_check_grad(self): + self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out') + + class TestPnormOp2(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = 2.0 + + class TestPnormOp3(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = np.inf + + class TestPnormOp4(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = -np.inf + + class TestPnormOp5(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = 0 + + class TestPnormOp6(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = -1 + self.porder = 2 + + class TestPnormOp7(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3, 10] + self.axis = 2 + self.porder = 2.0 + + class TestPnormOp8(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = np.inf + + class TestPnormOp9(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3, 10] + self.axis = 1 + self.porder = -np.inf + + class TestPnormOp10(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3, 10] + 
self.axis = 2 + self.porder = 0 + + class TestPnormOp11(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3, 10] + self.axis = -1 + self.porder = 2 + + +support_types = get_xpu_op_support_types('p_norm') +for stype in support_types: + create_test_class(globals(), XPUTestPNormOp, stype) + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py index 5ab62af7104e9..0d7121144adab 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py @@ -297,6 +297,7 @@ def setUp(self): 'exclusive': self.exclusive, 'adaptive': self.adaptive, "padding_algorithm": self.padding_algorithm, + 'ceil_mode': self.ceil_mode } self.outputs = {'Out': output} @@ -469,6 +470,77 @@ def init_test_case(self): def init_shape(self): self.shape = [2, 3, 7, 7] + class TestCaseCeil1(TestPool2D_Op): + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + + def init_paddings(self): + self.paddings = [0, 0] + + def init_pool_type(self): + self.pool_type = "avg" + self.pool2D_forward_naive = avg_pool2D_forward_naive + + def init_global_pool(self): + self.global_pool = False + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + def init_ceil_mode(self): + self.ceil_mode = True + + class TestCaseCeil2(TestPool2D_Op): + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + + def init_paddings(self): + self.paddings = [1, 1] + + def init_pool_type(self): + self.pool_type = "avg" + self.pool2D_forward_naive = avg_pool2D_forward_naive + + def init_global_pool(self): + self.global_pool = False + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + def init_ceil_mode(self): + self.ceil_mode = True + + class TestCaseCeil3(TestPool2D_Op): + + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + def init_ceil_mode(self): + self.ceil_mode = True + + class TestCaseCeil4(TestCaseCeil1): + + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + def init_ceil_mode(self): + self.ceil_mode = True + + class TestCaseCeil5(TestCaseCeil2): + + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + def init_ceil_mode(self): + self.ceil_mode = True + support_types = get_xpu_op_support_types('pool2d') for stype in support_types: diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py index ef483870c68ee..90fe474e09cd1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py @@ -19,196 +19,180 @@ import sys sys.path.append("..") -from op_test import OpTest, skip_check_grad_ci +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper import paddle -import paddle.fluid.core as core -import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard -from paddle.fluid.framework import convert_np_dtype_to_dtype_ +paddle.enable_static() -class TestMeanOp(OpTest): - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'use_xpu': True} - self.outputs = {'Out': 
self.inputs['X'].mean(axis=0)} +class XPUTestMeanOp(XPUOpTestWrapper): - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) + def __init__(self): + self.op_name = 'reduce_mean' + self.use_dynamic_create_class = False - def check_grad_(self): - self.check_grad(['X'], 'Out') + class TestMeanOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} -class TestMeanOp5D(OpTest): + def test_check_output(self): + self.check_output_with_place(self.place) - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = { - 'X': np.random.random((1, 2, 5, 6, 10)).astype("float32") - } - self.attrs = {'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestMeanOp6D(OpTest): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = { - 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float32") - } - self.attrs = {'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestMeanOp8D(OpTest): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = { - 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float32") - } - self.attrs = {'dim': (0, 3), 'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class Test1DReduce(OpTest): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random(120).astype("float32")} - self.attrs = {'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class Test2DReduce0(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [0], 'use_xpu': True} - self.inputs = {'X': np.random.random((20, 10)).astype("float32")} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} - - -class Test2DReduce1(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [1], 'use_xpu': True} - self.inputs = {'X': np.random.random((20, 10)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class Test3DReduce0(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [1], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class Test3DReduce1(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [2], 'use_xpu': True} - self.inputs = 
{'X': np.random.random((5, 6, 7)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class Test3DReduce2(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [-2], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class Test3DReduce3(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [1, 2], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class TestKeepDimReduce(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'dim': [1], 'keep_dim': True, 'use_xpu': True} - self.outputs = { - 'Out': - self.inputs['X'].mean(axis=tuple(self.attrs['dim']), - keepdims=self.attrs['keep_dim']) - } - - -class TestKeepDim8DReduce(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = { - 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float32") - } - self.attrs = {'dim': (3, 4, 5), 'keep_dim': True, 'use_xpu': True} - self.outputs = { - 'Out': - self.inputs['X'].mean(axis=tuple(self.attrs['dim']), - keepdims=self.attrs['keep_dim']) - } + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + class TestMeanOp5D(TestMeanOp): + + def setUp(self): + super().setUp() + self.inputs = { + 'X': np.random.random((1, 2, 5, 6, 10)).astype(self.dtype) + } + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + class TestMeanOp6D(TestMeanOp): + + def setUp(self): + super().setUp() + self.inputs = { + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype(self.dtype) + } + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + class TestMeanOp8D(TestMeanOp): + + def setUp(self): + super().setUp() + self.inputs = { + 'X': np.random.random( + (1, 3, 1, 2, 1, 4, 3, 10)).astype(self.dtype) + } + self.attrs = {'dim': (0, 3), 'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} + + +class XPUTestReduce(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'reduce_mean' + self.use_dynamic_create_class = False + + class Test1DReduce(XPUOpTest): + + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random(120).astype(self.dtype)} + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + # There is a api bug in checking grad when dim[0] > 0 + # def test_check_grad(self): + # self.check_output_with_place(self.place, ['X'], 'Out') + + class Test2DReduce0(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [0], 'use_xpu': True} + self.inputs = {'X': np.random.random((20, 10)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + class Test2DReduce1(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [1], 'use_xpu': True} + self.inputs = {'X': np.random.random((20, 10)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class Test3DReduce0(Test1DReduce): + + def 
setUp(self): + super().setUp() + self.attrs = {'dim': [1], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class Test3DReduce1(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class Test3DReduce2(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [-2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class Test3DReduce3(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [1, 2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class TestKeepDimReduce(Test1DReduce): + + def setUp(self): + super().setUp() + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [1], 'keep_dim': True, 'use_xpu': True} + self.outputs = { + 'Out': + self.inputs['X'].mean(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + class TestKeepDim8DReduce(Test1DReduce): + + def setUp(self): + super().setUp() + self.inputs = { + 'X': np.random.random( + (2, 5, 3, 2, 2, 3, 4, 2)).astype(self.dtype) + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True, 'use_xpu': True} + self.outputs = { + 'Out': + self.inputs['X'].mean(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + +support_types = get_xpu_op_support_types('reduce_mean') +for stype in support_types: + create_test_class(globals(), XPUTestMeanOp, stype) + create_test_class(globals(), XPUTestReduce, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py index 2e8853de44a9a..020dbf344b68a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py @@ -1,300 +1,164 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
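
The reduce_mean file above, like most files rewritten in this patch, ends with the same registration idiom: get_xpu_op_support_types returns the dtypes an op supports on XPU, and create_test_class stamps one concrete test class per dtype into the module namespace so the unittest loader can discover it. A dependency-free sketch of that mechanism; the fake support list and class factory below are illustrative only, not the real get_test_cover_info helpers:

import unittest


class MeanOpCaseBase(unittest.TestCase):
    in_type = None  # filled in by each generated class

    def test_dtype_is_set(self):
        if self.in_type is None:
            self.skipTest("un-parameterized base class")
        self.assertIn(self.in_type, ("float32", "float16"))


def fake_create_test_class(namespace, base, dtype):
    # Build "MeanOpCaseBase_float32"-style classes and register them in the
    # given namespace, mimicking what create_test_class() does.
    name = "%s_%s" % (base.__name__, dtype)
    namespace[name] = type(name, (base,), {"in_type": dtype})


for stype in ["float32", "float16"]:  # stand-in for get_xpu_op_support_types('reduce_mean')
    fake_create_test_class(globals(), MeanOpCaseBase, stype)

if __name__ == "__main__":
    unittest.main()
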
- -from __future__ import print_function -import sys - -sys.path.append("..") - -import unittest -import numpy as np -import paddle.fluid.core as core -from paddle.fluid.op import Operator -from op_test_xpu import XPUOpTest -import paddle.fluid as fluid -import paddle -''' -def create_selected_rows_and_tensor(scope, place, height, row_num, - embedding_size): - sr = scope.var("@selected_rows@").get_selected_rows() - tensor = scope.var("grad").get_tensor() - - rows = np.random.random_integers( - low=0, high=height - 1, size=[row_num, ]).astype('int64') - sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32') - - sr.set_height(height) - sr.set_rows(rows) - sr.get_tensor().set(sr_val, place) - - tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32') - for i in range(row_num): - row = rows[i] - tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :] - - tensor.set(tensor_val, place) - return tensor_val, sr_val -''' -""" -class TestBase(XPUOpTest): - op_type = 'rmsprop' - - def setup(self, - place, - is_sparse, - centered, - size, - row_num=None, - epsilon=1e-6): - - np.random.seed(5) # fix seed - - self.scope = fluid.global_scope() - self.place = place - - self.param_name = 'param' - self.param = np.random.random(size).astype('float32') - - self.mean_square_name = 'mean_square' - self.mean_square = np.random.uniform( - low=1, high=2, size=size).astype('float32') - - self.mean_grad_name = 'mean_grad' - self.mean_grad = np.random.random(size).astype('float32') - - self.lr_name = 'lr' - self.learning_rate = np.array([0.01]).astype('float32') - - self.grad_name = 'grad' - self.is_sparse = is_sparse - - self.grad = np.random.random(size).astype('float32') - grad_tensor = self.scope.var(self.grad_name).get_tensor() - grad_tensor.set(self.grad, place) - - self.moment_name = 'moment' - self.moment = np.random.uniform( - low=0, high=1, size=size).astype('float32') - - self.epsilon = epsilon - self.decay = 0.9 - self.momentum = 0.1 - self.centered = centered - - self.ms_out = self.decay * self.mean_square + (1 - self.decay - ) * self.grad * self.grad - if centered: - self.mg_out = self.decay * self.mean_grad + (1 - self.decay - ) * self.grad - self.moment_out = self.momentum * self.moment + \ - self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon) - else: - self.moment_out = self.momentum * self.moment + \ - self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon) - - self.param_out = self.param - self.moment_out - - # create and initialize Param Variable - self.param_tensor = self.scope.var(self.param_name).get_tensor() - self.param_tensor.set(self.param, place) - - self.mean_square_tensor = self.scope.var( - self.mean_square_name).get_tensor() - self.mean_square_tensor.set(self.mean_square, place) - - lr = self.scope.var(self.lr_name).get_tensor() - lr.set(self.learning_rate, place) - - self.moment_tensor = self.scope.var(self.moment_name).get_tensor() - self.moment_tensor.set(self.moment, place) - - if self.centered: - self.mean_grad_tensor = self.scope.var( - self.mean_grad_name).get_tensor() - self.mean_grad_tensor.set(self.mean_grad, place) - - def check(self, actual_t, expect_t, place, out_name, atol=1e-5): - self.assertTrue( - np.allclose( - actual_t, expect_t, atol=atol), - 'Output (' + out_name + ') has diff at ' + str(place) + '\nExpect ' - + str(expect_t) + '\n' + 'But Got' + str(actual_t)) - - -class TestRmspropOp(TestBase): - def check_with_place(self, - place, - is_sparse, - centered, - size, - 
row_num=None, - epsilon=1e-6): - self.setup(place, is_sparse, centered, size, row_num, epsilon) - self.run_and_check() - - def run_and_check(self): - #grad_name = self.grad_sr_name if self.is_sparse else self.grad_name - grad_name = self.grad_name - - kwargs = { - 'Param': self.param_name, - 'Grad': grad_name, - 'MeanSquare': self.mean_square_name, - 'Moment': self.moment_name, - 'LearningRate': self.lr_name, - 'ParamOut': self.param_name, - 'MeanSquareOut': self.mean_square_name, - 'MomentOut': self.moment_name, - 'epsilon': self.epsilon, - 'decay': self.decay, - 'momentum': self.momentum, - 'centered': self.centered - } - - if self.centered: - kwargs['MeanGrad'] = self.mean_grad_name - kwargs['MeanGradOut'] = self.mean_grad_name - - rmsprop_op = Operator('rmsprop', **kwargs) - atol = 1e-6 - - rmsprop_op.run(self.scope, self.place) - - self.check( - np.array(self.mean_square_tensor), - self.ms_out, - self.place, - self.mean_square_name, - atol=atol) - self.check( - np.array(self.moment_tensor), - self.moment_out, - self.place, - self.moment_name, - atol=atol) - self.check( - np.array(self.param_tensor), - self.param_out, - self.place, - self.param_name, - atol=atol) - - if self.centered: - self.check( - np.array(self.mean_grad_tensor), self.mg_out, self.place, - self.mean_grad_name) - - def test_rmsprop(self): - places = [paddle.XPUPlace(0)] - - size = (128, 320) - for place in places: - for centered in [False]: - with fluid.scope_guard(core.Scope()): - self.check_with_place( - place, is_sparse=False, centered=centered, size=size) - - with fluid.scope_guard(core.Scope()): - self.check_with_place( - place, - is_sparse=True, - centered=centered, - row_num=512, - size=size) - - with fluid.scope_guard(core.Scope()): - self.check_with_place( - place, - is_sparse=True, - centered=centered, - row_num=60, - size=size, ) - - -class TestRMSPropV2(XPUOpTest): - op_type = 'rmsprop' - - def test_rmsprop_dygraph(self): - paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype('float32') - a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5) - # This can be any optimizer supported by dygraph. 
- adam = paddle.optimizer.RMSProp( - learning_rate=0.01, - parameters=linear.parameters(), - weight_decay=0.01) - out = linear(a) - out.backward() - adam.step() - adam.clear_gradients() - - def test_rmsprop(self): - place = paddle.XPUPlace(0) - paddle.enable_static() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - print(avg_cost.shape) - linear = paddle.nn.Linear(13, 5) - rms_optimizer = paddle.optimizer.RMSProp( - learning_rate=0.1, parameters=linear.parameters()) - rms_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - def test_raise_error(self): - self.assertRaises(ValueError, paddle.optimizer.RMSProp, None) - self.assertRaises( - ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None) - self.assertRaises( - ValueError, - paddle.optimizer.RMSProp, - learning_rate=0.1, - epsilon=None) - self.assertRaises( - ValueError, - paddle.optimizer.RMSProp, - learning_rate=0.1, - momentum=None) - - def test_rmsprop_op_invalid_input(self): - paddle.disable_static() - linear = paddle.nn.Linear(10, 10) - with self.assertRaises(ValueError): - adam = paddle.optimizer.RMSProp( - 0.1, epsilon=-1, parameters=linear.parameters()) - with self.assertRaises(ValueError): - adam = paddle.optimizer.RMSProp( - 0.1, momentum=-1, parameters=linear.parameters()) - with self.assertRaises(ValueError): - adam = paddle.optimizer.RMSProp( - 0.1, rho=-1, parameters=linear.parameters()) -""" - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
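
The helper defined just below, calculate_rmsprop_by_numpy, reproduces the non-centered RMSProp update the XPU kernel is checked against. Written out as a scalar worked example (the input values are arbitrary, chosen only for illustration):

import numpy as np

param, grad = 1.0, 0.5
mean_square, moment = 0.2, 0.1
lr, eps, decay, momentum = 0.01, 1e-4, 0.9, 0.1

# same formulas as the helper below
mean_square_out = decay * mean_square + (1 - decay) * grad * grad  # 0.9*0.2 + 0.1*0.25 = 0.205
moment_out = momentum * moment + lr * grad / np.sqrt(mean_square_out + eps)
param_out = param - moment_out

print(mean_square_out, moment_out, param_out)
# 0.205, ~0.0210, ~0.9790
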
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +import paddle.fluid.core as core + +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def calculate_rmsprop_by_numpy(param, grad, mean_square, moment, learning_rate, + epsilon, decay, momentum): + mean_square_out = decay * mean_square + (1 - decay) * grad * grad + moment_out = momentum * moment + learning_rate * grad / np.sqrt( + mean_square_out + epsilon) + param_out = param - moment_out + return param_out, mean_square_out, moment_out + + +class XPUTestRMSPropOP(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'rmsprop' + self.use_dynamic_create_class = False + + class TestRMSPropOPBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.xpu_version = core.get_xpu_device_version(0) + self.init_dtype() + self.set_case() + + def set_case(self): + self.op_type = 'rmsprop' + self.dtype = self.in_type + self.init_config() + + self.param = np.random.uniform(-1, 1, + self.input_shape).astype(self.dtype) + self.grad = np.random.uniform(-1, 1, + self.input_shape).astype(self.dtype) + self.mean_square = np.random.uniform(0, 1, self.input_shape).astype( + self.dtype) + self.moment = np.random.uniform(-1, 1, + self.input_shape).astype(self.dtype) + + self.mean_grad = np.random.uniform(-1, 1, self.input_shape).astype( + self.dtype) + self.mean_grad_out = np.random.uniform( + -1, 1, self.input_shape).astype(self.dtype) + + param_out, mean_square_out, moment_out = calculate_rmsprop_by_numpy( + param=self.param, + grad=self.grad, + mean_square=self.mean_square, + moment=self.moment, + learning_rate=self.learning_rate, + epsilon=self.epsilon, + decay=self.decay, + momentum=self.momentum) + self.inputs = { + 'Param': self.param, + 'Grad': self.grad, + 'MeanSquare': self.mean_square, + 'Moment': self.moment, + 'LearningRate': self.learning_rate, + 'MeanGrad': self.mean_grad, + 'MeanGradOut': self.mean_grad_out, + } + self.attrs = { + 'use_xpu': True, + 'epsilon': self.epsilon, + 'decay': self.decay, + 'momentum': self.momentum, + 'centered': + False, # TODO(houj04): when XDNN api supports 'center = True', add more test cases + } + self.outputs = { + 'ParamOut': param_out, + 'MomentOut': moment_out, + 'MeanSquareOut': mean_square_out, + 'MeanGradOut': self.mean_grad_out + } + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, + no_check_set=['MeanGradOut']) + + def init_config(self): + self.input_shape = [864] + self.learning_rate = np.array([0.001]).astype(self.dtype) + self.epsilon = 1e-4 + self.decay = 0.9 + self.momentum = 0.1 + + class XPUTestRMSProp1(TestRMSPropOPBase): + + def init_config(self): + self.input_shape = [2, 768] + self.learning_rate = np.array([0.002]).astype(self.dtype) + self.epsilon = 1e-4 + self.decay = 0.9 + self.momentum = 0.1 + + class XPUTestRMSProp2(TestRMSPropOPBase): + + def init_config(self): + self.input_shape = [3, 8, 4096] + self.learning_rate = np.array([0.005]).astype(self.dtype) + self.epsilon = 1e-6 + self.decay = 0.95 + self.momentum = 0 + + class XPUTestRMSProp3(TestRMSPropOPBase): + + def init_config(self): + self.input_shape = [1024] + self.learning_rate = np.array([0.01]).astype(self.dtype) + self.epsilon = 1e-5 + self.decay = 0.99 + self.momentum = 0.02 + + class 
XPUTestRMSProp4(TestRMSPropOPBase): + + def init_config(self): + self.input_shape = [2, 2, 255] + self.learning_rate = np.array([0.0005]).astype(self.dtype) + self.epsilon = 1e-3 + self.decay = 0.8 + self.momentum = 0.002 + + +support_types = get_xpu_op_support_types('rmsprop') +for stype in support_types: + create_test_class(globals(), XPUTestRMSPropOP, stype) + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py index 4c830b1e8729a..deebd2e02ff8a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py @@ -20,208 +20,220 @@ import math import numpy as np import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci from op_test_xpu import XPUOpTest import paddle -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard - - -class TestROIAlignOp(XPUOpTest): - - def set_data(self): - self.init_test_case() - self.make_rois() - self.calc_roi_align() - - self.inputs = { - 'X': self.x, - 'ROIs': (self.rois[:, 1:5], self.rois_lod), - } - self.attrs = { - 'spatial_scale': self.spatial_scale, - 'pooled_height': self.pooled_height, - 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio, - 'aligned': self.continuous_coordinate - } - - self.outputs = {'Out': self.out_data} - - def init_test_case(self): - self.batch_size = 3 - self.channels = 3 - self.height = 8 - self.width = 6 - - self.xpu_version = core.get_xpu_device_version(0) - - # n, c, h, w - self.x_dim = (self.batch_size, self.channels, self.height, self.width) - - self.spatial_scale = 1.0 / 2.0 - self.pooled_height = 2 - self.pooled_width = 2 - self.sampling_ratio = -1 - if self.xpu_version == core.XPUVersion.XPU1: - self.continuous_coordinate = False - else: - self.continuous_coordinate = bool(np.random.randint(2)) - self.x = np.random.random(self.x_dim).astype('float32') - - def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w, - bin_size_h, bin_size_w): - count = roi_bin_grid_h * roi_bin_grid_w - bilinear_pos = np.zeros( - [self.channels, self.pooled_height, self.pooled_width, count, 4], - np.float32) - bilinear_w = np.zeros([self.pooled_height, self.pooled_width, count, 4], - np.float32) - for ph in range(self.pooled_width): - for pw in range(self.pooled_height): - c = 0 - for iy in range(roi_bin_grid_h): - y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \ - bin_size_h / roi_bin_grid_h - for ix in range(roi_bin_grid_w): - x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \ - bin_size_w / roi_bin_grid_w - if y < -1.0 or y > self.height or \ - x < -1.0 or x > self.width: - continue - if y <= 0: - y = 0 - if x <= 0: - x = 0 - y_low = int(y) - x_low = int(x) - if y_low >= self.height - 1: - y = y_high = y_low = self.height - 1 - else: - y_high = y_low + 1 - if x_low >= self.width - 1: - x = x_high = x_low = self.width - 1 - else: - x_high = x_low + 1 - ly = y - y_low - lx = x - x_low - hy = 1 - ly - hx = 1 - lx - for ch in range(self.channels): - bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low, - x_low] - bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low, - x_high] - bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high, - x_low] - bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high, - x_high] - bilinear_w[ph, pw, c, 0] = hy * hx - bilinear_w[ph, pw, c, 1] = hy * lx - bilinear_w[ph, pw, c, 2] = ly * hx - bilinear_w[ph, pw, c, 3] = ly * lx - c = c + 1 
- return bilinear_pos, bilinear_w - - def calc_roi_align(self): - self.out_data = np.zeros( - (self.rois_num, self.channels, self.pooled_height, - self.pooled_width)).astype('float32') - - for i in range(self.rois_num): - roi = self.rois[i] - roi_batch_id = int(roi[0]) - x_i = self.x[roi_batch_id] - roi_offset = 0.5 if self.continuous_coordinate else 0 - roi_xmin = roi[1] * self.spatial_scale - roi_offset - roi_ymin = roi[2] * self.spatial_scale - roi_offset - roi_xmax = roi[3] * self.spatial_scale - roi_offset - roi_ymax = roi[4] * self.spatial_scale - roi_offset - roi_width = roi_xmax - roi_xmin - roi_height = roi_ymax - roi_ymin - if not self.continuous_coordinate: - roi_width = max(roi_width, 1) - roi_height = max(roi_height, 1) - bin_size_h = float(roi_height) / float(self.pooled_height) - bin_size_w = float(roi_width) / float(self.pooled_width) - roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \ - math.ceil(roi_height / self.pooled_height) - roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \ - math.ceil(roi_width / self.pooled_width) - count = int(roi_bin_grid_h * roi_bin_grid_w) - pre_size = count * self.pooled_width * self.pooled_height - bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin, - int(roi_bin_grid_h), - int(roi_bin_grid_w), - bin_size_h, bin_size_w) - for ch in range(self.channels): - align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1) - output_val = align_per_bin.mean(axis=-1) - self.out_data[i, ch, :, :] = output_val - - def make_rois(self): - rois = [] - self.rois_lod = [[]] - for bno in range(self.batch_size): - self.rois_lod[0].append(bno + 1) - for i in range(bno + 1): - x1 = np.random.random_integers( - 0, self.width // self.spatial_scale - self.pooled_width) - y1 = np.random.random_integers( - 0, self.height // self.spatial_scale - self.pooled_height) - - x2 = np.random.random_integers(x1 + self.pooled_width, - self.width // self.spatial_scale) - y2 = np.random.random_integers( - y1 + self.pooled_height, self.height // self.spatial_scale) - - roi = [bno, x1, y1, x2, y2] - rois.append(roi) - self.rois_num = len(rois) - self.rois = np.array(rois).astype("float32") - - def setUp(self): - self.op_type = "roi_align" - self.set_data() - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, {'X'}, 'Out') - - -class TestROIAlignInLodOp(TestROIAlignOp): - - def set_data(self): - self.init_test_case() - self.make_rois() - self.calc_roi_align() - - seq_len = self.rois_lod[0] - - self.inputs = { - 'X': self.x, - 'ROIs': (self.rois[:, 1:5], self.rois_lod), - 'RoisNum': np.asarray(seq_len).astype('int32') - } - - self.attrs = { - 'spatial_scale': self.spatial_scale, - 'pooled_height': self.pooled_height, - 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio, - 'aligned': self.continuous_coordinate - } - - self.outputs = {'Out': self.out_data} - +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestROIAlignOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'roi_align' + self.use_dynamic_create_class = False + + class TestROIAlignOp(XPUOpTest): + + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_roi_align() + + 
self.inputs = { + 'X': self.x, + 'ROIs': (self.rois[:, 1:5], self.rois_lod), + } + self.attrs = { + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width, + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.continuous_coordinate + } + + self.outputs = {'Out': self.out_data} + + def init_test_case(self): + self.batch_size = 3 + self.channels = 3 + self.height = 8 + self.width = 6 + + self.xpu_version = core.get_xpu_device_version(0) + + # n, c, h, w + self.x_dim = (self.batch_size, self.channels, self.height, + self.width) + + self.spatial_scale = 1.0 / 2.0 + self.pooled_height = 2 + self.pooled_width = 2 + self.sampling_ratio = -1 + if self.xpu_version == core.XPUVersion.XPU1: + self.continuous_coordinate = False + else: + self.continuous_coordinate = bool(np.random.randint(2)) + self.x = np.random.random(self.x_dim).astype(self.dtype) + + def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, + roi_bin_grid_w, bin_size_h, bin_size_w): + count = roi_bin_grid_h * roi_bin_grid_w + bilinear_pos = np.zeros([ + self.channels, self.pooled_height, self.pooled_width, count, 4 + ], np.float32) + bilinear_w = np.zeros( + [self.pooled_height, self.pooled_width, count, 4], np.float32) + for ph in range(self.pooled_width): + for pw in range(self.pooled_height): + c = 0 + for iy in range(roi_bin_grid_h): + y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \ + bin_size_h / roi_bin_grid_h + for ix in range(roi_bin_grid_w): + x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \ + bin_size_w / roi_bin_grid_w + if y < -1.0 or y > self.height or \ + x < -1.0 or x > self.width: + continue + if y <= 0: + y = 0 + if x <= 0: + x = 0 + y_low = int(y) + x_low = int(x) + if y_low >= self.height - 1: + y = y_high = y_low = self.height - 1 + else: + y_high = y_low + 1 + if x_low >= self.width - 1: + x = x_high = x_low = self.width - 1 + else: + x_high = x_low + 1 + ly = y - y_low + lx = x - x_low + hy = 1 - ly + hx = 1 - lx + for ch in range(self.channels): + bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low, + x_low] + bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low, + x_high] + bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high, + x_low] + bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high, + x_high] + bilinear_w[ph, pw, c, 0] = hy * hx + bilinear_w[ph, pw, c, 1] = hy * lx + bilinear_w[ph, pw, c, 2] = ly * hx + bilinear_w[ph, pw, c, 3] = ly * lx + c = c + 1 + return bilinear_pos, bilinear_w + + def calc_roi_align(self): + self.out_data = np.zeros( + (self.rois_num, self.channels, self.pooled_height, + self.pooled_width)).astype(self.dtype) + + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = int(roi[0]) + x_i = self.x[roi_batch_id] + roi_offset = 0.5 if self.continuous_coordinate else 0 + roi_xmin = roi[1] * self.spatial_scale - roi_offset + roi_ymin = roi[2] * self.spatial_scale - roi_offset + roi_xmax = roi[3] * self.spatial_scale - roi_offset + roi_ymax = roi[4] * self.spatial_scale - roi_offset + roi_width = roi_xmax - roi_xmin + roi_height = roi_ymax - roi_ymin + if not self.continuous_coordinate: + roi_width = max(roi_width, 1) + roi_height = max(roi_height, 1) + bin_size_h = float(roi_height) / float(self.pooled_height) + bin_size_w = float(roi_width) / float(self.pooled_width) + roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \ + math.ceil(roi_height / self.pooled_height) + roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \ + math.ceil(roi_width / self.pooled_width) + count = int(roi_bin_grid_h * 
roi_bin_grid_w) + pre_size = count * self.pooled_width * self.pooled_height + bilinear_pos, bilinear_w = self.pre_calc( + x_i, roi_xmin, roi_ymin, int(roi_bin_grid_h), + int(roi_bin_grid_w), bin_size_h, bin_size_w) + for ch in range(self.channels): + align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1) + output_val = align_per_bin.mean(axis=-1) + self.out_data[i, ch, :, :] = output_val + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, + self.height // self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers( + x1 + self.pooled_width, + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, + self.height // self.spatial_scale) + + roi = [bno, x1, y1, x2, y2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype(self.dtype) + + def setUp(self): + self.set_xpu() + self.op_type = "roi_align" + self.place = paddle.XPUPlace(0) + self.dtype = self.in_type + self.set_data() + + def set_xpu(self): + self.__class__.use_xpu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, {'X'}, 'Out') + + class TestROIAlignInLodOp(TestROIAlignOp): + + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_roi_align() + + seq_len = self.rois_lod[0] + + self.inputs = { + 'X': self.x, + 'ROIs': (self.rois[:, 1:5], self.rois_lod), + 'RoisNum': np.asarray(seq_len).astype('int32') + } + + self.attrs = { + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width, + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.continuous_coordinate + } + + self.outputs = {'Out': self.out_data} + + +support_types = get_xpu_op_support_types('roi_align') +for stype in support_types: + create_test_class(globals(), XPUTestROIAlignOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py index e174d24533215..8953773d8cde4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py @@ -79,7 +79,7 @@ def runTest(self): out = fluid.layers.l2_normalize(x=emb, axis=-1) cost = fluid.layers.square_error_cost(input=out, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py index e9fc66ca4fcce..8ba7f6818882a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py @@ -69,7 +69,7 @@ def test_check_output(self): def test_check_grad(self): place = paddle.XPUPlace(0) - if self.dtype in [np.float32, np.float64]: + if self.dtype in [np.float32, np.float64, np.float16]: self.check_grad_with_place(place, ['X'], 'Out') elif self.dtype == np.bool_: return @@ -147,7 +147,7 @@ def test_check_output(self): def test_check_grad(self): place = paddle.XPUPlace(0) - if self.dtype in [np.float32, 
np.float64]: + if self.dtype in [np.float32, np.float64, np.float16]: self.check_grad_with_place(place, ['X'], 'Out') else: return @@ -217,7 +217,7 @@ def test_check_output(self): def test_check_grad(self): place = paddle.XPUPlace(0) - if self.dtype in [np.float32, np.float64]: + if self.dtype in [np.float32, np.float64, np.float16]: self.check_grad_with_place(place, ['X'], 'Out') else: return diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py index ad22ab86b932f..433ab36e1936b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py @@ -115,7 +115,7 @@ def test_api(self): y.stop_gradient = y_stop_gradient result = paddle.where(cond, x, y) - append_backward(fluid.layers.mean(result)) + append_backward(paddle.mean(result)) exe = fluid.Executor(self.place) exe.run(startup) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 31a430789d636..16b3646a4a81a 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -465,7 +465,7 @@ def _run(self, inputs, labels=None): idx] == core.VarDesc.VarType.FP16: if isinstance(feed[n], core.LoDTensor): feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16) - elif isinstance(feed[n], numpy.array): + elif isinstance(feed[n], np.array): feed[n] = feed[n].astype('float16') if labels is not None: diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index bab1d92a83659..543b0b815c16e 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -35,7 +35,10 @@ from . import nn #noqa: F401 from . import asp #noqa: F401 +from ..fluid.layers.loss import identity_loss + from ..fluid.incubate import fleet +from . import xpu __all__ = [ 'LookAhead', @@ -50,4 +53,5 @@ 'segment_mean', 'segment_max', 'segment_min', + 'identity_loss', ] diff --git a/python/paddle/incubate/autograd/__init__.py b/python/paddle/incubate/autograd/__init__.py index 718bc018d9fe5..c5ff3b18d4d49 100644 --- a/python/paddle/incubate/autograd/__init__.py +++ b/python/paddle/incubate/autograd/__init__.py @@ -11,11 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddle.autograd.functional import Hessian, Jacobian, jvp, vjp +from .functional import Hessian, Jacobian, jvp, vjp +from .primapi import forward_grad, grad from .primx import prim2orig -from .utils import enable_prim, disable_prim, prim_enabled +from .utils import disable_prim, enable_prim, prim_enabled __all__ = [ # noqa - 'vjp', 'jvp', 'Jacobian', 'Hessian', 'prim2orig', 'enable_prim', - 'disable_prim', 'prim_enabled' + 'vjp', 'jvp', 'Jacobian', 'Hessian', 'enable_prim', 'disable_prim', + 'forward_grad', 'grad' ] diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py new file mode 100644 index 0000000000000..3be95c88d12e7 --- /dev/null +++ b/python/paddle/incubate/autograd/functional.py @@ -0,0 +1,677 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import typing + +import paddle +from paddle.fluid import framework +from paddle.incubate.autograd import primapi, utils + + +def vjp(func, xs, v=None): + r"""Computes the Vector-Jacobian product, a functional form of + reverse mode automatic differentiation. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func(Callable): A function that takes ``xs`` as inputs parameter and + returns a sequence of Tensors or a Tensor. + xs(Tensor|Sequence[Tensor]): Used as positional arguments to evaluate + ``func``. ``xs`` is accepted as one Tensor or a sequence of Tensors. + v(Tensor|Sequence[Tensor]|None, optional): The cotangent vector invovled + in the VJP computation. ``v`` matches the size and shape of + ``func`` 's output. Defaults to None, which is equivalent to all + ones the same size of ``func`` 's output. + + Returns: + output(tuple): + + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . + - vjp(Tensor|tuple[Tensor]): The vjp result. + + Examples: + + .. code-block:: python + + import paddle + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + _, vjp_result = paddle.incubate.autograd.vjp(func, x) + print(vjp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[4., 4.], + # [4., 4.]]) + + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + _, vjp_result = paddle.incubate.autograd.vjp(func, x, v) + print(vjp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]]) + """ + _check_inputs(func, xs, v) + + # ``_seprate`` breaks the dependencies between ``xs`` and other + # variables. See more ``_seprate`` . + if paddle.fluid._non_static_mode() or not utils.prim_enabled(): + xs, v = _separate(xs), _separate(v) + ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) + _check_v_shape(v, ys) + + return ys, _grad(ys, xs, v) + + +def jvp(func, xs, v=None): + r""" + Computes the Jacobian-Vector product for a function at the given + inputs and a vector in the tangent space induced by the inputs. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func(Callable): The ``func`` takes as input a Tensor or a Sequence + of Tensors and returns a Tensor or a Sequence of Tensors. + xs(Tensor|Sequence[Tensor]): Used as positional arguments to + evaluate ``func``. The ``xs`` is accepted as one Tensor or a + Sequence of Tensors. + v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled + in the JVP computation. The ``v`` matches the size and shape of + ``xs`` . Default value is None and in this case is equivalent to + all ones the same size of ``xs`` . + + Returns: + output(tuple): + + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . + - jvp(Tensor|tuple[Tensor]): The jvp result. + + Examples: + + .. 
code-block:: python + + import paddle + + + def func(x): + return paddle.matmul(x, x) + + + x = paddle.ones(shape=[2, 2], dtype='float32') + _, jvp_result = paddle.incubate.autograd.jvp(func, x) + print(jvp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[4., 4.], + # [4., 4.]]) + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + _, jvp_result = paddle.incubate.autograd.jvp(func, x, v) + print(jvp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]]) + + """ + _check_inputs(func, xs, v) + # ``_seprate`` breaks the dependencies between ``xs`` and other + # variables. See more ``_seprate`` . + if paddle.fluid._non_static_mode() or not utils.prim_enabled(): + xs, v = _separate(xs), _separate(v) + ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) + _check_v_shape(v, xs) + + if not paddle.fluid._non_static_mode() and utils.prim_enabled(): + return ys, primapi.forward_grad(ys, xs, v) + else: + return ys, _double_backward_trick(ys, xs, v) + + +def _double_backward_trick(ys, xs, v): + """Double backward trick for computing ``jvp`` by ``vjp`` + see details: https://j-towns.github.io/2017/06/12/A-new-trick.html + """ + # The value of ys_grad is not important, it can be any random value in + # theory, but it's required to set stop_gradient=False. + ys_grad = _zeros_like_with_grad(ys) + xs_grad = _grad(ys, xs, ys_grad) + return _grad(xs_grad, ys_grad, v) + + +def _zeros_like_with_grad(xs): + """Create a zero or zeros sequence Tensor like ``xs`` with a flag + ``stop_graident=False`` . + """ + if not isinstance(xs, typing.Sequence): + ys = paddle.zeros_like(xs) + ys.stop_gradient = False + else: + ys = [] + for x in xs: + y = paddle.zeros_like(x) + y.stop_gradient = False + ys.append(y) + return ys + + +class Jacobian(object): + r""" + Computes the Jacobian matrix of a given function. + + If the function has multiple inputs and multiple outputs, during internal + implementation, all input tensors are concatenated after being flatten, + the batch dimension is retained, and the output is subject to the same + processing rules. + + Once the Jacobian ``J`` is constructed, you can use a multidimensional index + to retrieve the submatrix of ``J``, as same as slicing a Tensor. The + submatrix is lazily evaluated along row axis, and will be cached once + evaluated. + + For examples, supposing ``is_batched=True``, you can retrieve the submatrix + by following methods: + + * J[:], retrieving the full matrix. + * J[:, :, j], retrieving the partial derivatives w.r.t. the j'th input + variable. + * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output + variable. + * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output + variable and the j'th input variable. + + Notes: + + Eclipsis index is not supported currently. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + + func (Callable): A python function that takes a Tensor or a sequence of + Tensors as inputs(the first dimension is batch size) and + returns a Tensor a sequence of Tensors. + xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Jacobian (Object): A python object retains the Jacobian matrix. + + Examples: + + .. 
code-block:: python + + import paddle + + + def func(x, y): + return paddle.matmul(x, y) + + + x = paddle.to_tensor([[1., 2.], [3., 4.]]) + J = paddle.incubate.autograd.Jacobian(func, [x, x]) + print(J[:, :]) + # Tensor(shape=[4, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[1., 3., 0., 0., 1., 0., 2., 0.], + # [2., 4., 0., 0., 0., 1., 0., 2.], + # [0., 0., 1., 3., 3., 0., 4., 0.], + # [0., 0., 2., 4., 0., 3., 0., 4.]]) + + print(J[0, :]) + # Tensor(shape=[8], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [1., 3., 0., 0., 1., 0., 2., 0.]) + print(J[:, 0]) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [1., 2., 0., 0.]) + + """ + + def __init__(self, func, xs, is_batched=False): + if not is_batched: + self._jacobian = _JacobianNoBatch(func, xs) + else: + self._jacobian = _JacobianBatchFirst(func, xs) + + def __getitem__(self, indexes): + return self._jacobian[indexes] + + @property + def shape(self): + """The shape of flattened Jacobian matrix. + """ + return self._jacobian.shape + + +class Hessian(object): + """ + Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . + + If the function has multiple inputs, during internal implementation, + all input tensors are concatenated after being flatten, the batch dimension + is retained. + + The Hessian submatrix is lazily evaluated, and can be retrieved with a + multidimensional indexes. See details ``Jacobian`` . + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func (Callable): A python function that takes a Tensor or a Tensor + sequence as inputs and returns a Tensor with shape + ``[batch_size, 1]`` with batch or ``[1]`` without batch. + xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of + the function ``func``. + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Hessian (Object): A python object retains the Hessian matrix. + + + Examples: + + .. code-block:: python + + import paddle + + + def reducer(x): + return paddle.sum(x * x) + + + x = paddle.rand([2, 2]) + h = paddle.incubate.autograd.Hessian(reducer, x) + print(h[:]) + # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 0., 0., 0.], + # [0., 2., 0., 0.], + # [0., 0., 2., 0.], + # [0., 0., 0., 2.]]) + """ + + def __init__(self, func, xs, is_batched=False): + + def _jac_func(*xs): + jac = Jacobian(func, xs, is_batched=is_batched) + if (is_batched and jac.shape[1] != 1) or (not is_batched + and jac.shape[0] != 1): + raise RuntimeError( + "The function given to Hessian shoud return as single element Tensor or batched single element Tensor." + ) + return jac[:, 0, :] if is_batched else jac[0, :] + + self.symbolic = Jacobian(_jac_func, xs, is_batched=is_batched) + + def __getitem__(self, indexes): + return self.symbolic[indexes] + + @property + def shape(self): + """The shape of flattened Hessian matrix. + """ + return self.symbolic.shape + + +class _Jacobian(object): + """The base class for computing Jacobian matrix. + + ``_Jacobian`` implementes the core logic of multidimensional index and lazy + evaluation for Jacobian matrix, subclass only need to overwrite following + methods: + + * ``_lazy_axis()``, return the axis along which will be lazy + evaluating. + * ``_flatten(xs)``, flattens the inputs ``xs``. + * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . 
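The double backward trick relied on by ``_double_backward_trick`` earlier in this new ``functional.py`` can also be written out directly with ``paddle.grad``. The following sketch is illustrative only (it is not part of this patch) and reuses the hypothetical ``func`` and tangent ``v`` from the ``jvp`` docstring example; it computes a Jacobian-vector product with two reverse-mode passes.

.. code-block:: python

    import paddle

    def func(x):
        return paddle.matmul(x, x)

    x = paddle.ones([2, 2], dtype='float32')
    x.stop_gradient = False
    v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]])

    y = func(x)

    # First reverse pass: u is a dummy cotangent. Its value does not matter,
    # but it must require grad so the result stays differentiable w.r.t. u.
    u = paddle.zeros_like(y)
    u.stop_gradient = False
    vjp_u = paddle.grad(y, x, u, create_graph=True)[0]  # J^T @ u, linear in u

    # Second reverse pass: differentiating J^T @ u w.r.t. u, weighted by v,
    # yields J @ v, i.e. the forward-mode (jvp) result.
    jvp_v = paddle.grad(vjp_u, u, v)[0]
    print(jvp_v)
    # Expected to match paddle.incubate.autograd.jvp(func, x, v)[1]:
    # [[2., 1.],
    #  [1., 0.]]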
+ + Notes: + + Because currently PaddlePaddle only support reverse differentiation by + ``paddle.grad``, so lazy evaluation is only supported along the row of + Jacobian matrix, which means that slicing along row will get better + performance. + + """ + + def __init__(self, func, xs): + # Skip separating in prim mode temporarily, as detach and clone are not + # primitive operators. + if not paddle.fluid._non_static_mode() and utils.prim_enabled(): + self._xs = xs + else: + self._xs = _separate(xs) + self._ys = func(*utils.as_tensors(self._xs)) + self._flatten_xs = self._flatten(utils.as_tensors(self._xs)) + self._flatten_ys = self._flatten(utils.as_tensors(self._ys)) + self._cache = {} + + @property + def shape(self): + raise NotImplementedError + + @property + def _lazy_axis(self): + """"The axis of lazily evaluated.""" + raise NotImplementedError + + def _lazy_indexes(self, indexes): + idx = indexes[self._lazy_axis] + return (idx, ) if isinstance(idx, int) else tuple( + range(idx.start, idx.stop, idx.step)) + + def _flatten(self, xs): + raise NotImplementedError + + def _shifted_indexes(self, indexes, lazy_axis_size=0): + idx = indexes[self._lazy_axis] + shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice( + 0, lazy_axis_size, 1) + return indexes[:self._lazy_axis] + ( + shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:] + + def __getitem__(self, indexes): + indexes = _multi_index(indexes, self.shape) + + if isinstance(indexes[self._lazy_axis], int): + other_indexes = indexes[:self._lazy_axis] + \ + indexes[self._lazy_axis+1:] + return self._cached_evaluate( + indexes[self._lazy_axis])[other_indexes] + lazy_indexes = self._lazy_indexes(indexes) + # Using concat and reshape to replace stack operator temporarily, as + # it is not a primitive operator. + shape = list(self.shape) + shape[self._lazy_axis] = len(lazy_indexes) + part_jac = paddle.concat( + [self._cached_evaluate(i) for i in lazy_indexes], + axis=self._lazy_axis).reshape(shape) + return part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] + + def _cached_evaluate(self, k): + v = self._cache.get(k) + if v is None: + v = self._evaluate(k) + self._cache[k] = v + return v + + def _evaluate(self, index): + """Evaluate one slice at along lazy axis.""" + raise NotImplementedError + + +class _JacobianNoBatch(_Jacobian): + """Compute Jacobian matrix without batch dimension. + Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is + ``(N, M)`` . + """ + + def __init__(self, func, xs): + super(_JacobianNoBatch, self).__init__(func, xs) + + @property + def shape(self): + return (self._flatten_ys.shape[0], self._flatten_xs.shape[0]) + + @property + def _lazy_axis(self): + return 0 + + def _flatten(self, xs): + return paddle.concat(tuple(x.reshape((-1, )) for x in xs)) + + def _evaluate(self, row_index): + return self._flatten(_grad( + self._flatten_ys[row_index], + self._xs, + )) + + +class _JacobianBatchFirst(_Jacobian): + """Compute Jacobian matrix with batch at first axis. + Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is + ``(B, N, M)`` . 
+ """ + + def __init__(self, func, xs): + super(_JacobianBatchFirst, self).__init__(func, xs) + + @property + def shape(self): + return (self._flatten_xs.shape[0], self._flatten_ys.shape[1], + self._flatten_xs.shape[1]) + + @property + def _lazy_axis(self): + return 1 + + def _flatten(self, xs): + return paddle.concat( + tuple(x.reshape((x.shape[0], -1)) for x in utils.as_tensors(xs)), 1) + + def _evaluate(self, row_index): + return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs)) + + +def _multi_index(indexes, shape): + """A tool for parsing N-dimensional index into a standard format. + + Currently supporting following input format: + * ([positive|negative|slice], ...), the right-most elements can be + omited. + + The standard format after converted is slice tuple which contains N elements: + * ([positive|slice], ..., [positive|slice]) + + Notes: + Ellipsis indexes such as ``(..., i), (i, ...)`` is not supported. + + Args: + indexes (tuple): The input indexes. + shape (tuple): The input shape. + + Returns: + tuple: The standard format index as the above description. + """ + indexes = indexes if isinstance(indexes, typing.Sequence) else (indexes, ) + if any(isinstance(i, type(Ellipsis)) for i in indexes): + raise IndexError('Ellipsis index currently is not supported.') + # Fill the right-most elements. + indexes = indexes + (slice(0, None, None), ) * (len(shape) - len(indexes)) + # Convert to positive index. + positive_indexes = [] + for i, index in enumerate(indexes): + if isinstance(index, slice): + index = slice(index.start or 0, index.stop or shape[i], index.step + or 1) + positive_indexes.append( + slice( + index.start + shape[i] if index.start < 0 else index.start, + index.stop + shape[i] if index.stop < 0 else index.stop, + # Negative step means index backward, no need to convert to + # positive interger. + index.step)) + elif isinstance(index, int): + positive_indexes.append(index + shape[i] if index < 0 else index) + else: + raise TypeError(f'Not supported index type {index}.') + return tuple(positive_indexes) + + +def _replace_none_with_zero_tensor(xs, refs): + if xs is None: + xs = paddle.zeros_like(refs) + xs.stop_gradient = refs.stop_gradient + return xs + elif isinstance(xs, typing.Sequence): + return tuple( + _replace_none_with_zero_tensor(x, refs[i]) + for i, x in enumerate(xs)) + else: + return xs + + +def _grad(ys, xs, v=None): + """A gradient function that can be used in dynamic graph and static graph. + + The ``grad`` combines ``paddle.grad`` used in dynamic graph and + ``paddle.static.gradients`` used in static graph, and do following changes: + + * The ``allow_unused`` flag is removed and set defaults to true internally, + none in outputs will be replaced by zero tensor. + * The ``create_graph`` flag is removed and set defaults to true internally, + only makes sense in dynamic graph. + * When xs is a single Tensor, ``paddle.grad`` returns a list which only + contains one Tensor. It may confuse users, thus in this case we improve + to return a single Tensor in _grad interface. + + Args: + ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of + the graph to compute gradients. + xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of the graph to + compute gradients. The returned values of this API are the + gradients of inputs . + v (Tensor|Sequence[Tensor]|None,optional): The initial gradient values + of outputs . 
If grad_outputs is None, the initial gradient values of + outputs would be Tensors filled with 1; if grad_outputs is not None, + it must have the same length as outputs , and in this case, the + initial gradient value of the i-th outputs would be: (1) a Tensor + filled with 1 when the i-th element of grad_outputs is None; + (2) the i-th element of grad_outputs when the i-th element of + grad_outputs is a Tensor. Default None. + + Returns: + Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the + same as the Tensor number inside inputs, and the i-th returned + Tensor is the sum of gradients of outputs with respect to the i-th + inputs. + """ + if paddle.fluid._non_static_mode(): + # paddle.grad returns a list though the inputs is a signle Tensor. The + # follow code snippet fixes the problem by return the first element of + # xs_grad when the xs is a signle Tensor. + xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) + if isinstance(xs, paddle.fluid.framework.Variable) and isinstance( + xs_grad, typing.Sequence) and len(xs_grad) > 0: + xs_grad = xs_grad[0] + else: + xs_grad = paddle.incubate.autograd.grad(ys, xs, v) + return _replace_none_with_zero_tensor(xs_grad, xs) + + +def _separate(xs): + """ + ``_separate`` separates ``xs`` from the computation graph through ``clone`` + or ``deteach`` . + + Interally, ``paddle.grad(xs, ys)`` is stateful API implemented based on + computional graph, which will reduce gradients along all path from ys to xs. + + However, funcional autograd API such as ``vjp``, ``jvp`` is stateless, and + only compute gradients with a given ``func`` . + + For example, given a ``func`` :math:`y0=f(x0)`, supposing forward path is: + ``x0 -> y0``, ``x0 -> x1 -> y0`` . + ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and + ``y0->x1->x0``, and ``vjp`` only need reduce along ``y0->x0``. + + So, it's needed to clone or detach xs for breaking the dependencies with + other variables. + + Examples: + + .. code-block:: python + + import paddle + from paddle.autograd.functional import _separate + + + def func(x, y): + return x * y + + + x = paddle.ones((1,)) + x.stop_gradient = False + + y = func(x, x) + print(paddle.grad(y, x)) + # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [2.])] + + x1, x2 = _separate((x, x)) + y = func(x1, x2) + print(paddle.grad(y, x1)) + # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.])] + + """ + if isinstance(xs, typing.Sequence): + return tuple(_single_separate(x) for x in xs) + else: + return _single_separate(xs) + + +def _single_separate(x): + if x is None: # x maybe none because grad input's v defaults to none. + return x + if not x.stop_gradient: + return paddle.clone(x) + else: # use detach to share memory when no need gradients. 
+ x = x.detach() + x.stop_gradient = False + return x + return x + + +def _check_inputs(func, xs, v=None): + if not callable(func): + raise TypeError(f"Expected 'fun' is Callable, but got {type(func)}.") + + if not isinstance(xs, (framework.Variable, typing.Sequence)): + raise TypeError(f"Expected 'xs' is a Tensor|Sequence[Tensor]," + f"but got {type(xs)}.") + if isinstance(xs, typing.Sequence) and not all( + isinstance(x, framework.Variable) for x in xs): + raise TypeError("All elements of 'xs' shoule be Tensor.") + + if not isinstance(v, (framework.Variable, typing.Sequence, type(None))): + raise TypeError( + f"Expected 'v' is Tensor|Sequence[Tensor]|None, but got {type(v)}.") + + if isinstance(v, typing.Sequence) and not all( + isinstance(e, framework.Variable) for e in v): + raise TypeError("All elements of 'xs' shoule be Tensor.") + + +def _check_v_shape(v, refs): + if v is None: + return + + v, refs = utils.as_tensors(v), utils.as_tensors(refs) + if len(refs) != len(v): + raise RuntimeError(f"The argument v is a tuple of invalid length:" + f"should be {len(refs)} but got {len(v)}.") + + for index, (element_v, element_ref) in enumerate(zip(v, refs)): + if element_v.shape != element_ref.shape: + raise RuntimeError( + f"The v[{index}] has invalid shape: should " + f"be {element_ref.shape} but got {element_v.shape}.") diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 75a70b09731f2..ba7a2537df133 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -14,28 +14,26 @@ import typing -import paddle.autograd.utils as tensor_utils -import paddle.incubate.autograd.utils as prim_utils -from paddle.fluid import framework -from paddle.incubate.autograd import primx +from paddle.fluid import backward, framework +from paddle.incubate.autograd import primx, utils @framework.static_only -def forward_gradients(targets, inputs, input_gradients=None): +def forward_grad(outputs, inputs, grad_inputs=None): """Forward mode of automatic differentiation. .. note:: **ONLY available in the static mode and primitive operators.** Args: - targets: The target tensor or tensors - inputs: The input tensor or tensors - input_gradients: The gradient Tensor or Tensors of inputs which has - the same shape with inputs, Defaults to None, in this case is - equivalent to all ones . + outputs(Tensor|Sequence[Tensor]): The output tensor or tensors. + inputs(Tensor|Sequence[Tensor]): The input tensor or tensors. + grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or + Tensors of inputs which has the same shape with inputs, Defaults to + None, in this case is equivalent to all ones. Returns: - target_gradients (Tensor|Sequence[Tensor]): The gradients for targets. + grad_outputs(Tensor|Sequence[Tensor]): The gradients for outputs. 
Examples: @@ -53,7 +51,7 @@ def forward_gradients(targets, inputs, input_gradients=None): with paddle.static.program_guard(main_program, startup_program): x = paddle.static.data('x', shape=[1], dtype='float32') y = x * x - y_grad = paddle.incubate.autograd.forward_gradients(y, x) + y_grad = paddle.incubate.autograd.forward_grad(y, x) paddle.incubate.autograd.prim2orig() exe = paddle.static.Executor() @@ -65,20 +63,20 @@ def forward_gradients(targets, inputs, input_gradients=None): paddle.incubate.autograd.disable_prim() paddle.disable_static() """ - if not prim_utils.prim_enabled(): - raise RuntimeError('forward_gradients must be running on primitive' + if not utils.prim_enabled(): + raise RuntimeError('forward_grad must be running on primitive' 'operators, use enable_prim to turn it on.') - if not isinstance(targets, (framework.Variable, typing.Sequence)): - raise TypeError(f'Expected targets is Tensor|Sequence[Tesnor], ' - f'but got {type(targets)}.') + if not isinstance(outputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], ' + f'but got {type(outputs)}.') if not isinstance(inputs, (framework.Variable, typing.Sequence)): raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], ' f'but got {type(inputs)}.') - ys, xs, xs_dot = tensor_utils.as_tensors(targets), tensor_utils.as_tensors( - inputs), tensor_utils.as_tensors(input_gradients) + ys, xs, xs_dot = utils.as_tensors(outputs), utils.as_tensors( + inputs), utils.as_tensors(grad_inputs) block = framework.default_main_program().current_block() if any(x.block != block for x in xs + ys): @@ -90,4 +88,106 @@ def forward_gradients(targets, inputs, input_gradients=None): ad = primx.Transform(ys[0].block) _, ys_dot = ad.linearize(xs, ys, xs_dot) - return ys_dot[0] if isinstance(targets, framework.Variable) else ys_dot + return ys_dot[0] if isinstance(outputs, framework.Variable) else ys_dot + + +@framework.static_only +def grad(outputs, inputs, grad_outputs=None): + """Reverse mode of automatic differentiation. + + .. note:: + **ONLY available in the static mode and primitive operators** + + Args: + outputs(Tensor|Sequence[Tensor]): The output Tensor or Tensors. + inputs(Tensor|Sequence[Tensor]): The input Tensor or Tensors. + grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or + Tensors of outputs which has the same shape with outputs, Defaults + to None, in this case is equivalent to all ones. + + Returns: + grad_inputs(Tensor|Tensors): The gradients for inputs. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data('x', shape=[1], dtype='float32') + x.stop_gradients = False + y = x * x + x_grad = paddle.incubate.autograd.grad(y, x) + paddle.incubate.autograd.prim2orig() + + exe = paddle.static.Executor() + exe.run(startup_program) + x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad]) + print(x_grad) + # [array([4.], dtype=float32)] + + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + """ + if not utils.prim_enabled(): + grad_inputs = backward.gradients(outputs, inputs, grad_outputs) + # backward.gradients returns a list though the inputs is a signle Tensor. 
+ # The follow code snippet fixes the problem by return the first element + # of grad_inputs when the inputs is a signle Tensor. + if isinstance(inputs, framework.Variable) and isinstance( + grad_inputs, typing.Sequence) and len(grad_inputs) > 0: + return grad_inputs[0] + else: + return grad_inputs + + if not isinstance(outputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], ' + f'but got {type(outputs)}.') + + if not isinstance(inputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], ' + f'but got {type(inputs)}.') + + ys, xs, ys_bar = utils.as_tensors(outputs), utils.as_tensors( + inputs), utils.as_tensors(grad_outputs) + block = framework.default_main_program().current_block() + if any((x is not None and x.block != block) for x in xs + ys): + raise RuntimeError( + 'Variable in inputs and outputs should be None or in current block of main program' + ) + + # TODO(Tongxin) without any prior knowledge about whether the program + # is completely lowered to primitive ops, it's mandatory to run the lowering + # pass once and again. This is obviously inefficient and needs to be + # optimized. + primx.orig2prim(block) + ad = primx.Transform(block) + xs_dot, ys_dot = ad.linearize(xs, ys) + if any(var is None for var in ys_dot): + raise RuntimeError( + 'Grads cannot be computed. The given outputs does not depend on inputs' + ) + ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar) + + # remove xs_dot and their constructor ops + op_indexes = [] + for var in xs_dot: + if var is not None: + op_index = block.ops.index(var.op) + if op_index < 0: + raise ValueError( + f'op_index should be greater than or equal to 0, but op_index={op_index}.' + ) + op_indexes.append(op_index) + + ad.erase_ops(sorted(op_indexes)) + ad.erase_dots(xs_dot) + + return xs_bar[0] if isinstance(inputs, framework.Variable) else xs_bar diff --git a/python/paddle/incubate/autograd/primops.py b/python/paddle/incubate/autograd/primops.py index 6017ac3598920..b9a3ac459961a 100644 --- a/python/paddle/incubate/autograd/primops.py +++ b/python/paddle/incubate/autograd/primops.py @@ -14,6 +14,7 @@ import paddle from paddle.fluid.layer_helper import LayerHelper + from .primreg import REGISTER_FN diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index d5037dcf64994..19f87dd929215 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -22,7 +22,7 @@ from .primrules import _orig2prim, _prim2orig, _jvp, _transpose from .utils import get_input_var_list, get_output_var_list, flatten, flatten_and_remove_none from collections import OrderedDict -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors def topo_path(xs, ys, block=None): @@ -408,7 +408,7 @@ def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): # TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. -def _lower(block, reverse): +def _lower(block, reverse, blacklist): # Some functions which are only used in _lower. 
def bind(args, to_bind, value_table): for i in range(len(args)): @@ -452,7 +452,7 @@ def expand_nested_list(xs): for op_idx in range(len(block.ops)): op = block.ops[op_idx] ops_to_remove.append(op_idx) - if lookup_fn(op.type) is not None: + if lookup_fn(op.type) is not None and op.type not in blacklist: input_args = get_input_var_list(op) bind(input_args, to_bind, value_table) @@ -535,11 +535,11 @@ def orig2prim(block=None): block = default_main_program().current_block() if block is None else block assert block == default_main_program().current_block( ), f'block is neither None nor current block of main program' - _lower(block, reverse=False) + _lower(block, reverse=False, blacklist=[]) @framework.static_only -def prim2orig(block=None): +def prim2orig(block=None, blacklist=None): """ .. note:: **ONLY available in the static mode.** @@ -554,7 +554,11 @@ def prim2orig(block=None): block(paddle.static.Block|None, optional): The target block to process on. Default None, and will process on the current block of main program. - + blacklist(list[string]|None, optional): The names of automatic + differential basic operator that will not be transformed + into original operators. Default None, and the blacklist + is treated as empty list. + Examples: .. code-block:: python @@ -576,48 +580,5 @@ def prim2orig(block=None): block = default_main_program().current_block() if block is None else block assert block == default_main_program().current_block( ), f'block is neither None nor current block of main program' - _lower(block, reverse=True) - - -def _gradients(ys, xs, ys_bar=None): - """ A drop-in replacement of paddle.gradients but instead computing - on primitive ops. - - Args: - ys: the target tensor or tensors - xs: the input tensor or tensors - ys_bar: the optional gradient tensors of `ys` - - Returns: - xs_bar: a list gradients of input `xs` - """ - - ys, xs, ys_bar = as_tensors(ys), as_tensors(xs), as_tensors(ys_bar) - block = default_main_program().current_block() - for el in xs + ys: - assert el is None or el.block == block, f'variable in xs and ys should be None or in current block of main program' - # TODO(Tongxin) without any prior knowledge about whether the program - # is completely lowered to primitive ops, it's mandatory to run the lowering - # pass once and again. This is obviously inefficient and needs to be - # optimized. - orig2prim(block) - - ad = Transform(block) - - xs_dot, ys_dot = ad.linearize(xs, ys) - if any(var is None for var in ys_dot): - assert False, f'Gradients cannot be computed. The given output `ys` does not depend on input `xs`.' - ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar) - # remove xs_dot and their constructor ops - - op_indexes = [] - for var in xs_dot: - if var is not None: - op_index = block.ops.index(var.op) - assert op_index >= 0, f'op_index should be greater than or equal to 0, but op_index={op_index}.' - op_indexes.append(op_index) - - ad.erase_ops(sorted(op_indexes)) - ad.erase_dots(xs_dot) - - return xs_bar + blacklist = [] if blacklist is None else blacklist + _lower(block, reverse=True, blacklist=blacklist) diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 9d6a8c4f6a36d..96faf7f7440ca 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
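For reference, a minimal usage sketch of the new ``blacklist`` argument of ``prim2orig`` introduced above (illustrative only, not part of this patch); ``'add_p'`` is used purely as a placeholder primitive-operator type name.

.. code-block:: python

    import paddle

    paddle.enable_static()
    paddle.incubate.autograd.enable_prim()

    startup_program = paddle.static.Program()
    main_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        x = paddle.static.data('x', shape=[1], dtype='float32')
        x.stop_gradient = False
        y = x * x
        x_grad = paddle.incubate.autograd.grad(y, x)
        # Primitive ops whose type name is in the blacklist (here the
        # placeholder 'add_p') are left untouched; all other primitives in
        # the current block are lowered back to original operators.
        paddle.incubate.autograd.prim2orig(blacklist=['add_p'])

    paddle.incubate.autograd.disable_prim()
    paddle.disable_static()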
+import typing import paddle from paddle.fluid import framework as framework @@ -170,3 +171,12 @@ def flatten(inp): def flatten_and_remove_none(inp): flattened = flatten(inp) return [var for var in flattened if var is not None] + + +def as_tensors(xs): + if isinstance(xs, framework.Variable): + return (xs, ) + elif isinstance(xs, typing.Sequence): + return tuple(xs) + else: + return xs diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 3e4d015da1b11..506a282171bbd 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -680,6 +680,7 @@ def fused_multi_transformer(x, activation="gelu", training=False, mode='upscale_in_train', + trans_qkvw=True, ring_id=-1, name=None): r""" @@ -756,6 +757,9 @@ def fused_multi_transformer(x, - train: out = input * mask - inference: out = input * (1.0 - p) + trans_qkvw (bool, optional): Whether to transpose for weights of qkv. + If true, the shape eights of qkv should be [3, num_head, dim_head, dim_embed]. + Otherwise the shape of weights of qkv should be [dim_embed, 3, num_head, dim_head]. Default True. ring_id (int, optional): For distributed forward in tensor model parallel, only support NCCL. Default is -1, means not using mp. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -826,8 +830,8 @@ def fused_multi_transformer(x, ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon, 'dropout_rate', dropout_rate, 'is_test', not training, - 'dropout_implementation', mode, 'act_method', activation, 'ring_id', - ring_id) + 'dropout_implementation', mode, 'act_method', activation, + 'trans_qkvw', trans_qkvw, 'ring_id', ring_id) if cache_kvs is not None: return final_out, cache_kv_out return final_out @@ -875,6 +879,7 @@ def fused_multi_transformer(x, 'is_test': not training, 'dropout_implementation': mode, 'act_method': activation, + 'trans_qkvw': trans_qkvw, 'ring_id': ring_id } diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index 4a8f7815ae9d8..ba14ac5b86529 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -1048,6 +1048,9 @@ class FusedMultiTransformer(Layer): is a list or tuple, the number of layers is obtained from `qkv_weight_attrs`. num_layers only takes effect when `qkv_weight_attrs` is not a list or tuple. Default: -1. nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using mp. + trans_qkvw (bool, optional): Whether to transpose for weights of qkv. + If true, the shape eights of qkv should be [3, num_head, dim_head, dim_embed]. + Otherwise the shape of weights of qkv should be [dim_embed, 3, num_head, dim_head]. Default: True. ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using mp. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
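As a quick illustration of the new ``trans_qkvw`` flag documented above (a sketch only, with hypothetical sizes), the qkv weight layout mirrors the ``create_parameter`` shape change in the following hunk.

.. code-block:: python

    # Sketch only: the qkv weight layout selected by trans_qkvw, using
    # hypothetical sizes (num_heads=8, head_dim=16, so embed_dim=128).
    num_heads, head_dim = 8, 16
    embed_dim = num_heads * head_dim

    trans_qkvw = True
    qkv_weight_shape = ([3, num_heads, head_dim, embed_dim] if trans_qkvw
                        else [embed_dim, 3, num_heads, head_dim])
    print(qkv_weight_shape)
    # [3, 8, 16, 128] when trans_qkvw=True (the default);
    # [128, 3, 8, 16] when trans_qkvw=False.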
@@ -1090,6 +1093,7 @@ def __init__(self, epsilon=1e-5, num_layers=-1, nranks=1, + trans_qkvw=True, ring_id=-1, name=None): super(FusedMultiTransformer, self).__init__() @@ -1105,6 +1109,7 @@ def __init__(self, self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() self._epsilon = epsilon + self._trans_qkvw = trans_qkvw self._ring_id = ring_id self.embed_dim = embed_dim @@ -1161,7 +1166,8 @@ def get_attr(attrs, idx): shape=[embed_dim], is_bias=True) qkv_weight = self.create_parameter( - shape=[3, num_heads, self.head_dim, embed_dim], + shape=[3, num_heads, self.head_dim, embed_dim] + if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim], attr=qkv_weight_attr, dtype=self._dtype, is_bias=False) @@ -1292,6 +1298,7 @@ def forward(self, src, attn_mask=None, caches=None, time_step=None): activation=self.activation, training=self.training, mode='upscale_in_train', + trans_qkvw=self._trans_qkvw, ring_id=self._ring_id, name=self.name) return out diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index 6333ddafe1096..70abe41f62462 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -170,7 +170,7 @@ def __init__(self, self._is_test = is_test # check format - valid_format = {'NHWC'} + valid_format = {'NHWC', 'NCHW'} if data_format not in valid_format: raise ValueError( "conv_format must be one of {}, but got conv_format='{}'". @@ -181,11 +181,25 @@ def _get_default_param_initializer(channels): std = (2.0 / filter_elem_num)**0.5 return I.Normal(0.0, std) + is_nchw = (data_format == 'NCHW') # initial filter bn_param_dtype = fluid.core.VarDesc.VarType.FP32 - bn_param_shape = [1, 1, 1, num_filters] - filter_x_shape = [num_filters, filter_size, filter_size, num_channels_x] - filter_z_shape = [num_filters, filter_size, filter_size, num_channels_z] + if not is_nchw: + bn_param_shape = [1, 1, 1, num_filters] + filter_x_shape = [ + num_filters, filter_size, filter_size, num_channels_x + ] + filter_z_shape = [ + num_filters, filter_size, filter_size, num_channels_z + ] + else: + bn_param_shape = [1, num_filters, 1, 1] + filter_x_shape = [ + num_filters, num_channels_x, filter_size, filter_size + ] + filter_z_shape = [ + num_filters, num_channels_z, filter_size, filter_size + ] self.filter_x = self.create_parameter( shape=filter_x_shape, diff --git a/python/paddle/incubate/sparse/__init__.py b/python/paddle/incubate/sparse/__init__.py index f696434118745..47c7a312e24d8 100644 --- a/python/paddle/incubate/sparse/__init__.py +++ b/python/paddle/incubate/sparse/__init__.py @@ -15,27 +15,51 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor -from .unary import sqrt from .unary import sin +from .unary import tan +from .unary import asin +from .unary import atan +from .unary import sinh from .unary import tanh +from .unary import asinh +from .unary import atanh +from .unary import sqrt +from .unary import square +from .unary import log1p +from .unary import abs +from .unary import pow +from .unary import cast +from .unary import neg +from .unary import coalesce from .binary import mv from .binary import matmul from .binary import masked_matmul - -from .math import add -from .math import divide -from .math import multiply -from .math import subtract +from .binary import add +from .binary import divide +from .binary import multiply +from .binary import subtract from . 
import nn __all__ = [ 'sparse_coo_tensor', 'sparse_csr_tensor', - 'sqrt', 'sin', + 'tan', + 'asin', + 'atan', + 'sinh', 'tanh', + 'asinh', + 'atanh', + 'sqrt', + 'square', + 'log1p', + 'abs', + 'pow', + 'cast', + 'neg', 'mv', 'matmul', 'masked_matmul', @@ -43,4 +67,5 @@ 'subtract', 'multiply', 'divide', + 'coalesce', ] diff --git a/python/paddle/incubate/sparse/binary.py b/python/paddle/incubate/sparse/binary.py index f34378924e1f4..7a7861f7b20e7 100644 --- a/python/paddle/incubate/sparse/binary.py +++ b/python/paddle/incubate/sparse/binary.py @@ -13,10 +13,19 @@ # limitations under the License. from paddle import _C_ops -from paddle.fluid.framework import dygraph_only +from paddle.fluid.framework import dygraph_only, core __all__ = [] +_int_dtype_ = [ + core.VarDesc.VarType.UINT8, + core.VarDesc.VarType.INT8, + core.VarDesc.VarType.INT16, + core.VarDesc.VarType.INT32, + core.VarDesc.VarType.INT64, + core.VarDesc.VarType.BOOL, +] + @dygraph_only def matmul(x, y, name=None): @@ -53,29 +62,37 @@ def matmul(x, y, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - paddle.seed(100) # csr @ dense -> dense - - with _test_eager_guard(): - crows = [0, 2, 3, 5] - cols = [1, 3, 2, 0, 1] - values = [1., 2., 3., 4., 5.] - dense_shape = [3, 4] - csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], - # values=[1., 2., 3., 4., 5.]) - dense = paddle.randn([4, 3]) - - out = paddle.incubate.sparse.matmul(csr, dense) - # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[-1.94294846 , -3.33990622 , 0.62359387 ], - # [-4.12815523 , 3.46535444 , -3.27413893 ], - # [-0.15209436 , -19.23207283, -3.35593438 ]]) - + crows = [0, 1, 2, 3] + cols = [1, 2, 0] + values = [1., 2., 3.] + csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, [3, 3]) + # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 1, 2, 3], + # cols=[1, 2, 0], + # values=[1., 2., 3.]) + dense = paddle.ones([3, 2]) + out = paddle.incubate.sparse.matmul(csr, dense) + # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[1., 1.], + # [2., 2.], + # [3., 3.]]) + + # coo @ dense -> dense + indices = [[0, 1, 2], [1, 2, 0]] + values = [1., 2., 3.] + coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values, [3, 3]) + # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # indices=[[0, 1, 2], + # [1, 2, 0]], + # values=[1., 2., 3.]) + dense = paddle.ones([3, 2]) + out = paddle.incubate.sparse.matmul(coo, dense) + # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[1., 1.], + # [2., 2.], + # [3., 3.]]) """ return _C_ops.final_state_sparse_matmul(x, y) @@ -114,30 +131,27 @@ def masked_matmul(x, y, mask, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard paddle.seed(100) # dense @ dense * csr_mask -> csr - - with _test_eager_guard(): - crows = [0, 2, 3, 5] - cols = [1, 3, 2, 0, 1] - values = [1., 2., 3., 4., 5.] 
- dense_shape = [3, 4] - mask = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], - # values=[1., 2., 3., 4., 5.]) - - x = paddle.rand([3, 5]) - y = paddle.rand([5, 4]) - - out = paddle.incubate.sparse.masked_matmul(x, y, mask) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], - # values=[0.98986477, 0.97800624, 1.14591956, 0.68561077, 0.94714981]) + crows = [0, 2, 3, 5] + cols = [1, 3, 2, 0, 1] + values = [1., 2., 3., 4., 5.] + dense_shape = [3, 4] + mask = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], + # values=[1., 2., 3., 4., 5.]) + + x = paddle.rand([3, 5]) + y = paddle.rand([5, 4]) + + out = paddle.incubate.sparse.masked_matmul(x, y, mask) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], + # values=[0.98986477, 0.97800624, 1.14591956, 0.68561077, 0.94714981]) """ return _C_ops.final_state_sparse_masked_matmul(x, y, mask) @@ -197,3 +211,191 @@ def mv(x, vec, name=None): """ return _C_ops.final_state_sparse_mv(x, vec) + + +def add(x, y, name=None): + """ + Add two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse + type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. + The equation is: + + .. math:: + out = x + y + + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: the result tensor. + + Examples: + + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + paddle.device.set_device("cpu") + + with _test_eager_guard(): + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.incubate.sparse.add(sparse_x, sparse_y) + print(sparse_z.to_dense()) + + # [[ 0., -1., 0., 0.], + # [ 0., 2., -6., 0.], + # [ 6., 8., 4., 8.]] + + """ + if y.dtype != x.dtype: + y = _C_ops.final_state_sparse_cast(y, None, x.dtype) + return _C_ops.final_state_sparse_add(x, y) + + +@dygraph_only +def subtract(x, y, name=None): + """ + Subtract two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse + type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. + The equation is: + + .. math:: + out = x - y + + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: the result tensor. + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + paddle.device.set_device("cpu") + + with _test_eager_guard(): + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.incubate.sparse.subtract(sparse_x, sparse_y) + print(sparse_z.to_dense()) + + # [[ 0., -1., 0., 4.], + # [ 0., -2., 0., 0.], + # [ 2., 2., -4., -8.]] + + """ + if y.dtype != x.dtype: + y = _C_ops.final_state_sparse_cast(y, None, x.dtype) + return _C_ops.final_state_sparse_subtract(x, y) + + +@dygraph_only +def multiply(x, y, name=None): + """ + Multiply two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse + type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. + The equation is: + + .. math:: + out = x * y + + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: the result tensor. + + Examples: + + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + paddle.device.set_device("cpu") + + with _test_eager_guard(): + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.incubate.sparse.multiply(sparse_x, sparse_y) + print(sparse_z.to_dense()) + + # [[ 0., 0., 0., -4.], + # [ 0., 0., 9., 0.], + # [ 8., 15., 0., 0.]] + + """ + if isinstance(y, (int, float)): + return _C_ops.final_state_sparse_scale(x, float(y), 0.0, True) + else: + if y.dtype != x.dtype: + y = _C_ops.final_state_sparse_cast(y, None, x.dtype) + return _C_ops.final_state_sparse_multiply(x, y) + + +@dygraph_only +def divide(x, y, name=None): + """ + Divide two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse + type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. + The equation is: + + .. math:: + out = x / y + + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: the result tensor. + + Examples: + + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + paddle.device.set_device("cpu") + + with _test_eager_guard(): + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.incubate.sparse.divide(sparse_x, sparse_y) + print(sparse_z.to_dense()) + + # [[ nan , -inf. , nan , -1. ], + # [ nan , 0. , 1. , nan ], + # [ 2. , 1.66666663, 0. , 0. 
]] + + """ + if x.dtype in _int_dtype_: + x = _C_ops.final_state_sparse_cast(x, None, core.VarDesc.VarType.FP32) + + if isinstance(y, (int, float)): + return _C_ops.final_state_sparse_divide_scalar(x, float(y)) + else: + if y.dtype != x.dtype: + y = _C_ops.final_state_sparse_cast(y, None, x.dtype) + return _C_ops.final_state_sparse_divide(x, y) diff --git a/python/paddle/incubate/sparse/math.py b/python/paddle/incubate/sparse/math.py deleted file mode 100644 index c6a984c3ad5be..0000000000000 --- a/python/paddle/incubate/sparse/math.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -sparse math functions -""" -from __future__ import print_function - -from paddle import _C_ops, in_dynamic_mode, device, int32, int64 -from paddle.tensor import cast -from paddle.incubate.sparse import sparse_csr_tensor - - -def _cast_coo(x, dtype, name=None): - indices = x.indices() - values = cast(x.values(), dtype) - return _C_ops.final_state_sparse_create_sparse_coo_tensor( - values, indices, x.shape) - - -def _cast_csr(x, dtype, name=None): - crows = x.crows() - cols = x.cols() - values = cast(x.values(), dtype) - return sparse_csr_tensor(crows, cols, values, x.shape) - - -def _cast(x, dtype, name=None): - if x.is_sparse_coo(): - return _cast_coo(x, dtype, name) - return _cast_csr(x, dtype, name) - - -def add(x, y, name=None): - """ - Add two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse - type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. - The equation is: - - .. math:: - out = x + y - - Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: the result tensor. - - Examples: - - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - paddle.device.set_device("cpu") - - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.add(sparse_x, sparse_y) - print(sparse_z.to_dense()) - - # [[ 0., -1., 0., 0.], - # [ 0., 2., -6., 0.], - # [ 6., 8., 4., 8.]] - - """ - assert device.get_device( - ) == "cpu", "Currently, Sparse add only support CPU device." 
- assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_csr() == y.is_sparse_csr( - ), f"Expect sparse tensor type to be same" - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_add(x, y) - else: - raise ValueError( - "Currently, sparse.add only support the input of SparseCooTensor or SparseCsrTensor" - ) - - -def subtract(x, y, name=None): - """ - Subtract two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse - type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. - The equation is: - - .. math:: - out = x - y - - Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: the result tensor. - - Examples: - - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - paddle.device.set_device("cpu") - - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.subtract(sparse_x, sparse_y) - print(sparse_z.to_dense()) - - # [[ 0., -1., 0., 4.], - # [ 0., -2., 0., 0.], - # [ 2., 2., -4., -8.]] - - """ - assert device.get_device( - ) == "cpu", "Currently, Sparse subtract only support CPU device." - assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_csr() == y.is_sparse_csr( - ), f"Expect sparse tensor type to be same" - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_subtract(x, y) - else: - raise ValueError( - "Currently, sparse.subtract only support the input of SparseCooTensor or SparseCsrTensor" - ) - - -def multiply(x, y, name=None): - """ - Multiply two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse - type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. - The equation is: - - .. math:: - out = x * y - - Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: the result tensor. - - Examples: - - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - paddle.device.set_device("cpu") - - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.multiply(sparse_x, sparse_y) - print(sparse_z.to_dense()) - - # [[ 0., 0., 0., -4.], - # [ 0., 0., 9., 0.], - # [ 8., 15., 0., 0.]] - - """ - assert device.get_device( - ) == "cpu", "Currently, Sparse multiply only support CPU device." 
- assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_csr() == y.is_sparse_csr( - ), f"Expect sparse tensor type to be same" - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_multiply(x, y) - else: - raise ValueError( - "Currently, sparse.multiply only support the input of SparseCooTensor or SparseCsrTensor" - ) - - -def divide(x, y, name=None): - """ - Divide two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse - type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. - The equation is: - - .. math:: - out = x / y - - Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: the result tensor. - - Examples: - - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - paddle.device.set_device("cpu") - - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.divide(sparse_x, sparse_y) - print(sparse_z.to_dense()) - - # [[ nan , -inf. , nan , -1. ], - # [ nan , 0. , 1. , nan ], - # [ 2. , 1.66666663, 0. , 0. ]] - - """ - assert device.get_device( - ) == "cpu", "Currently, Sparse divide only support CPU device." - assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_csr() == y.is_sparse_csr( - ), f"Expect sparse tensor type to be same" - - if x.dtype in [int32, int64]: - if x.is_sparse_coo() or x.is_sparse_csr(): - cx = _cast(x, 'float32') - cy = _cast(y, 'float32') - return _C_ops.final_state_sparse_divide(cx, cy) - else: - raise ValueError( - "Currently, sparse.divide only support the input of SparseCooTensor or SparseCsrTensor" - ) - else: - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_divide(x, y) - else: - raise ValueError( - "Currently, sparse.divide only support the input of SparseCooTensor or SparseCsrTensor" - ) diff --git a/python/paddle/incubate/sparse/nn/functional/__init__.py b/python/paddle/incubate/sparse/nn/functional/__init__.py index af5636a622a9a..21939eeb1a4f9 100644 --- a/python/paddle/incubate/sparse/nn/functional/__init__.py +++ b/python/paddle/incubate/sparse/nn/functional/__init__.py @@ -14,6 +14,7 @@ from .conv import conv3d # noqa: F401 from .conv import subm_conv3d # noqa: F401 +from .transformer import attention # noqa: F401 from .pooling import max_pool3d # noqa: F401 from .activation import relu # noqa: F401 from .activation import softmax # noqa: F401 @@ -24,4 +25,5 @@ 'max_pool3d', 'relu', 'softmax', + 'attention', ] diff --git a/python/paddle/incubate/sparse/nn/functional/activation.py b/python/paddle/incubate/sparse/nn/functional/activation.py index 12d44063e0015..dc2969424086e 100644 --- a/python/paddle/incubate/sparse/nn/functional/activation.py +++ b/python/paddle/incubate/sparse/nn/functional/activation.py @@ -14,7 +14,7 @@ __all__ = [] -from paddle import _C_ops, in_dynamic_mode +from paddle import _C_ops from paddle.fluid.framework import dygraph_only diff 
--git a/python/paddle/incubate/sparse/nn/functional/transformer.py b/python/paddle/incubate/sparse/nn/functional/transformer.py new file mode 100644 index 0000000000000..f69714700bf5d --- /dev/null +++ b/python/paddle/incubate/sparse/nn/functional/transformer.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [] + +from paddle import _C_ops +from paddle.fluid.framework import dygraph_only + + +@dygraph_only +def attention(query, + key, + value, + sparse_mask, + key_padding_mask=None, + attn_mask=None, + name=None): + """ + Note: + This API is only used from ``CUDA 11.7`` . + + SparseCsrTensor is used to store the intermediate result of Attention matrix + in Transformer module, which can reduce memory usage and improve performance. + ``sparse_mask`` express the sparse layout in CSR format. + The calculation equation is: + + .. math:: + + result = softmax(\frac{ Q * K^T }{\sqrt{d}}) * V + + where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. + The shape of the three parameters are: `[batch_size, num_heads, seq_len, head_dim]`, and + ``d`` represents ``head_dim`` . + + Args: + query(DenseTensor): `query` in the Attention module. 4D Tensor with float32 or float64. + key(DenseTensor): `key` in the Attention module. 4D Tensor with float32 or float64. + value(DenseTensor): `value` in the Attention module. 4D Tensor with float32 or float64. + sparse_mask(SparseCsrTensor): The sparse layout in the Attention module. Its dense shape + is `[batch_size*num_heads, seq_len, seq_len]` . `nnz` of each batch must be the same. + dtype of `crows` and `cols` must be int64, dtype of `values` can be float32 or float64. + key_padding_mask(DenseTensor, optional): The key padding mask tensor in the Attention module. + 2D tensor with shape: [batch_size, seq_len]. dtype can be float32 or float64. Default: None. + attn_mask(DenseTensor, optional): The attention mask tensor in the Attention module. + 2D tensor with shape: [seq_len, seq_len]. dtype can be float32 or float64. Default: None. + name(str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to + :ref:`api_guide_Name`. + + Returns: + 4D tensor with shape: [batch_size, num_heads, seq_len, head_dim]. dtype is same with input. + + Examples: + .. 
code-block:: python + + import paddle + + batch_size = 16 + num_heads = 16 + seq_len = 512 + head_dim = 32 + + query = paddle.rand([batch_size, num_heads, seq_len, head_dim]) + key = paddle.rand([batch_size, num_heads, seq_len, head_dim]) + value = paddle.rand([batch_size, num_heads, seq_len, head_dim]) + + query.stop_gradient = False + key.stop_gradient = False + value.stop_gradient = False + + mask = paddle.nn.functional.dropout(paddle.ones([seq_len, seq_len])).expand([batch_size, num_heads, seq_len, seq_len]) + sp_mask = mask.reshape([-1, seq_len, seq_len]).to_sparse_csr() + + kp_mask = paddle.randint(0, 2, [batch_size, seq_len]) + attn_mask = paddle.randint(0, 2, [seq_len, seq_len]) + + output = paddle.incubate.sparse.nn.functional.attention(query, key, value, sp_mask, kp_mask, attn_mask) + output.backward() + """ + return _C_ops.final_state_sparse_fused_attention(query, key, value, + sparse_mask, + key_padding_mask, + attn_mask) diff --git a/python/paddle/incubate/sparse/unary.py b/python/paddle/incubate/sparse/unary.py index 09e449b0d9c5e..1725c8791fd30 100644 --- a/python/paddle/incubate/sparse/unary.py +++ b/python/paddle/incubate/sparse/unary.py @@ -13,19 +13,49 @@ # limitations under the License. from paddle import _C_ops -from paddle.fluid.framework import dygraph_only +from paddle.fluid.framework import dygraph_only, core, convert_np_dtype_to_dtype_ __all__ = [] @dygraph_only -def tanh(x, name=None): +def sin(x, name=None): """ - sparse tanh activation, requiring x to be a sparse coo or sparse csr tensor. + Calculate elementwise sin of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = sin(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.sin(sparse_x) + + """ + return _C_ops.final_state_sparse_sin(x) + + +@dygraph_only +def tan(x, name=None): + """ + Calculate elementwise tan of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + .. math:: - out = tanh(x) + out = tan(x) Parameters: x (Tensor): The input Sparse Tensor with data type float32, float64. @@ -39,21 +69,230 @@ def tanh(x, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - with _test_eager_guard(): - dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.tanh(sparse_x) + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.tan(sparse_x) + + """ + return _C_ops.final_state_sparse_tan(x) + + +@dygraph_only +def asin(x, name=None): + """ + Calculate elementwise asin of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = asin(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.asin(sparse_x) + + """ + return _C_ops.final_state_sparse_asin(x) + + +@dygraph_only +def atan(x, name=None): + """ + Calculate elementwise atan of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = atan(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.atan(sparse_x) + + """ + return _C_ops.final_state_sparse_atan(x) + + +@dygraph_only +def sinh(x, name=None): + """ + Calculate elementwise sinh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = sinh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.sinh(sparse_x) + + """ + return _C_ops.final_state_sparse_sinh(x) + + +@dygraph_only +def asinh(x, name=None): + """ + Calculate elementwise asinh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = asinh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.asinh(sparse_x) + + """ + return _C_ops.final_state_sparse_asinh(x) + + +@dygraph_only +def atanh(x, name=None): + """ + Calculate elementwise atanh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = atanh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.atanh(sparse_x) + + """ + return _C_ops.final_state_sparse_atanh(x) + + +@dygraph_only +def tanh(x, name=None): + """ + Calculate elementwise tanh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = tanh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. 
+ + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.tanh(sparse_x) + """ return _C_ops.final_state_sparse_tanh(x) @dygraph_only -def sqrt(x, name=None): +def square(x, name=None): """ - Calculate square root of x, requiring x to be a sparse coo or sparse csr tensor. + Calculate elementwise square of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + out = square(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.square(sparse_x) + + """ + return _C_ops.final_state_sparse_square(x) + + +@dygraph_only +def sqrt(x, name=None): + """ + Calculate elementwise sqrt of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + .. math:: out = sqrt(x) @@ -70,24 +309,149 @@ def sqrt(x, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - with _test_eager_guard(): - dense_x = paddle.to_tensor([4, 0, 1], dtype='float32') - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.sqrt(sparse_x) + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.sqrt(sparse_x) + """ return _C_ops.final_state_sparse_sqrt(x) @dygraph_only -def sin(x, name=None): +def log1p(x, name=None): """ - Calculate sin of x, requiring x to be a sparse coo or sparse csr tensor. + Calculate the natural log of (1+x), requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: - out = sin(x) + out = ln(1+x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.log1p(sparse_x) + + """ + return _C_ops.final_state_sparse_log1p(x) + + +@dygraph_only +def cast(x, index_dtype=None, value_dtype=None, name=None): + """ + cast non-zero-index of SparseTensor to `index_dtype`, non-zero-element of SparseTensor to + `value_dtype` , requiring x to be a SparseCooTensor or SparseCsrTensor. + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + index_dtype (np.dtype|str, optional): Data type of the index of SparseCooTensor, + or crows/cols of SparseCsrTensor. Can be uint8, int8, int16, int32, int64. + value_dtype (np.dtype|str, optional): Data type of the value of SparseCooTensor, + SparseCsrTensor. Can be bool, float16, float32, float64, int8, int32, int64, uint8. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2, 0, 1]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.cast(sparse_x, 'int32', 'float64') + + """ + if index_dtype and not isinstance(index_dtype, core.VarDesc.VarType): + index_dtype = convert_np_dtype_to_dtype_(index_dtype) + if value_dtype and not isinstance(value_dtype, core.VarDesc.VarType): + value_dtype = convert_np_dtype_to_dtype_(value_dtype) + return _C_ops.final_state_sparse_cast(x, index_dtype, value_dtype) + + +@dygraph_only +def pow(x, factor, name=None): + """ + Calculate elementwise pow of x, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = x^{factor} + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + factor (float|int): factor of pow. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.pow(sparse_x, 2) + + """ + return _C_ops.final_state_sparse_pow(x, float(factor)) + + +@dygraph_only +def neg(x, name=None): + """ + Calculate elementwise negative of x, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = -x + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.neg(sparse_x) + + """ + return _C_ops.final_state_sparse_scale(x, -1.0, 0.0, True) + + +@dygraph_only +def abs(x, name=None): + """ + Calculate elementwise absolute value of x, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = |x| Parameters: x (Tensor): The input Sparse Tensor with data type float32, float64. @@ -101,11 +465,41 @@ def sin(x, name=None): .. code-block:: python import paddle + + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.abs(sparse_x) + + """ + return _C_ops.final_state_sparse_abs(x) + + +@dygraph_only +def coalesce(x): + r""" + the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique. + + Parameters: + x (Tensor): the input SparseCooTensor. + + Returns: + Tensor: return the SparseCooTensor after coalesced. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.incubate import sparse from paddle.fluid.framework import _test_eager_guard with _test_eager_guard(): - dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.sin(sparse_x) - """ - return _C_ops.final_state_sparse_sin(x) + indices = [[0, 0, 1], [1, 1, 2]] + values = [1.0, 2.0, 3.0] + sp_x = sparse.sparse_coo_tensor(indices, values) + sp_x = sparse.coalesce(sp_x) + print(sp_x.indices()) + #[[0, 1], [1, 2]] + print(sp_x.values()) + #[3.0, 3.0] + """ + return _C_ops.final_state_sparse_coalesce(x) diff --git a/python/paddle/incubate/xpu/__init__.py b/python/paddle/incubate/xpu/__init__.py new file mode 100644 index 0000000000000..33a93b00f51da --- /dev/null +++ b/python/paddle/incubate/xpu/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .resnet_block import ResNetBasicBlock diff --git a/python/paddle/incubate/xpu/resnet_block.py b/python/paddle/incubate/xpu/resnet_block.py new file mode 100644 index 0000000000000..2b690cd7bf929 --- /dev/null +++ b/python/paddle/incubate/xpu/resnet_block.py @@ -0,0 +1,468 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
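# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of this patch): a minimal usage
# example for the ResNetBasicBlock layer defined below. The shapes, padding
# values and the availability of the fused resnet_basic_block op are
# assumptions; the module lives under paddle.incubate.xpu, so an XPU build is
# the intended target.
#
#     import paddle
#     from paddle.incubate.xpu import ResNetBasicBlock
#
#     block = ResNetBasicBlock(
#         num_channels1=64, num_filter1=64, filter1_size=3,
#         num_channels2=64, num_filter2=64, filter2_size=3,
#         num_channels3=64, num_filter3=64, filter3_size=1,
#         stride1=1, stride2=1, stride3=1,
#         padding1=1, padding2=1,            # keep H x W so the identity shortcut matches
#         has_shortcut=False, data_format='NCHW')
#
#     x = paddle.rand([4, 64, 56, 56])       # NCHW input
#     y = block(x)                           # fused conv + BN + ReLU basic block
# ---------------------------------------------------------------------------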
+ +import copy +import collections +import itertools +import six +import math +import sys +import warnings +from functools import partial, reduce + +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle import framework +from paddle.nn import initializer as I +from paddle.nn import Layer, LayerList +from paddle.fluid.layers import utils +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.data_feeder import convert_dtype +from paddle.fluid.param_attr import ParamAttr +from paddle import _C_ops + +__all__ = ['resnet_basic_block', 'ResNetBasicBlock'] + + +def resnet_basic_block(x, + filter1, + scale1, + bias1, + mean1, + var1, + filter2, + scale2, + bias2, + mean2, + var2, + filter3, + scale3, + bias3, + mean3, + var3, + stride1, + stride2, + stride3, + padding1, + padding2, + padding3, + dilation1, + dilation2, + dilation3, + groups, + momentum, + eps, + data_format, + has_shortcut, + use_global_stats=None, + training=False, + trainable_statistics=False, + find_conv_max=True): + + if fluid.framework.in_dygraph_mode(): + attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3, + 'padding1', padding1, 'padding2', padding2, 'padding3', + padding3, 'dilation1', dilation1, 'dilation2', dilation2, + 'dilation3', dilation3, 'group', groups, 'momentum', momentum, + 'epsilon', eps, 'data_format', data_format, 'has_shortcut', + has_shortcut, 'use_global_stats', use_global_stats, + "trainable_statistics", trainable_statistics, 'is_test', + not training, 'act_type', "relu", 'find_conv_input_max', + find_conv_max) + + out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \ + getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \ + filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs) + return out + helper = LayerHelper('resnet_basic_block', **locals()) + bn_param_dtype = fluid.core.VarDesc.VarType.FP32 + max_dtype = fluid.core.VarDesc.VarType.FP32 + + out = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + conv1 = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + saved_mean1 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_invstd1 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + running_mean1 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1 + running_var1 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1 + conv2 = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + saved_mean2 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_invstd2 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + running_mean2 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2 + running_var2 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2 + conv3 = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + saved_mean3 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_invstd3 = 
helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + running_mean3 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3 + running_var3 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3 + conv1_input_max = helper.create_variable_for_type_inference( + dtype=max_dtype, stop_gradient=True) + conv1_filter_max = helper.create_variable_for_type_inference( + dtype=max_dtype, stop_gradient=True) + conv2_input_max = helper.create_variable_for_type_inference( + dtype=max_dtype, stop_gradient=True) + conv2_filter_max = helper.create_variable_for_type_inference( + dtype=max_dtype, stop_gradient=True) + conv3_input_max = helper.create_variable_for_type_inference( + dtype=max_dtype, stop_gradient=True) + conv3_filter_max = helper.create_variable_for_type_inference( + dtype=max_dtype, stop_gradient=True) + + inputs = { + 'X': x, + 'Filter1': filter1, + 'Scale1': scale1, + 'Bias1': bias1, + 'Mean1': mean1, + 'Var1': var1, + 'Filter2': filter2, + 'Scale2': scale2, + 'Bias2': bias2, + 'Mean2': mean2, + 'Var2': var2, + 'Filter3': filter3, + 'Scale3': scale3, + 'Bias3': bias3, + 'Mean3': mean3, + 'Var3': var3, + } + + attrs = { + 'stride1': stride1, + 'stride2': stride2, + 'stride3': stride3, + 'padding1': padding1, + 'padding2': padding2, + 'padding3': padding3, + 'dilation1': dilation1, + 'dilation2': dilation2, + 'dilation3': dilation3, + 'group': groups, + 'momentum': momentum, + 'epsilon': eps, + 'data_format': data_format, + 'has_shortcut': has_shortcut, + 'use_global_stats': use_global_stats, + "trainable_statistics": trainable_statistics, + 'is_test': not training, + 'act_type': "relu", + 'find_conv_input_max': find_conv_max + } + + outputs = { + 'Y': out, + 'Conv1': conv1, + 'SavedMean1': saved_mean1, + 'SavedInvstd1': saved_invstd1, + 'Mean1Out': running_mean1, + 'Var1Out': running_var1, + 'Conv2': conv2, + 'SavedMean2': saved_mean2, + 'SavedInvstd2': saved_invstd2, + 'Mean2Out': running_mean2, + 'Var2Out': running_var2, + 'Conv2Input': conv2_input, + 'Conv3': conv3, + 'SavedMean3': saved_mean3, + 'SavedInvstd3': saved_invstd3, + 'Mean3Out': running_mean3, + 'Var3Out': running_var3, + 'MaxInput1': conv1_input_max, + 'MaxFilter1': conv1_filter_max, + 'MaxInput2': conv2_input_max, + 'MaxFilter2': conv2_filter_max, + 'MaxInput3': conv3_input_max, + 'MaxFilter3': conv3_filter_max, + } + helper.append_op(type='resnet_basic_block', + inputs=inputs, + outputs=outputs, + attrs=attrs) + return out + + +class ResNetBasicBlock(Layer): + """ + ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block. 
+ The fusion op architecture like this: + has_shortcut = True: else: + X X + / / + | | | | + CONV1 | CONV1 | + | | | | + BN1 | BN1 | + | | | | + RELU1 | RELU1 | + | | | | + CONV2 CONV3 CONV2 | + | | | | + BN2 BN3 BN2 | + \ / \ / + ADD ADD + | | + RELU RELU + | | + Y Y + """ + + def __init__(self, + num_channels1, + num_filter1, + filter1_size, + num_channels2, + num_filter2, + filter2_size, + num_channels3, + num_filter3, + filter3_size, + stride1=1, + stride2=1, + stride3=1, + act='relu', + momentum=0.9, + eps=1e-5, + data_format='NCHW', + has_shortcut=False, + use_global_stats=False, + is_test=False, + filter1_attr=None, + scale1_attr=None, + bias1_attr=None, + moving_mean1_name=None, + moving_var1_name=None, + filter2_attr=None, + scale2_attr=None, + bias2_attr=None, + moving_mean2_name=None, + moving_var2_name=None, + filter3_attr=None, + scale3_attr=None, + bias3_attr=None, + moving_mean3_name=None, + moving_var3_name=None, + padding1=0, + padding2=0, + padding3=0, + dilation1=1, + dilation2=1, + dilation3=1, + trainable_statistics=False, + find_conv_max=True): + super(ResNetBasicBlock, self).__init__() + self._stride1 = stride1 + self._stride2 = stride2 + self._kernel1_size = utils.convert_to_list(filter1_size, 2, + 'filter1_size') + self._kernel2_size = utils.convert_to_list(filter2_size, 2, + 'filter2_size') + self._dilation1 = dilation1 + self._dilation2 = dilation2 + self._padding1 = padding1 + self._padding2 = padding2 + self._groups = 1 + self._momentum = momentum + self._eps = eps + self._data_format = data_format + self._act = act + self._has_shortcut = has_shortcut + self._use_global_stats = use_global_stats + self._is_test = is_test + self._trainable_statistics = trainable_statistics + self._find_conv_max = find_conv_max + + if has_shortcut: + self._kernel3_size = utils.convert_to_list(filter3_size, 2, + 'filter3_size') + self._padding3 = padding3 + self._stride3 = stride3 + self._dilation3 = dilation3 + else: + self._kernel3_size = None + self._padding3 = 1 + self._stride3 = 1 + self._dilation3 = 1 + + # check format + valid_format = {'NCHW'} + if data_format not in valid_format: + raise ValueError( + "conv_format must be one of {}, but got conv_format={}".format( + valid_format, data_format)) + + def _get_default_param_initializer(channels, kernel_size): + filter_elem_num = np.prod(kernel_size) * channels + std = (2.0 / filter_elem_num)**0.5 + return I.Normal(0.0, std) + + # init filter + bn_param_dtype = fluid.core.VarDesc.VarType.FP32 + bn1_param_shape = [1, 1, num_filter1] + bn2_param_shape = [1, 1, num_filter2] + filter1_shape = [num_filter1, num_channels1, filter1_size, filter1_size] + filter2_shape = [num_filter2, num_channels2, filter2_size, filter2_size] + + self.filter_1 = self.create_parameter( + shape=filter1_shape, + attr=filter1_attr, + default_initializer=_get_default_param_initializer( + num_channels1, self._kernel1_size)) + self.scale_1 = self.create_parameter( + shape=bn1_param_shape, + attr=scale1_attr, + dtype=bn_param_dtype, + default_initializer=I.Constant(1.0)) + self.bias_1 = self.create_parameter(shape=bn1_param_shape, + attr=bias1_attr, + dtype=bn_param_dtype, + is_bias=True) + self.mean_1 = self.create_parameter(attr=ParamAttr( + name=moving_mean1_name, + initializer=I.Constant(0.0), + trainable=False), + shape=bn1_param_shape, + dtype=bn_param_dtype) + self.mean_1.stop_gradient = True + self.var_1 = self.create_parameter( + attr=ParamAttr(name=moving_var1_name, + initializer=I.Constant(1.0), + trainable=False), + shape=bn1_param_shape, + 
dtype=bn_param_dtype) + self.var_1.stop_gradient = True + + self.filter_2 = self.create_parameter( + shape=filter2_shape, + attr=filter2_attr, + default_initializer=_get_default_param_initializer( + num_channels2, self._kernel2_size)) + self.scale_2 = self.create_parameter( + shape=bn2_param_shape, + attr=scale2_attr, + dtype=bn_param_dtype, + default_initializer=I.Constant(1.0)) + self.bias_2 = self.create_parameter(shape=bn2_param_shape, + attr=bias2_attr, + dtype=bn_param_dtype, + is_bias=True) + self.mean_2 = self.create_parameter(attr=ParamAttr( + name=moving_mean2_name, + initializer=I.Constant(0.0), + trainable=False), + shape=bn2_param_shape, + dtype=bn_param_dtype) + self.mean_2.stop_gradient = True + self.var_2 = self.create_parameter( + attr=ParamAttr(name=moving_var2_name, + initializer=I.Constant(1.0), + trainable=False), + shape=bn2_param_shape, + dtype=bn_param_dtype) + self.var_2.stop_gradient = True + + if has_shortcut: + bn3_param_shape = [1, 1, num_filter3] + filter3_shape = [ + num_filter3, num_channels3, filter3_size, filter3_size + ] + self.filter_3 = self.create_parameter( + shape=filter3_shape, + attr=filter3_attr, + default_initializer=_get_default_param_initializer( + num_channels3, self._kernel3_size)) + self.scale_3 = self.create_parameter( + shape=bn3_param_shape, + attr=scale3_attr, + dtype=bn_param_dtype, + default_initializer=I.Constant(1.0)) + self.bias_3 = self.create_parameter(shape=bn3_param_shape, + attr=bias3_attr, + dtype=bn_param_dtype, + is_bias=True) + self.mean_3 = self.create_parameter(attr=ParamAttr( + name=moving_mean3_name, + initializer=I.Constant(0.0), + trainable=False), + shape=bn3_param_shape, + dtype=bn_param_dtype) + self.mean_3.stop_gradient = True + self.var_3 = self.create_parameter(attr=ParamAttr( + name=moving_var3_name, + initializer=I.Constant(1.0), + trainable=False), + shape=bn3_param_shape, + dtype=bn_param_dtype) + self.var_3.stop_gradient = True + else: + self.filter_3 = None + self.scale_3 = None + self.bias_3 = None + self.mean_3 = None + self.var_3 = None + + def forward(self, x): + out = resnet_basic_block( + x, + self.filter_1, + self.scale_1, + self.bias_1, + self.mean_1, + self.var_1, + self.filter_2, + self.scale_2, + self.bias_2, + self.mean_2, + self.var_2, + self.filter_3, + self.scale_3, + self.bias_3, + self.mean_3, + self.var_3, + self._stride1, + self._stride2, + self._stride3, + self._padding1, + self._padding2, + self._padding3, + self._dilation1, + self._dilation2, + self._dilation3, + self._groups, + self._momentum, + self._eps, + self._data_format, + self._has_shortcut, + use_global_stats=self._use_global_stats, + training=self.training, + trainable_statistics=self._trainable_statistics, + find_conv_max=self._find_conv_max) + return out diff --git a/python/paddle/jit/dy2static/__init__.py b/python/paddle/jit/dy2static/__init__.py index 0a51a3e265ede..ebb4d30a41212 100644 --- a/python/paddle/jit/dy2static/__init__.py +++ b/python/paddle/jit/dy2static/__init__.py @@ -26,7 +26,8 @@ from .convert_operators import convert_print as Print # noqa: F401 from .convert_operators import convert_shape as Shape # noqa: F401 from .convert_operators import convert_while_loop as While # noqa: F401 - +from .convert_operators import unpack_by_structure as Unpack # noqa: F401 +from .convert_operators import indexable as Indexable # noqa: F401 from .variable_trans_func import create_bool_as_type # noqa: F401 from .variable_trans_func import to_static_variable # noqa: F401 from .convert_operators import convert_shape_compare # 
noqa: F401 diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index 59ffedef0a900..691c8c0cfbea3 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -26,5 +26,6 @@ from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_var_dtype # noqa: F401 from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_shape # noqa: F401 from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_while_loop # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import unpack_by_structure, indexable # noqa: F401 __all__ = [] diff --git a/python/paddle/jit/layer.py b/python/paddle/jit/layer.py new file mode 100644 index 0000000000000..4aee7a8f5c02a --- /dev/null +++ b/python/paddle/jit/layer.py @@ -0,0 +1,52 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core +from paddle.fluid.core import Load + + +class Layer(object): + + def __init__(self): + self.cpp_layer = None + # {name: Function} + self.functions = {} + + def load(self, load_path, place): + self.cpp_layer = Load(load_path, place) + function_dict = self.cpp_layer.function_dict() + + for name, function in function_dict.items(): + self.functions[name] = Function(function) + setattr(self, name, self.functions[name]) + + +class Function(): + + def __init__(self, function): + self.function = function + self.info = FunctionInfo(function.info()) + + def __call__(self, *args): + return core.eager.jit_function_call(self.function, args) + + +class FunctionInfo(): + + def __init__(self, info): + self.info = info + + def name(self): + return self.info.name() diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 4d28b68f99456..919daa31d06fc 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -776,7 +776,7 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` . - label(Tensor): The label of dataset. Tensor with type int64. The shape is ``[sample_number, 1]`` . + label(Tensor): The label of dataset. Tensor with type int64 or int32. The shape is ``[sample_number, 1]`` . k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. total(Tensor, optional): The total entries count. A tensor with type int64 or int32. 
@@ -796,6 +796,8 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): result = paddle.metric.accuracy(input=predictions, label=label, k=1) # [0.5] """ + if label.dtype == paddle.int32: + label = paddle.cast(label, paddle.int64) if _non_static_mode(): if correct is None: correct = _varbase_creator(dtype="int32") diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index d8dc68376d163..3e73cab8f2cd8 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -130,6 +130,10 @@ def _conv_nd(x, if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim + if isinstance(x, tuple): + x = x[0] + if isinstance(bias, tuple): + bias = bias[0] if len(bias.shape) < len(x.shape): tmp_bias = _C_ops.final_state_reshape( bias, bias.shape + diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index daa6925c4b907..c1dd9c48fb5a1 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -15,7 +15,7 @@ from enum import Enum import re -from paddle.fluid.core import TracerEventType +from paddle.fluid.core import TracerEventType, TracerMemEventType from .statistic_helper import * @@ -79,19 +79,14 @@ def __init__(self, hostnode): self.self_gpu_time = 0 self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset self.self_general_gpu_time = 0 - self.is_terminal_operator_node = True def cal_statistic(self): for child in self.children_node: child.cal_statistic() - if child.is_terminal_operator_node == False: - self.is_terminal_operator_node = False for rt in self.runtime_node: rt.cal_statistic() self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns for child in self.children_node: - if child.type == TracerEventType.Operator: - self.is_terminal_operator_node = False self.gpu_time += child.gpu_time self.general_gpu_time += child.general_gpu_time self.self_cpu_time -= (child.end_ns - child.start_ns) @@ -421,10 +416,11 @@ def add_item(self, node): self.add_gpu_time(node.gpu_time) self.add_general_gpu_time(node.general_gpu_time) for child in node.children_node: - if child.name not in self.operator_inners: - self.operator_inners[ - child.name] = EventSummary.OperatorItem(child.name) - self.operator_inners[child.name].add_item(child) + if child.type != TracerEventType.Operator: + if child.name not in self.operator_inners: + self.operator_inners[ + child.name] = EventSummary.OperatorItem(child.name) + self.operator_inners[child.name].add_item(child) for runtimenode in node.runtime_node: for devicenode in runtimenode.device_node: @@ -537,8 +533,6 @@ def parse(self, nodetrees): deque.append(child) def add_operator_item(self, operator_node): - if operator_node.is_terminal_operator_node == False: - return if operator_node.name not in self.items: self.items[operator_node.name] = EventSummary.OperatorItem( operator_node.name) @@ -603,6 +597,83 @@ def add_kernel_item(self, root_node): self.kernel_items[name].add_item(device_node) +class MemorySummary: + r""" + Analyse memory events in profiling data. 
+ """ + + class MemoryItem: + + def __init__(self, event_name, place, memory_type='Allocated'): + self.event_name = event_name + self.place = place + self.allocation_count = 0 + self.free_count = 0 + self.allocation_size = 0 + self.free_size = 0 + self.increase_size = 0 + self.memory_type = memory_type + + def add_memory_record(self, size, allocation_type): + if allocation_type == TracerMemEventType.Allocate or allocation_type == TracerMemEventType.ReservedAllocate: + self.allocation_count += 1 + self.allocation_size += size + + elif allocation_type == TracerMemEventType.Free or allocation_type == TracerMemEventType.ReservedFree: + self.free_count += 1 + self.free_size -= size # size is sign(-) when free. + + else: + print("No corresponding type.") + self.increase_size = self.allocation_size - self.free_size + + def __init__(self): + self.allocated_items = collections.defaultdict( + dict) # for memory summary, device type: event + self.reserved_items = collections.defaultdict( + dict) # for memory summary, device type: event + self.peak_allocation_values = collections.defaultdict(int) + self.peak_reserved_values = collections.defaultdict(int) + + def _analyse_node_memory(self, event_name, node): + for memnode in node.mem_node: # self mem node + if memnode.type == TracerMemEventType.Allocate or memnode.type == TracerMemEventType.Free: + if event_name not in self.allocated_items[memnode.place]: + self.allocated_items[ + memnode.place][event_name] = MemorySummary.MemoryItem( + event_name, memnode.place, 'Allocated') + self.allocated_items[ + memnode.place][event_name].add_memory_record( + memnode.increase_bytes, memnode.type) + elif memnode.type == TracerMemEventType.ReservedAllocate or memnode.type == TracerMemEventType.ReservedFree: + if event_name not in self.reserved_items[memnode.place]: + self.reserved_items[ + memnode.place][event_name] = MemorySummary.MemoryItem( + event_name, memnode.place, 'Reserved') + self.reserved_items[ + memnode.place][event_name].add_memory_record( + memnode.increase_bytes, memnode.type) + self.peak_allocation_values[memnode.place] = max( + self.peak_allocation_values[memnode.place], + memnode.peak_allocated) + self.peak_reserved_values[memnode.place] = max( + self.peak_reserved_values[memnode.place], memnode.peak_reserved) + + def parse(self, nodetrees): + r""" + Analyse memory event in the nodetress. + """ + thread2hostnodes = traverse_tree(nodetrees) + for threadid, host_nodes in thread2hostnodes.items(): + for host_node in host_nodes[1:]: #skip root node + if host_node.type == TracerEventType.OperatorInner: + continue + if host_node.type == TracerEventType.Operator: + for child in host_node.children_node: + self._analyse_node_memory(host_node.name, child) + self._analyse_node_memory(host_node.name, host_node) + + class StatisticData: r""" Hold all analysed results. 
@@ -614,9 +685,11 @@ def __init__(self, node_trees, extra_info): self.time_range_summary = TimeRangeSummary() self.event_summary = EventSummary() self.distributed_summary = DistributedSummary() + self.memory_summary = MemorySummary() self.time_range_summary.parse(node_trees) self.event_summary.parse(node_trees) self.distributed_summary.parse(node_trees) + self.memory_summary.parse(node_trees) def _build_table(statistic_data, @@ -1498,4 +1571,76 @@ def format_ratio(ratio, indent=0): append('') append('') + ###### Print Memory Summary Report ###### + if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items: + for device_type, memory_events in statistic_data.memory_summary.allocated_items.items( + ): + all_row_values = [] + sorted_items = sorted(memory_events.items(), + key=lambda x: x[1].increase_size, + reverse=True) + + for event_name, item in sorted_items: + row_values = [ + event_name, item.memory_type, item.allocation_count, + item.free_count, item.allocation_size, item.free_size, + item.increase_size + ] + all_row_values.append(row_values) + + sorted_reserved_items = sorted(statistic_data.memory_summary. + reserved_items[device_type].items(), + key=lambda x: x[1].increase_size, + reverse=True) + for event_name, item in sorted_reserved_items: + row_values = [ + event_name, item.memory_type, item.allocation_count, + item.free_count, item.allocation_size, item.free_size, + item.increase_size + ] + all_row_values.append(row_values) + + # Calculate the column width + headers = [ + 'Name', 'Type', 'Allocation Count', 'Free Count', + 'Allocation Size', 'Free Size', 'Increased Size' + ] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + name_column_width = 50 + number_column_width = 15 + add_column(name_column_width) + add_column(12) + add_column(number_column_width) + add_column(number_column_width) + add_column(number_column_width) + add_column(number_column_width) + add_column(number_column_width) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append( + add_title(line_length, + "Memory Summary - {}".format(device_type))) + append('Peak Allocated Memory: {}'.format( + statistic_data.memory_summary. 
+ peak_allocation_values[device_type])) + append('Peak Reserved Memory: {}'.format( + statistic_data.memory_summary.peak_reserved_values[device_type]) + ) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + for row_values in all_row_values: + if isinstance(row_values, str): + append(add_title(line_length, row_values)) + else: + append(row_format.format(*row_values)) + append('') + append('') + return ''.join(result) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index b73fe74a40ba2..85f8ba4aa4f45 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1701,6 +1701,9 @@ def complex(real, imag, name=None): # [[0.+0.j 0.+1.j 0.+2.j] # [1.+0.j 1.+1.j 1.+2.j]] """ + if in_dygraph_mode(): + return _C_ops.final_state_complex(real, imag) + if paddle.in_dynamic_mode(): return paddle._C_ops.complex(real, imag) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index c704a1b52d14e..1bc85a076a0f7 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1017,11 +1017,12 @@ def dot(x, y, name=None): print(z) """ + if in_dygraph_mode(): + return _C_ops.final_state_dot(x, y) + if _in_legacy_dygraph(): + return _C_ops.dot(x, y) + op_type = 'dot' - # skip var type check in dygraph mode to improve efficiency - if paddle.in_dynamic_mode(): - op = getattr(_C_ops, op_type) - return op(x, y) assert x is not None, 'x cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type) @@ -2338,7 +2339,9 @@ def eigvals(x, name=None): "The last two dimensions of Input(x) should be equal, but received x's shape = {}" .format(x_shape)) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_eigvals(x) + elif paddle.in_dynamic_mode(): return _C_ops.eigvals(x) helper = LayerHelper('eigvals', **locals()) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index c445402412e16..6c4b1cd22b0ef 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2066,7 +2066,18 @@ def unique_consecutive(x, else: axis = [axis] attr_dtype = convert_np_dtype_to_dtype_(dtype) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + out, inverse, counts = _C_ops.final_state_unique_consecutive( + x, return_inverse, return_counts, axis, attr_dtype) + outs = [out] + if return_inverse: + outs.append(inverse) + if return_counts: + outs.append(counts) + if len(outs) == 1: + return outs[0] + return tuple(outs) + elif paddle.in_dynamic_mode(): out, inverse, counts = _C_ops.unique_consecutive( x, 'dtype', attr_dtype, 'return_inverse', return_inverse, 'return_counts', return_counts, 'axis', axis) @@ -2842,8 +2853,7 @@ def tile(x, repeat_times, name=None): """ if in_dygraph_mode(): if isinstance(repeat_times, core.eager.Tensor): - assert (repeat_times.ndim == 1, - "Only support ndim == 1 while repeat_times is a Tensor.") + assert repeat_times.ndim == 1, "Only support ndim == 1 while repeat_times is a Tensor." 
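# Editor's note (not part of this patch): the assert rewrite above fixes a
# classic pitfall -- "assert (cond, msg)" asserts a non-empty tuple, which is
# always truthy, so the old check could never fail. "assert cond, msg"
# restores the intended validation, e.g.:
#
#     assert (False, "never raises")     # passes: the tuple itself is truthy
#     # assert False, "always raises"    # raises AssertionError as intended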
repeat_times = repeat_times.numpy().tolist() return _C_ops.final_state_tile(x, repeat_times) diff --git a/python/paddle/utils/code_gen/args_compat.yaml b/python/paddle/utils/code_gen/args_compat.yaml deleted file mode 100644 index 17f1d545057c0..0000000000000 --- a/python/paddle/utils/code_gen/args_compat.yaml +++ /dev/null @@ -1,25 +0,0 @@ -- api : atan2 - inputs : - x : X1 - y : X2 - outputs : - out : Out - -- api : cross - inputs : {x : X, y : Y} - attrs : - axis : dim - outputs : - out : Out - -- api : diagonal - inputs : - x : Input - outputs : - out : Out - -- api : trace - inputs : - x : Input - outputs : - out : Out diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml deleted file mode 100644 index e99009a70fc3b..0000000000000 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ /dev/null @@ -1,179 +0,0 @@ -- api : add - args : (Tensor x, Tensor y) - output : Tensor(out) - kernel : - func : add_coo_coo{sparse_coo -> sparse_coo}, - add_csr_csr{sparse_csr -> sparse_csr} - layout : x - backward : add_grad - -- api : conv3d - args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) - output : Tensor(out), Tensor(rulebook) - kernel : - func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense} - layout : x - intermediate : rulebook - backward : conv3d_grad - -- api : coo_to_dense - args : (Tensor x) - output : Tensor(out) - invoke : to_dense_impl(x) - backward : coo_to_dense_grad - -- api : create_sparse_coo_tensor - args : (Tensor values, Tensor indices, IntArray dense_shape) - output : Tensor(out) - kernel : - func : sparse_coo_tensor{dense, dense -> sparse_coo} - layout : values - data_type : values - backward : create_sparse_coo_tensor_grad - -- api : dense_to_coo - args : (Tensor x, int64_t sparse_dim) - output : Tensor(out) - invoke : to_sparse_coo_impl(x, sparse_dim) - backward : dense_to_coo_grad - -- api : divide - args : (Tensor x, Tensor y) - output : Tensor(out) - kernel : - func : divide_coo_coo{sparse_coo -> sparse_coo}, - divide_csr_csr{sparse_csr -> sparse_csr} - layout : x - backward : divide_grad - -- api : multiply - args : (Tensor x, Tensor y) - output : Tensor(out) - kernel : - func : multiply_coo_coo{sparse_coo -> sparse_coo}, - multiply_csr_csr{sparse_csr -> sparse_csr} - layout : x - backward : multiply_grad - -- api : relu - args : (Tensor x) - output : Tensor(out) - kernel : - func : sparse_coo_relu{sparse_coo -> sparse_coo}, - sparse_csr_relu{sparse_csr -> sparse_csr} - layout : x - backward : relu_grad - -- api : sin - args : (Tensor x) - output : Tensor(out@SparseCooTensor) - kernel : - func : sparse_coo_sin {sparse_coo -> sparse_coo}, - sparse_csr_sin {sparse_csr -> sparse_csr} - layout : x - backward : sin_grad - -- api : softmax - args : (Tensor x, int axis=-1) - output : Tensor(out) - kernel : - func : softmax_csr{sparse_csr -> sparse_csr} - layout : x - backward : softmax_grad - -- api : sqrt - args : (Tensor x) - output : Tensor(out) - kernel : - func : sparse_coo_sqrt{sparse_coo -> sparse_coo}, - sparse_csr_sqrt{sparse_csr -> sparse_csr} - layout : x - backward : sqrt_grad - -- api : subtract - args : (Tensor x, Tensor y) - output : Tensor(out) - kernel : - func : subtract_coo_coo{sparse_coo -> sparse_coo}, - subtract_csr_csr{sparse_csr -> sparse_csr} - layout : x - backward : subtract_grad - -- api : tanh - args : (Tensor x) - output : Tensor(out) - kernel : - func : sparse_coo_tanh{sparse_coo -> sparse_coo}, - sparse_csr_tanh{sparse_csr -> 
sparse_csr} - layout : x - backward : tanh_grad - -- api : to_dense - args : (Tensor x) - output : Tensor(out) - invoke : to_dense_impl(x) - -- api : to_sparse_coo - args : (Tensor x, int64_t sparse_dim) - output : Tensor(out) - invoke : to_sparse_coo_impl(x, sparse_dim) - -- api : to_sparse_csr - args : (Tensor x) - output : Tensor(out) - invoke : to_sparse_csr_impl(x) - -- api : values - args : (Tensor x) - output : Tensor(out) - kernel : - func : coo_values{sparse_coo -> dense}, - csr_values{sparse_csr -> dense} - layout : x - backward : values_grad - -- api: full_like - args : (Tensor x, Scalar value, DataType dtype=DataType::UNDEFINED) - output : Tensor(out) - kernel : - func : coo_full_like{sparse_coo -> sparse_coo}, - csr_full_like{sparse_csr -> sparse_csr} - layout : x - data_type : dtype - -- api: masked_matmul - args : (Tensor x, Tensor y, Tensor mask) - output : Tensor(out) - kernel : - func : csr_masked_matmul{dense, dense, sparse_csr -> sparse_csr} - layout : x - backward: masked_matmul_grad - -- api: matmul - args : (Tensor x, Tensor y) - output : Tensor(out) - kernel : - func : csr_dense_matmul{sparse_csr, dense -> dense}, - csr_csr_matmul{sparse_csr, sparse_csr -> sparse_csr}, - coo_dense_matmul{sparse_coo, dense -> dense}, - coo_coo_matmul{sparse_coo, sparse_coo -> sparse_coo} - layout : x - backward: matmul_grad - -- api: maxpool - args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) - output : Tensor(out), Tensor(rulebook) - kernel : - func : sparse_maxpool{sparse_coo -> sparse_coo, dense} - layout : x - intermediate : rulebook - backward : sparse_maxpool_grad - -- api: mv - args : (Tensor x, Tensor vec) - output : Tensor(out) - kernel : - func : mv_coo{sparse_coo, dense -> dense}, - mv_csr{sparse_csr, dense -> dense} - layout : x - backward: mv_grad diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml deleted file mode 100644 index 6ceedb0978121..0000000000000 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ /dev/null @@ -1,129 +0,0 @@ -- backward_api : add_grad - forward : add(Tensor x, Tensor y) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad) - kernel : - func : add_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, - add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - -- backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) - output : Tensor(x_grad), Tensor(kernel_grad) - kernel : - func : sparse_conv3d_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} - -- backward_api : coo_to_dense_grad - forward : coo_to_dense(Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - kernel : - func : sparse_coo_to_dense_grad{sparse_coo, dense-> sparse_coo} - -- backward_api : create_sparse_coo_tensor_grad - forward : create_sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out) - args : (Tensor indices, Tensor out_grad) - output : Tensor(values_grad) - kernel : - func : sparse_coo_tensor_grad{dense, sparse_coo -> dense} - -- backward_api : dense_to_coo_grad - forward : dense_to_coo(Tensor x, int64_t sparse_dim) 
-> Tensor(out) - args : (Tensor out_grad) - output : Tensor(x_grad) - invoke : to_dense_impl(out_grad) - -- backward_api : divide_grad - forward : divide(Tensor x, Tensor y) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad) - kernel : - func : divide_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, - divide_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - -- backward_api : masked_matmul_grad - forward : masked_matmul(Tensor x, Tensor y, Tensor mask) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad) - kernel : - func : csr_masked_matmul_grad{dense, dense, sparse_csr -> dense, dense} - -- backward_api : matmul_grad - forward : matmul(Tensor x, Tensor y) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad) - kernel : - func : csr_dense_matmul_grad{sparse_csr, dense, dense -> sparse_csr, dense} - -- backward_api : multiply_grad - forward : multiply(Tensor x, Tensor y) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad) - kernel : - func : multiply_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, - multiply_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - -- backward_api : mv_grad - forward : mv(Tensor x, Tensor vec) -> Tensor(out) - args : (Tensor x, Tensor vec, Tensor out_grad) - output : Tensor(x_grad), Tensor(vec_grad) - kernel : - func : mv_coo_grad{sparse_coo, dense, dense -> sparse_coo, dense}, - mv_csr_grad{sparse_csr, dense, dense -> sparse_csr, dense} - -- backward_api : relu_grad - forward : relu(Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad) - output : Tensor(x_grad) - kernel : - func : sparse_coo_relu_grad {sparse_coo, sparse_coo -> sparse_coo} - -- backward_api : sin_grad - forward : sin(Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - kernel : - func : sparse_coo_sin_grad {sparse_coo, sparse_coo -> sparse_coo} - -- backward_api : softmax_grad - forward : softmax(Tensor x, int axis=-1) -> Tensor(out) - args : (Tensor out, Tensor out_grad, int axis) - output : Tensor(x_grad) - kernel : - func : softmax_csr_grad{sparse_csr, sparse_csr -> sparse_csr} - -- backward_api : sparse_maxpool_grad - forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook) - args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) - output : Tensor(x_grad) - kernel : - func : sparse_maxpool_grad {sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo} - -- backward_api : sqrt_grad - forward : sqrt(Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad) - output : Tensor(x_grad) - kernel : - func : sparse_coo_sqrt_grad {sparse_coo, sparse_coo -> sparse_coo} - -- backward_api : subtract_grad - forward : subtract(Tensor x, Tensor y) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad) - kernel : - func : subtract_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, - subtract_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} - -- backward_api : tanh_grad - forward : tanh(Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad) - output : Tensor(x_grad) - kernel : - func : 
sparse_coo_tanh_grad {sparse_coo, sparse_coo -> sparse_coo} - -- backward_api : values_grad - forward : coo_values(Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - kernel : - func : coo_values_grad{sparse_coo, dense-> sparse_coo} diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index cc5a0caf71f47..cdb8417b6b9c2 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -24,9 +24,21 @@ from paddle import _C_ops __all__ = [ #noqa - 'yolo_loss', 'yolo_box', 'deform_conv2d', 'DeformConv2D', 'read_file', - 'decode_jpeg', 'roi_pool', 'RoIPool', 'psroi_pool', 'PSRoIPool', - 'roi_align', 'RoIAlign', 'nms', 'generate_proposals' + 'yolo_loss', + 'yolo_box', + 'deform_conv2d', + 'DeformConv2D', + 'distribute_fpn_proposals', + 'generate_proposals', + 'read_file', + 'decode_jpeg', + 'roi_pool', + 'RoIPool', + 'psroi_pool', + 'PSRoIPool', + 'roi_align', + 'RoIAlign', + 'nms', ] @@ -825,6 +837,123 @@ def forward(self, x, offset, mask=None): return out +def distribute_fpn_proposals(fpn_rois, + min_level, + max_level, + refer_level, + refer_scale, + pixel_offset=False, + rois_num=None, + name=None): + r""" + In Feature Pyramid Networks (FPN) models, all proposals need to be distributed + to different FPN levels according to the scale of each proposal, the referring scale + and the referring level. In addition, to restore the original order of the proposals, + an index array is returned that records the original position of each roi + in the input proposals. The FPN level of each roi is computed as follows: + + .. math:: + roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\ + level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level) + + where BBoxArea is a function to compute the area of each roi. + + Args: + fpn_rois (Tensor): The input fpn_rois. A 2-D Tensor with shape [N, 4] whose data type can be + float32 or float64. + min_level (int): The lowest level of the FPN layer where the proposals come + from. + max_level (int): The highest level of the FPN layer where the proposals + come from. + refer_level (int): The referring level of the FPN layer with the specified scale. + refer_scale (int): The referring scale of the FPN layer with the specified level. + pixel_offset (bool, optional): Whether there is a pixel offset. If True, the offset of the + image shape will be 1. False by default. + rois_num (Tensor, optional): A 1-D Tensor containing the number of RoIs in each image. + The shape is [B] and the data type is int32, where B is the number of images. + If rois_num is not None, a list of 1-D Tensors is also returned, where each element + is the number of output RoIs of each image on the corresponding level + and the shape is [B]. None by default. + name (str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually there is no need to set it; it is + None by default. + + Returns: + multi_rois (List): The proposals in each FPN level. It is a list of 2-D Tensors with shape [M, 4], where M is + the number of proposals in the corresponding level and the data type is the same as `fpn_rois`. The length of the list is max_level-min_level+1. + restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1], + where N is the total number of rois. The data type is int32. + rois_num_per_level (List): A list of 1-D Tensors, where each Tensor is + the number of RoIs of each image on the corresponding level. The shape + is [B] and the data type is int32, where B is the number of images. + + Examples: + ..
code-block:: python + + import paddle + + fpn_rois = paddle.rand((10, 4)) + rois_num = paddle.to_tensor([3, 1, 4, 2], dtype=paddle.int32) + + multi_rois, restore_ind, rois_num_per_level = paddle.vision.ops.distribute_fpn_proposals( + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num) + """ + num_lvl = max_level - min_level + 1 + + if _non_static_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', + refer_level, 'refer_scale', refer_scale, 'pixel_offset', + pixel_offset) + multi_rois, restore_ind, rois_num_per_level = _C_ops.distribute_fpn_proposals( + fpn_rois, rois_num, num_lvl, num_lvl, *attrs) + return multi_rois, restore_ind, rois_num_per_level + + else: + check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], + 'distribute_fpn_proposals') + helper = LayerHelper('distribute_fpn_proposals', **locals()) + dtype = helper.input_dtype('fpn_rois') + multi_rois = [ + helper.create_variable_for_type_inference(dtype) + for i in range(num_lvl) + ] + + restore_ind = helper.create_variable_for_type_inference(dtype='int32') + + inputs = {'FpnRois': fpn_rois} + outputs = { + 'MultiFpnRois': multi_rois, + 'RestoreIndex': restore_ind, + } + + if rois_num is not None: + inputs['RoisNum'] = rois_num + rois_num_per_level = [ + helper.create_variable_for_type_inference(dtype='int32') + for i in range(num_lvl) + ] + outputs['MultiLevelRoIsNum'] = rois_num_per_level + else: + rois_num_per_level = None + + helper.append_op(type='distribute_fpn_proposals', + inputs=inputs, + outputs=outputs, + attrs={ + 'min_level': min_level, + 'max_level': max_level, + 'refer_level': refer_level, + 'refer_scale': refer_scale, + 'pixel_offset': pixel_offset + }) + return multi_rois, restore_ind, rois_num_per_level + + def read_file(filename, name=None): """ Reads and outputs the bytes contents of a file as a uint8 Tensor diff --git a/python/setup.py.in b/python/setup.py.in index 567a411d0980b..c02ef7f017fca 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -379,6 +379,7 @@ packages=['paddle', 'paddle.incubate.sparse.nn', 'paddle.incubate.sparse.nn.layer', 'paddle.incubate.sparse.nn.functional', + 'paddle.incubate.xpu', 'paddle.io', 'paddle.optimizer', 'paddle.nn', @@ -578,7 +579,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') else: - commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] + commands = ["patchelf --set-soname '${FLUID_CORE_NAME}.so' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] + commands.append("patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') # The sw_64 not suppot patchelf, so we just disable that. 
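Editorial aside, not part of the patch above: a minimal, self-contained sketch of the level-assignment formula documented in distribute_fpn_proposals earlier in this diff, assuming a base-2 logarithm as in the FPN paper; the helper name fpn_level and its default arguments are illustrative only, not Paddle API.

import math

def fpn_level(roi, refer_level=4, refer_scale=224, min_level=2, max_level=5):
    # roi is (x1, y1, x2, y2); roi_scale = sqrt(BBoxArea(roi))
    x1, y1, x2, y2 = roi
    roi_scale = math.sqrt(max(x2 - x1, 0.0) * max(y2 - y1, 0.0))
    # level = floor(log2(roi_scale / refer_scale) + refer_level), clamped to [min_level, max_level]
    level = math.floor(math.log2(roi_scale / refer_scale) + refer_level)
    return min(max(level, min_level), max_level)

# a 224 x 224 RoI maps to the reference level (4); a 112 x 112 RoI maps to level 3
print(fpn_level((0, 0, 224, 224)), fpn_level((0, 0, 112, 112)))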
if platform.machine() != 'sw_64' and platform.machine() != 'mips64': for command in commands: diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 18b467ccf4781..87edff50ef85e 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -43,22 +43,22 @@ api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/flu if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then echo_line="You must have one RD (XiaoguangHu01, lanxianghit or Superjomn) approval for API change.\n" echo_line="${echo_line} and one TPM approval for API change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general APIs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general APIs.\n" echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n" echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n" check_approval 1 46782768 47554610 328693 - check_approval 1 29231 23093488 11935832 39876205 65896652 54695910 + check_approval 1 29231 79295425 23093488 39876205 65896652 54695910 fi api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc` if [ "$api_doc_spec_diff" != "" ]; then echo_line="You must have one TPM approval for API documents change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general API docs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general API docs.\n" echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related API docs.\n" echo_line="${echo_line} leiqing1/LeiQing for inference related API docs.\n" - check_approval 1 29231 23093488 11935832 39876205 65896652 54695910 + check_approval 1 29231 79295425 23093488 39876205 65896652 54695910 fi api_src_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 9ed85c699d1e9..55b55faabf993 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -238,12 +238,6 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 328693 6836917 39303645 fi -HAS_MODIFIED_ALLOCATION=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/memory/allocation" || true` -if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must be approved by zhiqiu and Shixiaowei02 for paddle/fluid/memory/allocation.\nIt is being modularized and refactored. Thanks!\n" - check_approval 1 6888866 39303645 - fi - HAS_MODIFIED_DECLARATIONS=`git diff -U0 upstream/$BRANCH |grep "^+" |grep "paddle/phi/kernels/declarations.h" || true` if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must be approved by chenwhql or zyfncg for paddle/phi/kernels/declarations.h using. 
Thanks!\n" diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index cef11ab1351b7..3c584a440ee1f 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -24,7 +24,7 @@ for file in $files; do if [[ $file =~ ^(patches/.*) ]]; then continue; else - cpplint --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11 $file; + cpplint --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); fi done diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index c0a5139313029..39b0d5484a8ff 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -72,12 +72,11 @@ def get_skipped_kernel_list(): def get_api_yaml_info(file_path): apis = [] - with open(file_path + "/python/paddle/utils/code_gen/api.yaml", 'r') as f: + with open(file_path + "/paddle/phi/api/yaml/api.yaml", 'r') as f: api_list = yaml.load(f, Loader=yaml.FullLoader) if api_list: apis.extend(api_list) - with open(file_path + "/python/paddle/utils/code_gen/legacy_api.yaml", - 'r') as f: + with open(file_path + "/paddle/phi/api/yaml/legacy_api.yaml", 'r') as f: legacy_api_list = yaml.load(f, Loader=yaml.FullLoader) if legacy_api_list: apis.extend(legacy_api_list) diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh index 92076803cf65e..69926e28cb54b 100644 --- a/tools/infrt/get_phi_kernel_function.sh +++ b/tools/infrt/get_phi_kernel_function.sh @@ -77,8 +77,8 @@ done #step 2:get simple general inferMeta function wrap info temp_path=`mktemp -d` -python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ - --api_yaml_path ${PADDLE_ROOT}/python/paddle/utils/code_gen/api.yaml ${PADDLE_ROOT}/python/paddle/utils/code_gen/legacy_api.yaml \ +python3 ${PADDLE_ROOT}/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py \ + --api_yaml_path ${PADDLE_ROOT}/paddle/phi/api/yaml/api.yaml ${PADDLE_ROOT}/paddle/phi/api/yaml/legacy_api.yaml \ --wrapped_infermeta_header_path ${temp_path}/generate.h \ --wrapped_infermeta_source_path ${temp_path}/generate.cc diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 6c4f40d215fc1..4837ca582135c 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -20,8 +20,8 @@ from typing import List, Dict, Any skipped_phi_api_list_file = "/tools/infrt/skipped_phi_api.json" -api_yaml_file = "/python/paddle/utils/code_gen/api.yaml" -legacy_api_yaml_file = "/python/paddle/utils/code_gen/legacy_api.yaml" +api_yaml_file = "/paddle/phi/api/yaml/api.yaml" +legacy_api_yaml_file = "/paddle/phi/api/yaml/legacy_api.yaml" def get_skipped_kernel_list(): diff --git a/tools/nvcc_lazy b/tools/nvcc_lazy index 9cb49b04ffaff..e3e7e361021c2 100755 --- a/tools/nvcc_lazy +++ b/tools/nvcc_lazy @@ -1,4 +1,6 @@ #!/usr/bin/env bash +unset GREP_OPTIONS +set -e # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# @@ -17,11 +19,11 @@ ## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY -# check nvcc version, if nvcc >= 11.6, just run nvcc itself -CUDA_VERSION=$(nvcc --version | grep -oP '(?<=cuda_)\d*\.\d*') +# check nvcc version, if nvcc >= 11.7, just run nvcc itself +CUDA_VERSION=$(nvcc --version | grep -oP '(?<=V)\d*\.\d*') CUDA_VERSION_MAJOR=${CUDA_VERSION%.*} CUDA_VERSION_MINOR=${CUDA_VERSION#*.} -if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 6) )); then +if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 7) )); then nvcc "$@" exit fi diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 44083d660c6e1..f751709a767a5 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -292,6 +292,11 @@ def parse_args(): help="using get_all_api or from_modulelist") parser.add_argument('module', type=str, help='module', default='paddle') # not used + parser.add_argument('--skipped', + dest='skipped', + type=str, + help='Skip Checking submodules', + default='paddle.fluid.core_avx.eager.ops') if len(sys.argv) == 1: args = parser.parse_args(['paddle']) @@ -320,6 +325,8 @@ def parse_args(): all_api_names_to_k[api_name] = k all_api_names_sorted = sorted(all_api_names_to_k.keys()) for api_name in all_api_names_sorted: + if args.skipped != '' and api_name.find(args.skipped) >= 0: + continue api_info = api_info_dict[all_api_names_to_k[api_name]] print("{0} ({2}, ('document', '{1}'))".format( api_name, md5(api_info['docstring']), api_info['signature'] diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py index d53b21d6c3723..cbc03393360d2 100644 --- a/tools/prune_for_jetson.py +++ b/tools/prune_for_jetson.py @@ -125,7 +125,7 @@ def append_fluid_kernels(): with io.open(file_name, 'r', encoding='utf-8') as f: content = ''.join(f.readlines()) - location_str = "nv_library(\n tensorrt_op_teller\n SRCS op_teller.cc\n DEPS framework_proto device_context boost)" + location_str = "nv_library(\n tensorrt_op_teller\n SRCS op_teller.cc\n DEPS framework_proto device_context)" new_content = content.replace(location_str, location_str + append_str) if new_content == content: diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 95c5ecf713112..7e92b6b9b7afc 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -233,6 +233,7 @@ 'test_fused_elemwise_activation_op', 'test_fused_emb_seq_pool_op', 'test_fused_embedding_fc_lstm_op', + 'test_fused_token_prune_op', 'test_fusion_gru_op', 'test_fusion_lstm_op', 'test_fusion_repeated_fc_relu_op', diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index a1826220095b5..34c3d4156a818 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -72,21 +72,114 @@ disable_win_trt_test="^test_trt_convert_conv2d$|\ ^test_trt_convert_matmul$|\ ^test_trt_convert_scale$" -# /*=============Fixed Disabled Windows CUDA11.x MKL(PR-CI-Windows-Inference) unittests=================*/ -# TODO: fix these unittest that is bound to fail -disable_wingpu11_test="^test_autograd_functional_dynamic$|\ -^disable_wingpu_test$" - - # /*==========Fixed Disabled Windows CUDA11.x inference_api_test(PR-CI-Windows-Inference) unittests=============*/ -disable_win_inference_api_test="^trt_quant_int8_yolov3_r50_test$|\ +disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_trt_dynamic_shape_ernie$|\ ^test_trt_dynamic_shape_ernie_fp16_ser_deser$|\ ^lite_resnet50_test$|\ 
^test_trt_dynamic_shape_transformer_prune$|\ ^lite_mul_model_test$|\ ^trt_split_converter_test$|\ -^paddle_infer_api_copy_tensor_tester$" +^paddle_infer_api_copy_tensor_tester$|\ +^test_trt_deformable_conv$|\ +^test_imperative_triple_grad$|\ +^test_full_name_usage$|\ +^test_trt_convert_unary$|\ +^test_eigh_op$|\ +^test_fc_op$|\ +^test_stack_op$|\ +^trt_split_converter_test$|\ +^paddle_infer_api_copy_tensor_tester$|\ +^test_var_base$|\ +^test_einsum_v2$|\ +^test_tensor_scalar_type_promotion_static$|\ +^test_matrix_power_op$|\ +^test_deformable_conv_v1_op$|\ +^test_where_index$|\ +^test_custom_grad_input$|\ +^test_conv3d_transpose_op$|\ +^test_conv_elementwise_add_act_fuse_pass$|\ +^test_conv_eltwiseadd_bn_fuse_pass$|\ +^test_custom_relu_op_setup$|\ +^test_conv3d_transpose_part2_op$|\ +^test_deform_conv2d$|\ +^test_matmul_op$|\ +^test_basic_api_transformation$|\ +^test_deformable_conv_op$|\ +^test_variable$|\ +^test_mkldnn_conv_hard_sigmoid_fuse_pass$|\ +^test_mkldnn_conv_hard_swish_fuse_pass$|\ +^test_conv_act_mkldnn_fuse_pass$|\ +^test_matmul_scale_fuse_pass$|\ +^test_addmm_op$|\ +^test_inverse_op$|\ +^test_set_value_op$|\ +^test_fused_multihead_matmul_op$|\ +^test_cudnn_bn_add_relu$|\ +^test_cond$|\ +^test_conv_bn_fuse_pass$|\ +^test_graph_khop_sampler$|\ +^test_gru_rnn_op$|\ +^test_masked_select_op$|\ +^test_ir_fc_fuse_pass$|\ +^test_fc_elementwise_layernorm_fuse_pass$|\ +^test_linalg_pinv_op$|\ +^test_math_op_patch_var_base$|\ +^test_slice$|\ +^test_conv_elementwise_add_fuse_pass$|\ +^test_executor_and_mul$|\ +^test_analyzer_int8_resnet50$|\ +^test_analyzer_int8_mobilenetv1$|\ +^test_trt_conv_pass$|\ +^test_roll_op$|\ +^test_lcm$|\ +^test_elementwise_floordiv_op$|\ +^test_autograd_functional_dynamic$|\ +^test_corr$|\ +^test_trt_convert_deformable_conv$|\ +^test_conv_elementwise_add2_act_fuse_pass$|\ +^test_tensor_scalar_type_promotion_dynamic$|\ +^test_model$|\ +^test_py_reader_combination$|\ +^test_trt_convert_flatten$|\ +^test_py_reader_push_pop$|\ +^test_parallel_executor_feed_persistable_var$|\ +^test_parallel_executor_inference_feed_partial_data$|\ +^test_parallel_ssa_graph_inference_feed_partial_data$|\ +^test_reader_reset$|\ +^test_parallel_executor_seresnext_base_gpu$|\ +^test_py_reader_pin_memory$|\ +^test_multiprocess_dataloader_iterable_dataset_dynamic$|\ +^test_multiprocess_dataloader_iterable_dataset_static$|\ +^test_add_reader_dependency$|\ +^test_compat$|\ +^test_decoupled_py_reader$|\ +^test_generator_dataloader$|\ +^test_py_reader_using_executor$|\ +^test_imperative_static_runner_while$|\ +^test_dataloader_keep_order$|\ +^test_dataloader_unkeep_order$|\ +^test_sync_batch_norm_op$|\ +^test_fuse_bn_act_pass$|\ +^test_fuse_bn_add_act_pass$|\ +^test_decoupled_py_reader_data_check$|\ +^test_parallel_dygraph_sync_batch_norm$|\ +^test_dataloader_early_reset$|\ +^test_fleet_base_single$|\ +^test_sequence_pool$|\ +^test_simplify_with_basic_ops_pass_autoscan$|\ +^test_trt_activation_pass$|\ +^test_trt_convert_hard_swish$|\ +^test_trt_convert_leaky_relu$|\ +^test_trt_convert_multihead_matmul$|\ +^test_trt_convert_prelu$|\ +^test_trt_fc_fuse_quant_dequant_pass$|\ +^test_unsqueeze2_eltwise_fuse_pass$|\ +^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\ +^test_parallel_executor_seresnext_with_reduce_gpu$|\ +^test_api_impl$|\ +^test_tensordot$|\ +^disable_wingpu_test$" # /*==========Fixed Disabled Windows CPU OPENBLAS((PR-CI-Windows-OPENBLAS)) unittests==============================*/ @@ -184,7 +277,7 @@ bash $PADDLE_ROOT/tools/check_added_ut_win.sh rm -rf 
$PADDLE_ROOT/tools/check_added_ut_win.sh if [ -f "$PADDLE_ROOT/added_ut" ];then added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$ - ctest -R "(${added_uts})" -E "$disable_wingpu11_test" --output-on-failure -C Release --repeat-until-fail 3;added_ut_error=$? + ctest -R "(${added_uts})" -E "${disable_win_inference_test}" --output-on-failure -C Release --repeat-until-fail 3;added_ut_error=$? rm -f $PADDLE_ROOT/added_ut if [ "$added_ut_error" != 0 ];then echo "========================================" @@ -247,6 +340,11 @@ function run_unittest_gpu() { echo "********These unittests run $parallel_job job each time with 1 GPU**********" echo "************************************************************************" export CUDA_VISIBLE_DEVICES=0 + + if nvcc --version | grep 11.2; then + disable_wingpu_test=${disable_win_inference_test} + fi + tmpfile=$tmp_dir/$RANDOM (ctest -R "$test_case" -E "$disable_ut_quickly|$disable_wingpu_test|$disable_win_trt_test|$long_time_test" -LE "${nightly_label}" --output-on-failure -C Release -j $parallel_job | tee $tmpfile ) & wait; @@ -335,22 +433,22 @@ set +e export FLAGS_call_stack_level=2 -if nvcc --version | grep 11.2; then - echo "Only test added_ut and inference_api_test temporarily when running in CI-Windows-inference of CUDA 11.2." - export CUDA_VISIBLE_DEVICES=0 - tmpfile=$tmp_dir/$RANDOM - inference_api_test=^$(ls "paddle/fluid/inference/tests/api" | sed -n 's/\.exe$//pg' | awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' | sed 's/|\^$//g') - (ctest -R "$inference_api_test" -E "$disable_win_inference_api_test" --output-on-failure -C Release -j 2 | tee $tmpfile ) & - wait; - collect_failed_tests - set -e - rm -f $tmp_dir/* - if [[ "$failed_test_lists" != "" ]]; then - unittests_retry - show_ut_retry_result - fi - exit 0; -fi +# if nvcc --version | grep 11.2; then +# echo "Only test added_ut and inference_api_test temporarily when running in CI-Windows-inference of CUDA 11.2." +# export CUDA_VISIBLE_DEVICES=0 +# tmpfile=$tmp_dir/$RANDOM +# inference_api_test=^$(ls "paddle/fluid/inference/tests/api" | sed -n 's/\.exe$//pg' | awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' | sed 's/|\^$//g') +# (ctest -R "$inference_api_test" -E "$disable_win_inference_api_test" --output-on-failure -C Release -j 2 | tee $tmpfile ) & +# wait; +# collect_failed_tests +# set -e +# rm -f $tmp_dir/* +# if [[ "$failed_test_lists" != "" ]]; then +# unittests_retry +# show_ut_retry_result +# fi +# exit 0; +# fi if [ "${WITH_GPU:-OFF}" == "ON" ];then