diff --git a/.gitignore b/.gitignore index 4faaf162ba52b..b8a2e8fbce933 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,8 @@ paddle/infrt/tests/lit.cfg.py paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc paddle/fluid/pybind/eager_final_state_op_function_impl.h paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h + +# these files (directories) are generated before build system generation +paddle/fluid/operators/generated_op.cc +paddle/phi/ops/compat/generated_sig.cc +python/paddle/utils/code_gen/parsed_apis/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 51c0ef35f1efa..f3ed08d56e6d6 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" # Note(zhouwei): It use option above, so put here include(init) include(generic) # simplify cmake module +include(experimental) # experimental build options if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") @@ -256,8 +257,8 @@ option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) if(WITH_RECORD_BUILDTIME) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") else() include(ccache) # set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache endif() diff --git a/README.md b/README.md index 21e0aba8b48bf..048a273a7d78b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm ## Installation -### Latest PaddlePaddle Release: [v2.2](https://github.com/PaddlePaddle/Paddle/tree/release/2.2) +### Latest PaddlePaddle Release: [v2.3](https://github.com/PaddlePaddle/Paddle/tree/release/2.3) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. 
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 69e66407580b6..43c2208182a55 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -65,11 +65,13 @@ if(NOT DEFINED CBLAS_PROVIDER) PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + find_path(OPENBLAS_CONFIG_INC_DIR NAMES openblas_config.h + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) find_library(OPENBLAS_LIB NAMES openblas PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) - if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) - file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file) + if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_CONFIG_INC_DIR AND OPENBLAS_LIB) + file(READ "${OPENBLAS_CONFIG_INC_DIR}/openblas_config.h" config_file) string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) @@ -138,4 +140,3 @@ if(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) elseif(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) target_link_libraries(cblas ${CBLAS_LIBRARIES}) endif() - diff --git a/cmake/experimental.cmake b/cmake/experimental.cmake new file mode 100644 index 0000000000000..55e7fe263f9dc --- /dev/null +++ b/cmake/experimental.cmake @@ -0,0 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this file contains experimental build options + +include(experiments/cuda_module_loading_lazy) diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake new file mode 100644 index 0000000000000..ef6a51b594b9e --- /dev/null +++ b/cmake/experiments/cuda_module_loading_lazy.cmake @@ -0,0 +1,40 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this file contains experimental build options for lazy cuda module loading +# cuda module lazy loading is supported by CUDA 11.6+ +# this experiment option makes Paddle support lazy loading before CUDA 11.6.
+ +option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF) +if (${EXP_CUDA_MODULE_LOADING_LAZY}) + if (NOT ${ON_INFER} OR NOT ${LINUX}) + message("EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms") + return() + endif () + if (NOT ${CUDA_FOUND}) + message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA") + return() + endif () + if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6") + message("cuda 11.6+ already supports lazy module loading") + return() + endif () + + message("for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a") + set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE) + set(CMAKE_CUDA_FLAGS "--cudart shared") + enable_language(CUDA) + set(CUDA_NVCC_EXECUTABLE "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE) + set(CMAKE_CUDA_COMPILER "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE) +endif() diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index 5029878af6199..d02f47142e775 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -90,9 +90,10 @@ endif() if (WITH_ASCEND_CL) macro(find_ascend_toolkit_version ascend_toolkit_version_info) file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS) - string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") - string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") + string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") + string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9][.a-z0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION}) + STRING(SUBSTRING "${CANN_VERSION}000" 0 6 CANN_VERSION) add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}") if(NOT ASCEND_TOOLKIT_VERSION) set(ASCEND_TOOLKIT_VERSION "???") diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 29625b2b52e18..8f955008fa079 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -19,7 +19,7 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory."
FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 9a35435c18722ff17a48fb60bceac42bfdf78754) +SET(MKLDNN_TAG 9b186765dded79066e0cd9c17eb70b680b76fb8e) # Introduce variables: diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index d5ccf1297922f..43d5002fe3819 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220520") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -17,7 +17,7 @@ endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220425") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220520") else() SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index f9cac0579fec4..0dbd3bc328314 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -142,12 +142,10 @@ set(COMMON_FLAGS -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs - -Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 -Wimplicit-fallthrough=0 # Warning in tinyformat.h - -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 ${fsanitize} ) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 1df9e1497384b..51e4bd3ac41c9 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -261,6 +261,13 @@ function(op_library TARGET) elseif (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) xpu_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${xpu_kp_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() + # deal with CANN version control while registering NPU operators before build + if (WITH_ASCEND_CL) + if (CANN_VERSION LESS 504000) + list(REMOVE_ITEM npu_cc_srcs "multinomial_op_npu.cc") + list(REMOVE_ITEM npu_cc_srcs "take_along_axis_op_npu.cc") + endif() + endif() # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) # Combine the cc source files. 
diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index 166f8786337b1..adf3d74c26220 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -16,6 +16,10 @@ if(NOT WITH_XPU_KP) return() endif() +set(LINK_FLAGS "-Wl,--allow-multiple-definition") +set(CMAKE_EXE_LINKER_FLAGS "${LINK_FLAGS}") +set(CMAKE_SHARED_LINKER_FLAGS "${LINK_FLAGS}") + if(NOT XPU_TOOLCHAIN) set(XPU_TOOLCHAIN /workspace/output/XTDK-ubuntu_x86_64) get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 107a19cb7decc..a92932b4d3247 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -26,7 +26,7 @@ add_custom_command(TARGET ps_framework_proto POST_BUILD COMMAND mv the_one_ps.pb.h ps.pb.h COMMAND mv the_one_ps.pb.cc ps.pb.cc) -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index 31f9b26e732d1..0911a4a3e3e18 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -255,12 +255,6 @@ std::shared_ptr ProcessGroupHeter::Broadcast( std::shared_ptr ProcessGroupHeter::Send( std::vector& in_tensors, int peer) { -#if defined(PADDLE_WITH_NCCL) - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(in_tensors), true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); -#endif - PADDLE_ENFORCE_EQ( in_tensors.size(), 1, platform::errors::PreconditionNotMet( @@ -299,12 +293,6 @@ std::shared_ptr ProcessGroupHeter::Send( std::shared_ptr ProcessGroupHeter::Recv( std::vector& out_tensors, int peer) { -#if defined(PADDLE_WITH_NCCL) - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(out_tensors), true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); -#endif - PADDLE_ENFORCE_EQ( out_tensors.size(), 1, platform::errors::PreconditionNotMet( @@ -343,7 +331,7 @@ std::shared_ptr ProcessGroupHeter::Recv( end = std::chrono::high_resolution_clock::now(); diff = end - start; VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims() - << ") from gpu to cpu for recv " << std::setw(9) + << ") from cpu to gpu for recv " << std::setw(9) << " is: " << diff.count() << " s" << std::endl; return CreateTask(rank_, CommType::RECV, out_tensors); } diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index a7c3e2208ab74..96009ce722905 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -901,6 +901,9 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, dev_ctx->Wait(); + Tensor src_value_tensor(std::make_shared(src->value())); + std::vector 
dst_shape = src_value_tensor.shape(); + if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + size_, [&](int64_t row) { return row == cpu_rows_num_ptr[0]; })) { // During sparse communication, the number of each card is same. @@ -940,8 +943,6 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, &dst_rows_vector); dev_ctx->Wait(); - Tensor src_value_tensor(std::make_shared(src->value())); - std::vector dst_shape = src_value_tensor.shape(); dst_shape[dst_shape.size() - 2] = rows_num; auto dst_dense_tensor = std::dynamic_pointer_cast( paddle::experimental::full(IntArray(dst_shape), 0, @@ -971,8 +972,58 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, *(src->mutable_value()) = *(std::dynamic_pointer_cast(dst_value_tensor.impl())); } else { - PADDLE_THROW( - platform::errors::Unimplemented("This case is not supported.")); + std::vector rows_tensors; + std::vector values_tensors; + + for (int i = 0; i < size_; ++i) { + std::vector value_tensor_shape = { + cpu_rows_num_ptr[i], dst_shape[dst_shape.size() - 1]}; + Tensor rows_tensor = paddle::experimental::full( + IntArray({static_cast(cpu_rows_num_ptr[i])}), 0, + DataType::INT64, inner_place_); + Tensor values_tensor = paddle::experimental::full( + IntArray(value_tensor_shape), 0, src->value().dtype(), inner_place_); + std::vector rows_dense_vector; + std::vector values_dense_vector; + + if (i == rank_) { + auto *rows_dense_tensor = + std::dynamic_pointer_cast(rows_tensor.impl()) + .get(); + framework::TensorFromVector(src_rows, *dev_ctx, + rows_dense_tensor); + values_tensor.set_impl( + std::make_shared(src->value())); + } + rows_dense_vector.push_back( + *std::dynamic_pointer_cast(rows_tensor.impl())); + values_dense_vector.push_back( + *std::dynamic_pointer_cast(values_tensor.impl())); + + auto b_opts = BroadcastOptions(); + b_opts.source_rank = i; + process_group_->Broadcast(rows_dense_vector, rows_dense_vector, b_opts); + process_group_ + ->Broadcast(values_dense_vector, values_dense_vector, b_opts) + ->Wait(); + rows_tensors.push_back(rows_tensor); + values_tensors.push_back(values_tensor); + } + + Tensor dst_rows_tensor = + paddle::experimental::concat(rows_tensors, phi::Scalar(0)); + framework::Vector dst_rows_vector(rows_num, 0); + auto *dst_rows_dense_tensor = + std::dynamic_pointer_cast(dst_rows_tensor.impl()) + .get(); + framework::TensorToVector(*dst_rows_dense_tensor, *dev_ctx, + &dst_rows_vector); + src->set_rows(dst_rows_vector); + + Tensor dst_values_tensor = + paddle::experimental::concat(values_tensors, phi::Scalar(0)); + *(src->mutable_value()) = *( + std::dynamic_pointer_cast(dst_values_tensor.impl())); } } diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index cacd55e02a5e2..d8f937e218be4 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -546,9 +546,9 @@ bool DistModel::Run(const std::vector &input_data, DistModelTimer timer; timer.tic(); - double feed_elapse; - double fleet_exe_elapse; - double fetch_elapse; + double feed_elapse = 0; + double fleet_exe_elapse = 0; + double fetch_elapse = 0; if (!FeedData(input_data, scope_.get())) { LOG(ERROR) << "DistModel failed at feeding data."; diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index 78673184eb23b..0959b651bb558 100755 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc 
@@ -261,7 +261,7 @@ int DownpourBrpcClosure::check_response(size_t request_idx, int cmd_id) { } int DownpourBrpcClosure::check_save_response(size_t request_idx, int cmd_id) { - uint32_t feasign_size = 0; + int32_t feasign_size = 0; if (_cntls[request_idx]->Failed()) { LOG(ERROR) << "resquest cmd_id:" << cmd_id << " failed, " "err:" diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc index d0bf06d49504a..8167c37b59987 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc @@ -301,11 +301,6 @@ int32_t BrpcPsService::PullDense(Table *table, const PsRequestMessage &request, } CostTimer timer("pserver_server_pull_dense"); uint32_t num = *(const uint32_t *)request.params(0).c_str(); - if (num < 0) { - set_response_code(response, -1, - "PsRequestMessage.datas[0] is invalid, num must >= 0"); - return 0; - } auto res_data = butil::get_object>(); res_data->resize(num * table->ValueAccesor()->GetAccessorInfo().select_size / diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h index 36bafc943701f..efaa48470a8bd 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -171,19 +171,16 @@ class HeterClient { // switch client singleton static std::shared_ptr GetSwitchInstance( const std::vector& peer_endpoints, int32_t peer_role) { + std::unique_lock lock(mtx_); + if (peer_endpoints.empty()) { + VLOG(4) << "init switch client failed, null peer_endpoints"; + } + VLOG(4) << "peer role is: " << peer_role + << ", addr is: " << peer_endpoints[0]; if (switch_s_instance_ == nullptr) { - std::unique_lock lock(mtx_); - if (peer_endpoints.empty()) { - VLOG(4) << "init switch client failed, null peer_endpoints"; - } - VLOG(4) << "peer role is: " << peer_role - << ", addr is: " << peer_endpoints[0]; - if (switch_s_instance_ == nullptr) { - switch_s_instance_.reset(new HeterClient()); - switch_s_instance_->SetPeerSwitchList(peer_endpoints); - switch_s_instance_->InitClientChannels(false, peer_endpoints, - peer_role); - } + switch_s_instance_.reset(new HeterClient()); + switch_s_instance_->SetPeerSwitchList(peer_endpoints); + switch_s_instance_->InitClientChannels(false, peer_endpoints, peer_role); } return switch_s_instance_; } diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc index 0753a6799c1be..fd38a030ff366 100755 --- a/paddle/fluid/distributed/ps/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -125,6 +125,9 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard( brpc::Controller* cntl) { VLOG(4) << "entering SaveInSwitchWithShard"; int32_t group_id = request->group_id(); + if (group_id >= FLAGS_heter_world_size) { + LOG(ERROR) << "group id exceed maxmium"; + } auto& local_shard = _local_shards[group_id]; auto& request_io_buffer = cntl->request_attachment(); butil::IOBufBytesIterator io_buffer_itr(request_io_buffer); @@ -132,11 +135,11 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard( const auto& var_name = request->send_var_names(idx); const auto& var_size = request->vars_len(idx); WaitForVarsConsumed(group_id, var_name); + std::unique_lock lk(scope_mutex_); auto& value = local_shard[var_name]; value.resize(var_size); io_buffer_itr.copy_and_forward(reinterpret_cast(value.data()), var_size); - std::unique_lock lk(scope_mutex_); 
vars_ready_flag[group_id][var_name] = 1; VLOG(4) << "saved var_name: " << var_name << "is saved ready!"; } @@ -162,11 +165,11 @@ int SendAndRecvVariableHandler::QueryInSwitchWithShard( VLOG(4) << "req var name: " << req_var_name; response->add_send_var_names(req_var_name); WaitForVarsProduced(group_id, req_var_name); + std::unique_lock lk(scope_mutex_); auto itr = local_shard.find(req_var_name); auto& value = itr.value(); response_io_buffer.append(value.data(), value.size()); value.resize(0); // 清空内存 - std::unique_lock lk(scope_mutex_); vars_ready_flag[group_id][req_var_name] = 0; VLOG(4) << "query var_name: " << req_var_name << "is consumed ready!"; } diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 0d3d23be4e8d1..926bb7e7c9fd3 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -109,7 +109,7 @@ class PSClient { size_t table_id) = 0; // 保留 // firstly push dense param for parameter server - // this is neccessary because dense weight initialized in trainer on cold + // this is necessary because dense weight initialized in trainer on cold // start virtual std::future PushDenseParam(const Region *regions, size_t region_num, diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index f2b9eb71f5a64..b8eff940a0dca 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -35,12 +35,13 @@ set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRI set_source_files_properties(ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ctr_dymf_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto) -cc_library(ctr_accessor SRCS ctr_accessor.cc ctr_double_accessor.cc sparse_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) +cc_library(ctr_accessor SRCS ctr_accessor.cc ctr_double_accessor.cc sparse_accessor.cc ctr_dymf_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) cc_library(sparse_table SRCS memory_sparse_table.cc ssd_sparse_table.cc memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table rocksdb) cc_library(table SRCS table.cc DEPS sparse_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index a3fa80b3865e4..43dee275a3dc6 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -80,7 +80,7 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( } for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); 
paddle::framework::GpuPsCommGraph res; - unsigned int tot_len = 0; + int64_t tot_len = 0; for (int i = 0; i < task_pool_size_; i++) { tot_len += edge_array[i].size(); } @@ -88,8 +88,8 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( // res.node_size = ids.size(); // res.neighbor_list = new int64_t[tot_len]; // res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()]; - res.init_on_cpu(tot_len, (unsigned int)ids.size()); - unsigned int offset = 0, ind = 0; + res.init_on_cpu(tot_len, ids.size()); + int64_t offset = 0, ind = 0; for (int i = 0; i < task_pool_size_; i++) { for (int j = 0; j < (int)node_array[i].size(); j++) { res.node_list[ind] = node_array[i][j]; @@ -126,8 +126,8 @@ int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, _db->put(src_id % shard_num % task_pool_size_, ch, sizeof(int) * 2 + sizeof(int64_t), (char *)data, len); } - _db->flush(src_id % shard_num % task_pool_size_); - std::string x; + // _db->flush(src_id % shard_num % task_pool_size_); + // std::string x; // if (_db->get(src_id % shard_num % task_pool_size_, ch, sizeof(int64_t) + // 2 * sizeof(int), x) ==0){ // VLOG(0)<<"put result"; @@ -135,6 +135,18 @@ int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, // VLOG(0)<<"get an id "<<*((int64_t *)(x.c_str() + i)); // } //} + // if(src_id == 429){ + // str = ""; + // _db->get(src_id % shard_num % task_pool_size_, ch, + // sizeof(int) * 2 + sizeof(int64_t), str); + // int64_t *stored_data = ((int64_t *)str.c_str()); + // int n = str.size() / sizeof(int64_t); + // VLOG(0)<<"429 has "<Next(); next++; } @@ -311,6 +342,38 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { next_partition = 0; } +void GraphTable::export_partition_files(int idx, std::string file_path) { + int part_len = partitions[idx].size(); + if (part_len == 0) return; + if (file_path == "") file_path = "."; + if (file_path[(int)file_path.size() - 1] != '/') { + file_path += "/"; + } + std::vector> tasks; + for (int i = 0; i < part_len; i++) { + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [&, i, idx, this]() -> int { + + std::string output_path = + file_path + "partition_" + std::to_string(i); + + std::ofstream ofs(output_path); + if (ofs.fail()) { + VLOG(0) << "creating " << output_path << " failed"; + return 0; + } + for (auto x : partitions[idx][i]) { + auto str = std::to_string(x); + ofs.write(str.c_str(), str.size()); + ofs.write("\n", 1); + } + ofs.close(); + return 0; + })); + } + + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); +} void GraphTable::clear_graph(int idx) { for (auto p : edge_shards[idx]) { delete p; @@ -376,7 +439,7 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path, } int32_t GraphTable::dump_edges_to_ssd(int idx) { - VLOG(0) << "calling dump edges to ssd"; + VLOG(2) << "calling dump edges to ssd"; const int64_t fixed_size = 10000; // std::vector edge_array[task_pool_size_]; std::vector> count(task_pool_size_); @@ -387,9 +450,9 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { [&, i, this]() -> int64_t { int64_t cost = 0; std::vector &v = shards[i]->get_bucket(); - std::vector s; size_t ind = i % this->task_pool_size_; for (size_t j = 0; j < v.size(); j++) { + std::vector s; for (int k = 0; k < v[j]->get_neighbor_size(); k++) { s.push_back(v[j]->get_neighbor_id(k)); } @@ -405,7 +468,7 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { } int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { VLOG(0) << 
"make_complementary_graph"; - const int64_t fixed_size = 10000; + const int64_t fixed_size = byte_size / 8; // std::vector edge_array[task_pool_size_]; std::vector> count(task_pool_size_); std::vector> tasks; @@ -416,7 +479,7 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { std::vector &v = shards[i]->get_bucket(); size_t ind = i % this->task_pool_size_; for (size_t j = 0; j < v.size(); j++) { - size_t location = v[j]->get_id(); + // size_t location = v[j]->get_id(); for (int k = 0; k < v[j]->get_neighbor_size(); k++) { count[ind][v[j]->get_neighbor_id(k)]++; } @@ -424,19 +487,12 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { return 0; })); } - + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); std::unordered_map final_count; std::map> count_to_id; std::vector buffer; - for (auto p : edge_shards[idx]) { - delete p; - } + clear_graph(idx); - edge_shards[idx].clear(); - for (size_t i = 0; i < shard_num_per_server; i++) { - edge_shards[idx].push_back(new GraphShard()); - } - for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); for (int i = 0; i < task_pool_size_; i++) { for (auto &p : count[i]) { final_count[p.first] = final_count[p.first] + p.second; @@ -447,13 +503,13 @@ int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { count_to_id[p.second].push_back(p.first); VLOG(2) << p.first << " appear " << p.second << " times"; } - // std::map>::iterator iter= count_to_id.rbegin(); auto iter = count_to_id.rbegin(); while (iter != count_to_id.rend() && byte_size > 0) { for (auto x : iter->second) { buffer.push_back(x); if (buffer.size() >= fixed_size) { int64_t res = load_graph_to_memory_from_ssd(idx, buffer); + buffer.clear(); byte_size -= res; } if (byte_size <= 0) break; @@ -1265,13 +1321,14 @@ int32_t GraphTable::random_sample_neighbors( if (node == nullptr) { #ifdef PADDLE_WITH_HETERPS if (search_level == 2) { - VLOG(2) << "enter sample from ssd"; + VLOG(2) << "enter sample from ssd for node_id " << node_id; char *buffer_addr = random_sample_neighbor_from_ssd( idx, node_id, sample_size, rng, actual_size); if (actual_size != 0) { - std::shared_ptr &buffer = buffers[idx]; + std::shared_ptr &buffer = buffers[idy]; buffer.reset(buffer_addr, char_del); } + VLOG(2) << "actual sampled size from ssd = " << actual_sizes[idy]; continue; } #endif @@ -1431,7 +1488,7 @@ std::vector> GraphTable::get_all_id(int type_id, int idx, } for (size_t i = 0; i < tasks.size(); i++) { auto ids = tasks[i].get(); - for (auto &id : ids) res[id % slice_num].push_back(id); + for (auto &id : ids) res[(uint64_t)(id) % slice_num].push_back(id); } return res; } @@ -1506,8 +1563,27 @@ int32_t GraphTable::Initialize(const TableParameter &config, LOG(INFO) << "in graphTable initialize over"; return Initialize(graph); } + +void GraphTable::load_node_weight(int type_id, int idx, std::string path) { + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + auto &weight_map = node_weight[type_id][idx]; + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + count++; + if (values.size() < 2) continue; + auto src_id = std::stoull(values[0]); + double weight = std::stod(values[1]); + weight_map[src_id] = weight; + } + } +} int32_t GraphTable::Initialize(const GraphParameter &graph) { task_pool_size_ = graph.task_pool_size(); + #ifdef PADDLE_WITH_HETERPS _db = NULL; search_level = graph.search_level(); @@ 
-1593,6 +1669,8 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " << shard_start << " shard_end " << shard_end; edge_shards.resize(id_to_edge.size()); + node_weight.resize(2); + node_weight[0].resize(id_to_edge.size()); #ifdef PADDLE_WITH_HETERPS partitions.resize(id_to_edge.size()); #endif @@ -1601,6 +1679,7 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { edge_shards[k].push_back(new GraphShard()); } } + node_weight[1].resize(id_to_feature.size()); feature_shards.resize(id_to_feature.size()); for (int k = 0; k < (int)feature_shards.size(); k++) { for (size_t i = 0; i < shard_num_per_server; i++) { diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 2d869dc805a94..25bec5276e729 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -537,6 +537,7 @@ class GraphTable : public Table { } return 0; } + virtual void load_node_weight(int type_id, int idx, std::string path); #ifdef PADDLE_WITH_HETERPS // virtual int32_t start_graph_sampling() { // return this->graph_sampler->start_graph_sampling(); @@ -551,6 +552,7 @@ class GraphTable : public Table { // return 0; // } virtual void make_partitions(int idx, int64_t gb_size, int device_len); + virtual void export_partition_files(int idx, std::string file_path); virtual char *random_sample_neighbor_from_ssd( int idx, int64_t id, int sample_size, const std::shared_ptr rng, int &actual_size); @@ -572,7 +574,6 @@ class GraphTable : public Table { const std::string &edge_type); int32_t load_next_partition(int idx); void set_search_level(int search_level) { this->search_level = search_level; } - // virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } int search_level; int64_t total_memory_cost; std::vector>> partitions; @@ -585,6 +586,7 @@ class GraphTable : public Table { int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; + std::vector>> node_weight; std::vector> feat_name; std::vector> feat_dtype; std::vector> feat_shape; diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc new file mode 100644 index 0000000000000..68f28640fc69e --- /dev/null +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -0,0 +1,316 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include +#include "glog/logging.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace distributed { + +int CtrDymfAccessor::Initialize() { + auto name = _config.embed_sgd_param().name(); + _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); + _embed_sgd_rule->LoadConfig(_config.embed_sgd_param(), 1); + + name = _config.embedx_sgd_param().name(); + _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); + _embedx_sgd_rule->LoadConfig(_config.embedx_sgd_param(), + _config.embedx_dim()); + + common_feature_value.embed_sgd_dim = _embed_sgd_rule->Dim(); + common_feature_value.embedx_dim = _config.embedx_dim(); + common_feature_value.embedx_sgd_dim = _embedx_sgd_rule->Dim(); + _show_click_decay_rate = _config.ctr_accessor_param().show_click_decay_rate(); + _ssd_unseenday_threshold = + _config.ctr_accessor_param().ssd_unseenday_threshold(); + + if (_config.ctr_accessor_param().show_scale()) { + _show_scale = true; + } + VLOG(0) << " INTO CtrDymfAccessor::Initialize()"; + InitAccessorInfo(); + return 0; +} + +void CtrDymfAccessor::InitAccessorInfo() { + _accessor_info.dim = common_feature_value.Dim(); + _accessor_info.size = common_feature_value.Size(); + + auto embedx_dim = _config.embedx_dim(); + VLOG(0) << "InitAccessorInfo embedx_dim:" << embedx_dim; + _accessor_info.select_dim = 3 + embedx_dim; + _accessor_info.select_size = _accessor_info.select_dim * sizeof(float); + _accessor_info.update_dim = 4 + embedx_dim; + _accessor_info.update_size = _accessor_info.update_dim * sizeof(float); + _accessor_info.mf_size = + (embedx_dim + common_feature_value.embedx_sgd_dim) * sizeof(float); +} + +bool CtrDymfAccessor::Shrink(float* value) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delete_after_unseen_days = + _config.ctr_accessor_param().delete_after_unseen_days(); + auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); + + // time_decay first + common_feature_value.Show(value) *= _show_click_decay_rate; + common_feature_value.Click(value) *= _show_click_decay_rate; + + // shrink after + auto score = ShowClickScore(common_feature_value.Show(value), + common_feature_value.Click(value)); + auto unseen_days = common_feature_value.UnseenDays(value); + if (score < delete_threshold || unseen_days > delete_after_unseen_days) { + return true; + } + return false; +} + +bool CtrDymfAccessor::SaveCache(float* value, int param, + double global_cache_threshold) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (ShowClickScore(common_feature_value.Show(value), + common_feature_value.Click(value)) >= base_threshold && + common_feature_value.UnseenDays(value) <= delta_keep_days) { + return common_feature_value.Show(value) > global_cache_threshold; + } + return false; +} + +bool CtrDymfAccessor::SaveSSD(float* value) { + if (common_feature_value.UnseenDays(value) > _ssd_unseenday_threshold) { + return true; + } + return false; +} + +bool CtrDymfAccessor::Save(float* value, int param) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (param == 2) { + delta_threshold = 0; + } + switch (param) { + // save 
all + case 0: { + return true; + } + // save xbox delta + case 1: + // save xbox base + case 2: { + if (ShowClickScore(common_feature_value.Show(value), + common_feature_value.Click(value)) >= base_threshold && + common_feature_value.DeltaScore(value) >= delta_threshold && + common_feature_value.UnseenDays(value) <= delta_keep_days) { + // do this after save, because it must not be modified when retry + if (param == 2) { + common_feature_value.DeltaScore(value) = 0; + } + return true; + } else { + return false; + } + } + // already decayed in shrink + case 3: { + // do this after save, because it must not be modified when retry + // common_feature_value.UnseenDays(value)++; + return true; + } + // save revert batch_model + case 5: { + return true; + } + default: + return true; + } +} + +void CtrDymfAccessor::UpdateStatAfterSave(float* value, int param) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (param == 2) { + delta_threshold = 0; + } + switch (param) { + case 1: { + if (ShowClickScore(common_feature_value.Show(value), + common_feature_value.Click(value)) >= base_threshold && + common_feature_value.DeltaScore(value) >= delta_threshold && + common_feature_value.UnseenDays(value) <= delta_keep_days) { + common_feature_value.DeltaScore(value) = 0; + } + } + return; + case 3: { + common_feature_value.UnseenDays(value)++; + } + return; + default: + return; + } +} + +int32_t CtrDymfAccessor::Create(float** values, size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* value = values[value_item]; + value[common_feature_value.UnseenDaysIndex()] = 0; + value[common_feature_value.DeltaScoreIndex()] = 0; + value[common_feature_value.ShowIndex()] = 0; + value[common_feature_value.ClickIndex()] = 0; + value[common_feature_value.SlotIndex()] = -1; + value[common_feature_value.MfDimIndex()] = -1; + _embed_sgd_rule->InitValue(value + common_feature_value.EmbedWIndex(), + value + common_feature_value.EmbedG2SumIndex()); + _embedx_sgd_rule->InitValue(value + common_feature_value.EmbedxWIndex(), + value + common_feature_value.EmbedxG2SumIndex(), + false); + } + return 0; +} + +bool CtrDymfAccessor::NeedExtendMF(float* value) { + float show = value[common_feature_value.ShowIndex()]; + float click = value[common_feature_value.ClickIndex()]; + float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + + click * _config.ctr_accessor_param().click_coeff(); + return score >= _config.embedx_threshold(); +} + +bool CtrDymfAccessor::HasMF(size_t size) { + return size > common_feature_value.EmbedxG2SumIndex(); +} + +// from CommonFeatureValue to CtrDymfPullValue +int32_t CtrDymfAccessor::Select(float** select_values, const float** values, + size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* select_value = select_values[value_item]; + const float* value = values[value_item]; + select_value[CtrDymfPullValue::ShowIndex()] = + value[common_feature_value.ShowIndex()]; + select_value[CtrDymfPullValue::ClickIndex()] = + value[common_feature_value.ClickIndex()]; + select_value[CtrDymfPullValue::EmbedWIndex()] = + value[common_feature_value.EmbedWIndex()]; + memcpy(select_value + CtrDymfPullValue::EmbedxWIndex(), + value + common_feature_value.EmbedxWIndex(), + embedx_dim * 
sizeof(float)); + } + return 0; +} + +// from CtrDymfPushValue to CtrDymfPushValue +// first dim: item +// second dim: field num +int32_t CtrDymfAccessor::Merge(float** update_values, + const float** other_update_values, size_t num) { + // currently merge in cpu is not supported + return 0; +} + +// from CtrDymfPushValue to CommonFeatureValue +// first dim: item +// second dim: field num +int32_t CtrDymfAccessor::Update(float** update_values, + const float** push_values, size_t num) { + // currently update in cpu is not supported + return 0; +} + +bool CtrDymfAccessor::CreateValue(int stage, const float* value) { + // stage == 0, pull + // stage == 1, push + if (stage == 0) { + return true; + } else if (stage == 1) { + // operation + auto show = CtrDymfPushValue::Show(const_cast(value)); + auto click = CtrDymfPushValue::Click(const_cast(value)); + auto score = ShowClickScore(show, click); + if (score <= 0) { + return false; + } + if (score >= 1) { + return true; + } + return local_uniform_real_distribution()(local_random_engine()) < + score; + } else { + return true; + } +} + +float CtrDymfAccessor::ShowClickScore(float show, float click) { + auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff(); + auto click_coeff = _config.ctr_accessor_param().click_coeff(); + return (show - click) * nonclk_coeff + click * click_coeff; +} + +std::string CtrDymfAccessor::ParseToString(const float* v, int param) { + /* + float unseen_days; + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; // float embed_g2sum + float slot; + float mf_dim; + std::float embedx_g2sum; // float embedx_g2sum + std::vector embedx_w; + */ + thread_local std::ostringstream os; + os.clear(); + os.str(""); + os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4]; + // << v[5] << " " << v[6]; + for (int i = common_feature_value.EmbedG2SumIndex(); + i < common_feature_value.EmbedxWIndex(); i++) { + os << " " << v[i]; + } + os << " " << common_feature_value.Slot(const_cast(v)) << " " + << common_feature_value.MfDim(const_cast(v)); + auto show = common_feature_value.Show(const_cast(v)); + auto click = common_feature_value.Click(const_cast(v)); + auto score = ShowClickScore(show, click); + if (score >= _config.embedx_threshold() && + param > common_feature_value.EmbedxG2SumIndex()) { + VLOG(0) << "common_feature_value.EmbedxG2SumIndex():" + << common_feature_value.EmbedxG2SumIndex(); + for (auto i = common_feature_value.EmbedxG2SumIndex(); + i < common_feature_value.Dim(); ++i) { + os << " " << v[i]; + } + } + return os.str(); +} + +int CtrDymfAccessor::ParseFromString(const std::string& str, float* value) { + auto ret = paddle::string::str_to_float(str.data(), value); + CHECK(ret >= 7) << "expect more than 7 real:" << ret; + return ret; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h new file mode 100644 index 0000000000000..6a9f5d28f5e59 --- /dev/null +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -0,0 +1,217 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" + +namespace paddle { +namespace distributed { + +// DownpourUnitAccessor +class CtrDymfAccessor : public ValueAccessor { + public: + struct CtrDymfFeatureValue { + /* + float unseen_days; + float delta_score; + float show; + float click; + float embed_w; + // float embed_g2sum; + std::vector embed_g2sum; + float slot; + float mf_dim + std::float embedx_g2sum; + // float embedx_g2sum; + std::vector embedx_w; + */ + + int Dim() { return 7 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } + int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } + int Size() { return Dim() * sizeof(float); } + int UnseenDaysIndex() { return 0; } + int DeltaScoreIndex() { return UnseenDaysIndex() + 1; } + int ShowIndex() { return DeltaScoreIndex() + 1; } + int ClickIndex() { return ShowIndex() + 1; } + int EmbedWIndex() { return ClickIndex() + 1; } + int EmbedG2SumIndex() { return EmbedWIndex() + 1; } + int SlotIndex() { return EmbedG2SumIndex() + 1; } + int MfDimIndex() { return SlotIndex() + 1; } + int EmbedxG2SumIndex() { return MfDimIndex() + 1; } + int EmbedxWIndex() { return EmbedxG2SumIndex() + 1; } + + float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; } + float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } + float& Show(float* val) { return val[ShowIndex()]; } + float& Click(float* val) { return val[ClickIndex()]; } + float& Slot(float* val) { return val[SlotIndex()]; } + float& MfDim(float* val) { return val[MfDimIndex()]; } + float& EmbedW(float* val) { return val[EmbedWIndex()]; } + float& EmbedG2Sum(float* val) { return val[EmbedG2SumIndex()]; } + float& EmbedxG2Sum(float* val) { return val[EmbedxG2SumIndex()]; } + float& EmbedxW(float* val) { return val[EmbedxWIndex()]; } + + int embed_sgd_dim; + int embedx_dim; + int embedx_sgd_dim; + }; + + struct CtrDymfPushValue { + /* + float slot; + float show; + float click; + float mf_dim; + float embed_g; + std::vector embedx_g; + */ + + static int Dim(int embedx_dim) { return 5 + embedx_dim; } + + static int DimSize(int dim, int embedx_dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int SlotIndex() { return 0; } + static int ShowIndex() { return CtrDymfPushValue::SlotIndex() + 1; } + static int ClickIndex() { return CtrDymfPushValue::ShowIndex() + 1; } + static int MfDimIndex() { return CtrDymfPushValue::ClickIndex() + 1; } + static int EmbedGIndex() { return CtrDymfPushValue::MfDimIndex() + 1; } + static int EmbedxGIndex() { return CtrDymfPushValue::EmbedGIndex() + 1; } + static float& Slot(float* val) { + return val[CtrDymfPushValue::SlotIndex()]; + } + static float& Show(float* val) { + return val[CtrDymfPushValue::ShowIndex()]; + } + static float& Click(float* val) { + return val[CtrDymfPushValue::ClickIndex()]; + } + static float& MfDim(float* val) { + return 
val[CtrDymfPushValue::MfDimIndex()]; + } + static float& EmbedG(float* val) { + return val[CtrDymfPushValue::EmbedGIndex()]; + } + static float* EmbedxG(float* val) { + return val + CtrDymfPushValue::EmbedxGIndex(); + } + }; + + struct CtrDymfPullValue { + /* + float show; + float click; + float mf_dim; + float embed_w; + std::vector embedx_w; + */ + + static int Dim(int embedx_dim) { return 4 + embedx_dim; } + static int DimSize(size_t dim) { return sizeof(float); } + static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + static int ShowIndex() { return 0; } + static int ClickIndex() { return 1; } + static int MfDimIndex() { return 2; } + static int EmbedWIndex() { return 3; } + static int EmbedxWIndex() { return 4; } + static float& Show(float* val) { + return val[CtrDymfPullValue::ShowIndex()]; + } + static float& Click(float* val) { + return val[CtrDymfPullValue::ClickIndex()]; + } + static float& MfDim(float* val) { + return val[CtrDymfPullValue::MfDimIndex()]; + } + static float& EmbedW(float* val) { + return val[CtrDymfPullValue::EmbedWIndex()]; + } + static float* EmbedxW(float* val) { + return val + CtrDymfPullValue::EmbedxWIndex(); + } + }; + CtrDymfAccessor() {} + virtual ~CtrDymfAccessor() {} + virtual int Initialize(); + // initialize AccessorInfo + virtual void InitAccessorInfo(); + // determine whether this value should be shrunk + virtual bool Shrink(float* value); + // determine whether this value should be saved to ssd + // virtual bool save_ssd(float* value); + virtual bool NeedExtendMF(float* value); + virtual bool HasMF(size_t size); + // determine whether this value should be dumped in the save stage, + // param identifies the save stage, e.g. downpour's xbox and batch_model + // param = 0, save all feature + // param = 1, save delta feature + // param = 2, save xbox base feature + bool Save(float* value, int param) override; + bool SaveCache(float* value, int param, + double global_cache_threshold) override; + bool SaveSSD(float* value) override; + // update delta_score and unseen_days after save + void UpdateStatAfterSave(float* value, int param) override; + // when keys do not exist, generate random values for them + // the memory of value must be allocated by the external caller + virtual int32_t Create(float** value, size_t num); + // select from values into select_values + virtual int32_t Select(float** select_values, const float** values, + size_t num); + // merge update_values together + virtual int32_t Merge(float** update_values, + const float** other_update_values, size_t num); + // merge update_values together, using it.next to decide whether to move to the next key + // virtual int32_t Merge(float** update_values, iterator it); + // apply update_values to values + virtual int32_t Update(float** values, const float** update_values, + size_t num); + + std::string ParseToString(const float* value, int param) override; + int32_t ParseFromString(const std::string& str, float* v) override; + virtual bool CreateValue(int type, const float* value); + + // currently this interface is only used to get show + float GetField(float* value, const std::string& name) override { + // CHECK(name == "show"); + if (name == "show") { + return common_feature_value.Show(value); + } + return 0.0; + } + + private: + // float ShowClickScore(float show, float click); + + // SparseValueSGDRule* _embed_sgd_rule; + // SparseValueSGDRule* _embedx_sgd_rule; + // CtrDymfFeatureValue common_feature_value; + float _show_click_decay_rate; + int32_t _ssd_unseenday_threshold; + bool _show_scale = false; + + public: // TODO(zhaocaibei123): it should be private, but we make it public + // for unit test + CtrDymfFeatureValue common_feature_value; + float ShowClickScore(float show, float click); + SparseValueSGDRule* _embed_sgd_rule; + SparseValueSGDRule* _embedx_sgd_rule; +}; +}
// namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index 5eb38d9c400b0..ef2eb3a746f66 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/distributed/ps/table/ctr_accessor.h" #include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" #include "paddle/fluid/distributed/ps/table/sparse_accessor.h" @@ -40,9 +41,11 @@ REGISTER_PSCORE_CLASS(Table, GlobalStepTable); REGISTER_PSCORE_CLASS(Table, MemorySparseTable); REGISTER_PSCORE_CLASS(Table, SSDSparseTable); REGISTER_PSCORE_CLASS(Table, MemorySparseGeoTable); + REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor); REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor); REGISTER_PSCORE_CLASS(ValueAccessor, CtrDoubleAccessor); +REGISTER_PSCORE_CLASS(ValueAccessor, CtrDymfAccessor); REGISTER_PSCORE_CLASS(ValueAccessor, SparseAccessor); REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index b0d5add49565f..ec6f0e26a08fa 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -19,21 +19,25 @@ #include "paddle/fluid/distributed/store/tcp_store.h" #include "paddle/fluid/distributed/store/tcp_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/flags.h" namespace paddle { namespace distributed { namespace detail { -constexpr int INFTIME = -1; +constexpr int INFTIME = 10000; // 10 seconds -std::unique_ptr MasterDaemon::start(SocketType socket, - int nranks) { - return std::make_unique(socket, nranks); +std::unique_ptr MasterDaemon::start(SocketType socket, int nranks, + int stop_check_timeout) { + return std::make_unique(socket, nranks, stop_check_timeout); } -MasterDaemon::MasterDaemon(SocketType socket, int nranks) - : _listen_socket(socket), _nranks(nranks) { +MasterDaemon::MasterDaemon(SocketType socket, int nranks, + int stop_check_timeout) + : _listen_socket(socket), + _nranks(nranks), + _stop_check_timeout(stop_check_timeout) { _background_thread = std::thread{&MasterDaemon::run, this}; } @@ -86,6 +90,10 @@ void MasterDaemon::_do_get(SocketType socket) { void MasterDaemon::_do_stop(SocketType socket) { VLOG(3) << "MasterDaemon::_do_stop"; + if (!_has_stop) { + _stop_time = std::chrono::system_clock::now(); + } + _has_stop = true; ReplyType value = ReplyType::STOP_WAIT; tcputils::send_value(socket, value); if (--_nranks == 0) { @@ -115,6 +123,20 @@ void MasterDaemon::run() { #endif while (!_stop) { + auto end_time = std::chrono::system_clock::now(); + if (_has_stop) { + std::chrono::duration diff = end_time - _stop_time; + int elapsed_seconds = static_cast(diff.count()); + PADDLE_ENFORCE_LT( + elapsed_seconds, _stop_check_timeout, + platform::errors::Fatal( + "%d seconds elapsed after the first worker " + "stopped, so we think there may be something wrong and will " + "stop the master worker. You can use " + "'export FLAGS_stop_check_timeout=3600'" + " to change the timeout value in seconds. 
The default one is 900", + elapsed_seconds)); + } for (size_t i = 0; i < fds.size(); i++) { fds[i].revents = 0; } @@ -173,10 +195,12 @@ void MasterDaemon::run() { } } -std::unique_ptr TCPServer::create(uint16_t port, int nranks) { +std::unique_ptr TCPServer::create(uint16_t port, int nranks, + int stop_check_timeout) { int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); auto server = std::make_unique(); - server->_master_daemon = MasterDaemon::start(socket, nranks); + server->_master_daemon = + MasterDaemon::start(socket, nranks, stop_check_timeout); return server; } @@ -219,10 +243,11 @@ std::vector TCPClient::receive_vector() { } // namespace detail TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, - size_t num_workers, std::chrono::seconds timeout) + size_t num_workers, std::chrono::seconds timeout, + int stop_check_timeout) : Store(timeout), _is_master(is_master), _num_workers(num_workers) { if (_is_master) { - _server = detail::TCPServer::create(port, num_workers); + _server = detail::TCPServer::create(port, num_workers, stop_check_timeout); } _client = detail::TCPClient::connect(host, port); diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h index 17c1d8ea30a42..4ca9a673bf575 100644 --- a/paddle/fluid/distributed/store/tcp_store.h +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -34,9 +34,11 @@ namespace detail { class MasterDaemon { public: static std::unique_ptr start(SocketType listen_socket, - int nranks); + int nranks, + int stop_check_timeout); MasterDaemon() = delete; - explicit MasterDaemon(SocketType listen_socket, int nranks); + explicit MasterDaemon(SocketType listen_socket, int nranks, + int stop_check_timeout); ~MasterDaemon(); private: @@ -51,13 +53,17 @@ class MasterDaemon { std::unordered_map> _store; std::thread _background_thread{}; int _nranks; - bool _stop = false; + int _stop_check_timeout; + bool _stop = false; // all workers stopped + std::chrono::time_point _stop_time; + bool _has_stop = false; // at least one worker stopped }; class TCPServer { public: TCPServer() = default; - static std::unique_ptr create(std::uint16_t port, int nranks); + static std::unique_ptr create(std::uint16_t port, int nranks, + int stop_check_timeout); private: std::unique_ptr _master_daemon; @@ -93,7 +99,8 @@ class TCPStore : public Store { static constexpr std::uint16_t kDefaultPort = 6170; explicit TCPStore(std::string host, uint16_t port = kDefaultPort, bool is_master = false, size_t num_workers = 1, - std::chrono::seconds timeout = tcputils::kDefaultTimeout); + std::chrono::seconds timeout = tcputils::kDefaultTimeout, + int stop_check_timeout = 900); ~TCPStore(); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index ff0ff26b9579f..9f339d7ee2c08 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -35,6 +35,9 @@ cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} bo set_source_files_properties(ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost table) +set_source_files_properties(ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(ctr_dymf_accessor_test SRCS ctr_dymf_accessor_test.cc DEPS ${COMMON_DEPS} boost table) + set_source_files_properties(memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS 
${DISTRIBUTE_COMPILE_FLAGS}) cc_test(memory_sparse_table_test SRCS memory_sparse_table_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc index ee893ff01b59e..27b6ddf722b70 100644 --- a/paddle/fluid/distributed/test/ctr_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -196,9 +196,10 @@ TEST(downpour_feature_value_accessor_test, test_update) { ptr[idx + j] = embedx_w[j]; } idx += 8; - for (auto j = 0u; j < 0; ++j) { - ptr[idx + j] = embedx_g2sum[j]; - } + // NaiveSGD has no embedx_g2sum + // for (auto j = 0u; j < 0; ++j) { + // ptr[idx + j] = embedx_g2sum[j]; + // } } }; struct DownpourSparsePushValueTest { diff --git a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc new file mode 100644 index 0000000000000..f6e773a414c7f --- /dev/null +++ b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" + +namespace paddle { +namespace distributed { +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule); + +TableAccessorParameter gen_param() { + TableAccessorParameter param; + param.set_accessor_class("CtrDymfAccessor"); + param.set_fea_dim(11); + param.set_embedx_dim(8); + param.mutable_ctr_accessor_param()->set_nonclk_coeff(0.2); + param.mutable_ctr_accessor_param()->set_click_coeff(1); + param.mutable_ctr_accessor_param()->set_base_threshold(0.5); + param.mutable_ctr_accessor_param()->set_delta_threshold(0.2); + param.mutable_ctr_accessor_param()->set_delta_keep_days(16); + param.mutable_ctr_accessor_param()->set_show_click_decay_rate(0.99); + /* + param.mutable_embed_sgd_param()->set_name("naive"); + auto* naive_param = param.mutable_embed_sgd_param()->mutable_naive(); + naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + */ + param.mutable_embed_sgd_param()->set_name("StdAdaGradSGDRule"); + auto* adagrad_param = param.mutable_embed_sgd_param()->mutable_adagrad(); + adagrad_param->set_learning_rate(0.1); + adagrad_param->set_initial_range(0.3); + adagrad_param->set_initial_g2sum(0.0); + adagrad_param->add_weight_bounds(-10.0); + adagrad_param->add_weight_bounds(10.0); + + param.mutable_embedx_sgd_param()->set_name("SparseNaiveSGDRule"); + auto* naive_param = param.mutable_embedx_sgd_param()->mutable_naive(); + 
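// Note: the embedx slots use SparseNaiveSGDRule here, which keeps no g2sum state
// (the same reason the embedx_g2sum loop is disabled in ctr_accessor_test.cc above).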
naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + + return param; +} + +TEST(downpour_feature_value_accessor_test, test_shrink) { + TableAccessorParameter parameter = gen_param(); + CtrDymfAccessor* acc = new CtrDymfAccessor(); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); + + VLOG(3) << "size of struct: " << acc->common_feature_value.embed_sgd_dim + << " " << acc->common_feature_value.embedx_dim << " " + << acc->common_feature_value.embedx_sgd_dim << " " + << acc->common_feature_value.Dim() << "\n"; + + float* value = new float[acc->GetAccessorInfo().dim]; + for (auto i = 0u; i < acc->GetAccessorInfo().dim; ++i) { + value[i] = i * 1.0; + } + ASSERT_TRUE(!acc->Shrink(value)); + + // set unseen_days too long + value[0] = 1000; + // set delta score too small + value[1] = 0.001; + ASSERT_TRUE(acc->Shrink(value)); +} + +TEST(downpour_feature_value_accessor_test, test_save) { + TableAccessorParameter parameter = gen_param(); + CtrDymfAccessor* acc = new CtrDymfAccessor(); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); + + float* value = new float[acc->GetAccessorInfo().dim]; + for (auto i = 0u; i < acc->GetAccessorInfo().dim; ++i) { + value[i] = i * 1.0; + } + + // save all feature + ASSERT_TRUE(acc->Save(value, 0)); + + // save delta feature + ASSERT_TRUE(acc->Save(value, 1)); + + // save base feature with time decay + ASSERT_TRUE(acc->Save(value, 2)); + + VLOG(3) << "test_save:"; + for (auto i = 0u; i < acc->GetAccessorInfo().dim; ++i) { + VLOG(3) << value[i]; + } +} + +TEST(downpour_feature_value_accessor_test, test_create) { + TableAccessorParameter parameter = gen_param(); + CtrDymfAccessor* acc = new CtrDymfAccessor(); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); + + const int field_size = 8 + 8; + const int item_size = 10; + + float** value = new float*[item_size]; + for (auto i = 0u; i < item_size; ++i) { + value[i] = new float[field_size]; + } + ASSERT_EQ(acc->Create(value, item_size), 0); + + for (auto i = 0u; i < item_size; ++i) { + for (auto j = 0u; j < field_size; ++j) { + VLOG(3) << value[i][j] << " "; + // ASSERT_FLOAT_EQ(value[i][j], 0); + } + VLOG(3) << "\n"; + } +} + +TEST(downpour_feature_value_accessor_test, test_show_click_score) { + TableAccessorParameter parameter = gen_param(); + CtrDymfAccessor* acc = new CtrDymfAccessor(); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); + + float show = 10; + float click = 6; + ASSERT_FLOAT_EQ(acc->ShowClickScore(show, click), 6.8); +} + +TEST(downpour_feature_value_accessor_test, test_string_related) { + TableAccessorParameter parameter = gen_param(); + CtrDymfAccessor* acc = new CtrDymfAccessor(); + ASSERT_EQ(acc->Configure(parameter), 0); + ASSERT_EQ(acc->Initialize(), 0); + + const int field_size = 16; + float* value = new float[field_size]; + for (auto i = 0u; i < field_size; ++i) { + value[i] = i; + } + + auto str = acc->ParseToString(value, 0); + + VLOG(0) << "test_string_related" << str << std::endl; + + str = "0 1 2 3 4 5 6 7"; + ASSERT_NE(acc->ParseFromString(str, value), 0); + // make sure init_zero=true +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 53ac895bfbccb..11c98e5da9dde 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps 
phi_api phi_dygraph_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node) +set(eager_deps phi_api phi_dygraph_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta eager_nan_inf_utils grad_node_info grad_tensor_holder accumulation_node custom_operator_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) @@ -18,6 +18,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune) endif() +cc_library(eager_nan_inf_utils SRCS nan_inf_utils.cc DEPS phi_tensor nan_inf_utils enforce) cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 857f1be1f7ae0..2ed44ce489934 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -34,22 +34,9 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, *tensor = t; } else { // Accumulation - PADDLE_ENFORCE_EQ(t.initialized(), true, - paddle::platform::errors::Fatal( - "We can only accumulate initialized tensor, but we " - "got tensor: %s is empty please check you network " - "and make sure it creates grads.", - t.name())); - PADDLE_ENFORCE_NOT_NULL( - tensor, paddle::platform::errors::Fatal( - "We can only accumulate initialized tensor to non-nullptr " - "tensor but we got nullptr please check you network " - "and make sure it creates grads.")); - - if (t.is_dense_tensor()) { - if (tensor->is_dense_tensor()) { + if (LIKELY(t.is_dense_tensor())) { + if (LIKELY(tensor->is_dense_tensor())) { paddle::imperative::TensorAdd(t, tensor); - } else { // TODO(jiabin): Support Other TensorBase later // TODO(zhanlve): Replace SelectedRowsAddTensor with diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 8bd40140f53cc..38f67cb5bdf2a 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -90,9 +90,7 @@ void ScaleAPI(const paddle::experimental::Tensor& x, float scale, float bias, size_t bytes_size = phi::product(dense_tensor->dims()) * SizeOf(dense_tensor->dtype()); auto dense_out = std::make_shared( - phi::make_intrusive( - paddle::memory::Alloc(place, bytes_size)), - std::move(tensor_meta)); + paddle::memory::Alloc(place, bytes_size), std::move(tensor_meta)); // Handle Device Context const paddle::platform::Place& expected_kernel_place = Controller::Instance().GetExpectedPlace(); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 44e78c3bbf193..3c18efea20349 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -17,11 +17,12 @@ #include #include +#include "paddle/fluid/eager/type_defs.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/utils/small_vector.h" + namespace egr { -constexpr size_t kSlotSmallVectorSize 
= 15U; class UniqueNameGenerator { public: explicit UniqueNameGenerator(std::string prefix = "") : prefix_(prefix) {} @@ -77,7 +78,8 @@ class Controller { op_meta_info_map_.insert(map.begin(), map.end()); } - std::unordered_map>>& + std::unordered_map>>>& GetCustomEdgesSlotMap() { return custom_edges_slot_map_; } @@ -89,8 +91,10 @@ class Controller { new paddle::imperative::Tracer()}; std::unordered_map> op_meta_info_map_; - /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ - std::unordered_map>> + /* op_type : {{{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}, + * {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}}*/ + std::unordered_map>>> custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 44fa8461f2fe9..3edd13ccd597f 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1156,28 +1156,20 @@ static std::string GenerateGradNodeCreationContent( for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); - const std::unordered_set& no_need_buffer_ins = - iter.GetNoNeedBufferInputs(); for (auto& kv : grad_ins_fwd_slotname_map) { const std::string& tensor_wrapper_name = kv.second; - std::string full_reserved = "false"; - if (fwd_outputs_name_pos_map.find(tensor_wrapper_name) == - fwd_outputs_name_pos_map.end() && - !no_need_buffer_ins.count(tensor_wrapper_name)) { - full_reserved = "true"; - } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; + " grad_node->SetTensorWrapper%s(%s);\n"; // Replace output directly with input in inplace op. 
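// Illustration only (hypothetical wrapper/input names): the generated call now reads
//   grad_node->SetTensorWrapperX(X);
// and for an inplace op whose wrapped output maps to an input, the input variable is
// passed instead, e.g. grad_node->SetTensorWrapperOut(X); the former third argument
// (full_reserved) no longer appears in the generated code.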
if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { auto inplace_input_name = inplace_map[tensor_wrapper_name]; grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), - LegalizeVarName(inplace_input_name), full_reserved); + LegalizeVarName(inplace_input_name)); } else { grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), - LegalizeVarName(tensor_wrapper_name), full_reserved); + LegalizeVarName(tensor_wrapper_name)); } } } @@ -2592,7 +2584,6 @@ static std::string GenerateGradNodeHeaderContents( std::string tensor_wrapper_arg_str; std::string tensor_wrapper_body_str; - std::string full_reserved_str = "full_reserved"; std::string no_need_buffer_str = "false"; if (no_need_buffer_ins.count(tensor_wrapper_name)) { no_need_buffer_str = "true"; @@ -2610,12 +2601,12 @@ static std::string GenerateGradNodeHeaderContents( const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = "for(const auto& eager_tensor : %s) {\n" - " %s.emplace_back( egr::TensorWrapper(eager_tensor, %s " - "/*full_reserved*/, %s) );\n" + " %s.emplace_back( egr::TensorWrapper(eager_tensor " + ", %s) );\n" " }\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, - struct_tensor_wrapper_name, full_reserved_str, no_need_buffer_str); + struct_tensor_wrapper_name, no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = "for (auto tw: %s) {\n" @@ -2636,22 +2627,20 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/, %s);\n"; + "%s = egr::TensorWrapper(%s, %s);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, - tensor_wrapper_name, full_reserved_str, no_need_buffer_str); + tensor_wrapper_name, no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; clear_tensor_wrappers_str += paddle::string::Sprintf( CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } - std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = - " void SetTensorWrapper%s(%s, %s) {\n %s\n }\n"; + " void SetTensorWrapper%s(%s) {\n %s\n }\n"; set_tensor_wrappers_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_arg_str, full_reserved_signature_str, - tensor_wrapper_body_str); + tensor_wrapper_arg_str, tensor_wrapper_body_str); } } VLOG(6) << "Generated TensorWrapper"; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 8e89ea3f19762..50dab6ce840a5 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml") -set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml") +set(api_yaml_path 
"${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml") +set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml") set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc") set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") @@ -16,9 +16,9 @@ add_custom_target(eager_final_state_codegen COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py" "--api_yaml_path=${api_yaml_path}" "--backward_yaml_path=${backward_yaml_path}" - "--forwards_cc_path=${tmp_forwards_cc_path}" + "--forwards_cc_path=${tmp_forwards_cc_path}" "--forwards_h_path=${tmp_forwards_h_path}" - "--nodes_cc_path=${tmp_nodes_cc_path}" + "--nodes_cc_path=${tmp_nodes_cc_path}" "--nodes_h_path=${tmp_nodes_h_path}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_cc_path} ${forwards_cc_path} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_h_path} ${forwards_h_path} diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 9ad628ef515b1..8467a6d7dfb6a 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -24,10 +24,13 @@ ops_to_fill_zero_for_empty_grads = set([ "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad", - "add_triple_grad", "multiply_double_grad", "multiply_triple_grad", - "conv2d_grad_grad", "batch_norm_double_grad", "tanh_double_grad", - "tanh_triple_grad", "subtract_double_grad", "divide_double_grad", - "log_double_grad", "elu_double_grad", "leaky_relu_double_grad" + "add_triple_grad", "multiply_grad", "multiply_double_grad", + "multiply_triple_grad", "conv2d_grad_grad", "batch_norm_double_grad", + "tanh_double_grad", "tanh_triple_grad", "subtract_double_grad", + "divide_double_grad", "log_double_grad", "elu_double_grad", + "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad", + "square_double_grad", "celu_double_grad", "pad_double_grad", + "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad" ]) # For API dispatch used at python-level @@ -63,22 +66,24 @@ def AssertMessage(lhs_str, rhs_str): def ReadFwdFile(filepath): f = open(filepath, 'r') + # empty file loaded by yaml is None contents = yaml.load(f, Loader=yaml.FullLoader) f.close() - return contents + return contents if contents is not None else [] def ReadBwdFile(filepath): f = open(filepath, 'r') contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} - for content in contents: - assert 'backward_api' in content.keys(), AssertMessage('backward_api', - content.keys()) - if 'backward_api' in content.keys(): - api_name = content['backward_api'] - - ret[api_name] = content + if contents is not None: + for content in contents: + assert 'backward_api' in 
content.keys(), AssertMessage( + 'backward_api', content.keys()) + if 'backward_api' in content.keys(): + api_name = content['backward_api'] + + ret[api_name] = content f.close() return ret @@ -170,7 +175,10 @@ def RecoverBaseNameOfInplaceFunction(function_name): def GetInplacedFunctionName(function_name): - return function_name + "_" + inplace_func_name = function_name + if inplace_func_name[-1] != '_': + inplace_func_name += '_' + return inplace_func_name def GetForwardFunctionName(string): @@ -207,6 +215,8 @@ def ParseYamlArgs(string): assert arg_type in yaml_types_mapping.keys( ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." + if arg_type in ["DataType", "DataLayout"] and default_value is not None: + default_value = f"paddle::experimental::{default_value}" arg_type = yaml_types_mapping[arg_type] arg_name = RemoveSpecialSymbolsInName(arg_name) @@ -302,6 +312,23 @@ def ParseYamlBackward(args_str, returns_str): return inputs_list, attrs_list, returns_list +def ParseYamlInplaceInfo(string): + # inplace_map_str: "(x -> out0), (y -> out2)" + inplace_map = {} + for pair in string.split(","): + pair = pair.strip() + if pair.startswith("("): + pair = pair[1:] + + if pair.endswith(")"): + pair = pair[:-1] + + key = pair.split("->")[0].strip() + val = pair.split("->")[1].strip() + inplace_map[key] = val + return inplace_map + + ######################## ### Generator Base ### ######################## @@ -329,25 +356,14 @@ def __init__(self, forward_api_contents, namespace): self.optional_inputs = [] #[name, ...] self.no_need_buffers = [] #[name, ...] self.intermediate_outputs = [] #[name, ...] - self.inplace_map = {} #{name : name, ...} + self.forward_inplace_map = {} #{name : name, ...} - def ParseInplaceInfo(self): + def ParseForwardInplaceInfo(self): forward_api_contents = self.forward_api_contents if 'inplace' not in forward_api_contents.keys(): return - # inplace_map_str: "(x -> out0), (y -> out2)" inplace_map_str = forward_api_contents['inplace'] - for pair in inplace_map_str.split(","): - pair = pair.strip() - if pair.startswith("("): - pair = pair[1:] - - if pair.endswith(")"): - pair = pair[:-1] - - key = pair.split("->")[0].strip() - val = pair.split("->")[1].strip() - self.inplace_map[key] = val + self.forward_inplace_map = ParseYamlInplaceInfo(inplace_map_str) def ParseNoNeedBuffer(self): grad_api_contents = self.grad_api_contents @@ -418,7 +434,7 @@ def DetermineForwardPositionMap(self, forward_inputs_list, return_name] = [return_type, return_pos] -class YamlGeneratorBase: +class GeneratorBase: def __init__(self, api_yaml_path): self.namespace = "" self.api_yaml_path = api_yaml_path @@ -431,7 +447,7 @@ def ParseForwardYamlContents(self): def InferNameSpace(self): api_yaml_path = self.api_yaml_path - if "sparse" in api_yaml_path: + if re.search(r"sparse[a-zA-Z0-9_]*\.yaml", api_yaml_path): self.namespace = "sparse::" - elif "strings" in api_yaml_path: + elif re.search(r"strings[a-zA-Z0-9_]*\.yaml", api_yaml_path): self.namespace = "strings::" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 9d95b9488d298..403216813dd36 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -29,10 +29,18 @@ from codegen_utils import GetInplacedFunctionName from codegen_utils import ParseYamlArgs, ParseYamlReturns, 
ParseYamlForwardFromBackward from codegen_utils import ParseYamlForward, ParseYamlBackward -from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase +from codegen_utils import ParseYamlInplaceInfo +from codegen_utils import FunctionGeneratorBase, GeneratorBase from codegen_utils import ops_to_fill_zero_for_empty_grads from codegen_utils import AssertMessage, GetIndent +# Note: assign is an inplace api when parameter(output) isn't None, +# so we should check parameter(output) with rule of inplace. +# But because there is no check in old dygraph mode, in order to +# keep the code compatible, here we also skip inplace check in new dygraph temporarily, +# and this will be fixed in the future. +inplace_check_blacklist = set(["assign_out_"]) + ########### ## Utils ## @@ -55,31 +63,31 @@ def ParseArguments(): ## Code Gen Templates ## ######################## SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = \ -""" void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ - {} = egr::TensorWrapper({}, full_reserved, {}); +""" void SetTensorWrapper{}(const paddle::experimental::Tensor& {}) {{ + {} = egr::TensorWrapper({}, {}); }} """ -PLAIN_TENSOR_MEMBER_TEMPLATE = \ -""" egr::TensorWrapper {}; -""" - -CLEAR_TENSOR_WRAPPER_TEMPLATE = \ -""" {}.clear(); -""" - SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = \ -""" void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ +""" void SetTensorWrapper{}(const std::vector& {}) {{ for(const auto& eager_tensor : {}) {{ - {}.emplace_back(egr::TensorWrapper(eager_tensor, full_reserved, {})); + {}.emplace_back(egr::TensorWrapper(eager_tensor, {})); }}; }} """ +PLAIN_TENSOR_MEMBER_TEMPLATE = \ +""" egr::TensorWrapper {}; +""" + VECTOR_TENSOR_MEMBER_TEMPLATE = \ """ std::vector {}; """ +CLEAR_TENSOR_WRAPPER_TEMPLATE = \ +""" {}.clear(); +""" + CLEAR_VECTOR_TENSOR_WRAPPERS_TEMPLATE = \ """ for (auto& tw : {}) {{ tw.clear(); @@ -146,10 +154,9 @@ class {} : public egr::GradNodeBase {{ {} // Call grad_api function - VLOG(3) << \"Final State Running: \" << \"{}\"; + VLOG(3) << \"Final State Running: {}\"; {} - - // Get Output + // Check NaN and Inf if needed {} // Get GradIn autograd_meta {} @@ -175,6 +182,8 @@ class {} : public egr::GradNodeBase {{ {} // Forward API Call VLOG(3) << \"Final State Running: \" << \"{}\"; +{} + // Check NaN and Inf if needed {} // Get Outputs {} @@ -235,9 +244,11 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/phi/api/include/sparse_api.h" +DECLARE_bool(check_nan_inf); {} """ @@ -262,7 +273,9 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/amp_utils.h" #include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +DECLARE_bool(check_nan_inf); {} {} """ @@ -332,14 +345,28 @@ class {} : public egr::GradNodeBase {{ CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE = \ """ - paddle::optional {}_optional = paddle::none; - if({}.initialized()) {}_optional = paddle::make_optional({}); + paddle::optional {}_optional = paddle::none; + if({}.initialized()) {}_optional = paddle::make_optional({}); """ CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE = \ """ - paddle::optional
{}_optional = paddle::none; + if( {}.impl() ) {}_optional = paddle::make_optional({}); +""" + +CHECK_BACKWARD_INPLACE_TEMPLATE = \ +""" + bool can_be_inplaced = false; + if ({}.initialized()) {{ + VLOG(10) << {}.name() << "({}) use_count: " << {}.impl().use_count(); + if ({}.impl().use_count() == 1 || ({}.impl().use_count() == 2 && {}.impl().get() == {}.impl().get())) {{ + can_be_inplaced = true; + }} + }}""" + +CHECK_NAN_AND_INF_TEMPLATE = \ +""" if (FLAGS_check_nan_inf) {{ egr::CheckTensorHasNanOrInf("{}", {}); }} """ @@ -398,7 +425,7 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): #self.optional_inputs #self.no_need_buffers #self.intermediate_outputs - #self.inplace_map + #self.forward_inplace_map FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) self.grad_api_contents = grad_api_contents @@ -414,9 +441,9 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): self.forward_returns_list = [ ] #[ [ret_name, ret_type, orig_position], ...] - self.backward_inputs_list = [ - ] #[ [attr_name, attr_type, default_value, orig_position], ...] self.backward_attrs_list = [ + ] #[ [attr_name, attr_type, default_value, orig_position], ...] + self.backward_inputs_list = [ ] #[ [arg_name, arg_type, orig_position], ...] self.backward_returns_list = [ ] #[ [ret_name, ret_type, orig_position], ...] @@ -429,6 +456,15 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): self.backward_grad_outputs_map = { } #{ "name" : [type, fwd_position, orig_position] ...} + self.backward_inplace_map = {} #{name : name, ...} + + def ParseBackwardInplaceInfo(self): + grad_api_contents = self.grad_api_contents + if 'inplace' not in grad_api_contents.keys(): return + + inplace_map_str = grad_api_contents['inplace'] + self.backward_inplace_map = ParseYamlInplaceInfo(inplace_map_str) + def DygraphYamlValidationCheck(self): forward_api_contents = self.forward_api_contents grad_api_contents = self.grad_api_contents @@ -495,11 +531,9 @@ def ForwardsValidationCheck(self): for _, _, pos in forward_inputs_list: max_input_position = max(max_input_position, pos) - max_attr_position = -1 for _, _, _, pos in forward_attrs_list: assert pos > max_input_position, AssertMessage(pos, max_input_position) - max_attr_position = max(max_attr_position, pos) def BackwardValidationCheck(self): backward_forward_inputs_map = self.backward_forward_inputs_map @@ -641,7 +675,7 @@ def GenerateNodeCreationCodes(self): pass_stop_gradient_args_list.append(output_autograd_meta_name) pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list) - # Node Construction + # Node Construction num_backward_inputs = len(forward_outputs_position_map.keys()) num_backward_outputs = len(forward_inputs_position_map.keys()) grad_node_name = GetGradNodeName(forward_api_name) @@ -679,21 +713,20 @@ def GenerateNodeCreationCodes(self): if is_fwd_input: if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), true);" + set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" else: - set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, true);" + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_input_tensor_wrappers_list.append(set_tensor_wrappers) - else: + else: # Forwad's output as backward's input if num_fwd_outputs > 1: # Aligned with forward output position assert name in 
forward_outputs_position_map.keys( ), AssertMessage(name, forward_outputs_position_map.keys()) - fwd_output_pos = forward_outputs_position_map[name][1] if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), false);" + set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" else: - set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_output_tensor_wrappers_list.append(set_tensor_wrappers) set_input_tensor_wrappers_str = "\n".join( set_input_tensor_wrappers_list) @@ -701,6 +734,7 @@ def GenerateNodeCreationCodes(self): set_output_tensor_wrappers_list) # SetGradOutMeta & SetEdges + grad_node_out_list = [] set_grad_out_meta_list = [] set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): @@ -713,7 +747,7 @@ def GenerateNodeCreationCodes(self): if not has_corresponding_grad_output: continue - input_autograd_meta_name = GetAutoGradMetaName(name) + grad_node_out_list.append(name) is_optional = (name in self.optional_inputs) if is_optional: set_grad_out_meta = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}.get_ptr()), {pos});" @@ -723,7 +757,7 @@ def GenerateNodeCreationCodes(self): set_grad_out_meta_list.append(set_grad_out_meta) set_grad_out_meta_str = "\n".join(set_grad_out_meta_list) - # SetOutRank & SetHistory & SetGradInMeta + # SetOutRank & SetHistory & SetGradInMeta & CheckAndRetainGrad set_out_rank_list = [] set_history_list = [] set_grad_in_meta_list = [] @@ -731,11 +765,17 @@ def GenerateNodeCreationCodes(self): num_outputs = len(forward_outputs_position_map.keys()) for name, (_, pos) in forward_outputs_position_map.items(): output_autograd_meta_name = GetAutoGradMetaName(name) - set_out_rank = f"{indent}egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" - set_history = f"{indent}egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" + set_out_rank = f"""{indent}if ({output_autograd_meta_name}) {{ +{indent} egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos}); +{indent}}}""" + + set_history = f"""{indent}if ({output_autograd_meta_name}) {{ +{indent} egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node); +{indent}}}""" - set_retain_grad = f"{indent}egr::EagerUtils::CheckAndRetainGrad({name});" set_grad_in_meta = f"{indent}grad_node->SetGradInMeta({name}, {pos});" + set_retain_grad = f"{indent}egr::EagerUtils::CheckAndRetainGrad({name});" + set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) set_grad_in_meta_list.append(set_grad_in_meta) @@ -755,6 +795,7 @@ def GenerateNodeCreationCodes(self): set_input_tensor_wrappers_str, set_grad_out_meta_str, set_out_rank_str, set_history_str, set_grad_in_meta_str, set_retain_grad_str, set_output_tensor_wrappers_str) + self.grad_node_out_list = grad_node_out_list def run(self): # Basic Validation Check @@ -763,8 +804,9 @@ def run(self): ########################## ## Parsing Raw Contents ## ########################## - # Parse inplace_map - self.ParseInplaceInfo() + # Parse forward and backward inplace_map + self.ParseForwardInplaceInfo() + self.ParseBackwardInplaceInfo() # Parse no_need_buffer self.ParseNoNeedBuffer() @@ -795,7 +837,7 @@ def run(self): self.DetermineForwardPositionMap(self.forward_inputs_list, self.forward_returns_list) - # Initialize 
forward_inputs_position_map, forward_outputs_position_map + # Initialize backward_forward_inputs_map, backward_grad_inputs_map, backward_grad_outputs_map self.SlotNameMatching() # Backward Validation Check @@ -811,21 +853,21 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): self.forward_definition_str = "" self.forward_declaration_str = "" - def GenerateForwardDefinition(self, is_inplaced): + def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): namespace = self.namespace + if self.forward_api_name[-1] == '_' and not is_inplaced: + return forward_api_name = GetInplacedFunctionName( self.forward_api_name) if is_inplaced else self.forward_api_name - backward_api_name = self.backward_api_name + forward_inputs_position_map = self.forward_inputs_position_map forward_outputs_position_map = self.forward_outputs_position_map forward_attrs_list = self.forward_attrs_list - backward_forward_inputs_map = self.backward_forward_inputs_map - backward_grad_inputs_map = self.backward_grad_inputs_map backward_grad_outputs_map = self.backward_grad_outputs_map - backward_attrs_list = self.backward_attrs_list + optional_inputs = self.optional_inputs intermediate_outputs = self.intermediate_outputs - inplace_map = self.inplace_map if is_inplaced else {} + forward_inplace_map = self.forward_inplace_map if is_inplaced else {} indent = GetIndent(1) # Get Function Args @@ -834,6 +876,7 @@ def GenerateForwardDefinition(self, is_inplaced): inputs_args_definition_list = ["" for i in range(num_inputs)] inputs_args_declaration_list = ["" for i in range(num_inputs)] inputs_call_list = ["" for i in range(num_inputs)] + amp_inputs_call_list = ["" for i in range(num_inputs)] amp_tensors_vector_list = [] amp_tensors_vector_optional_list = [] @@ -856,7 +899,7 @@ def GenerateForwardDefinition(self, is_inplaced): f"auto NEW_{name} = ({name}.get_ptr() != nullptr) ? 
paddle::make_optional(NEW_{name}_temp_tensor) : {name};\n" ) else: - if is_inplaced and inplace_map and name in inplace_map.keys( + if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys( ): arg_str = f"paddle::experimental::Tensor& {name}" amp_tensors_vector_list.append(f"{{{name}}}") @@ -903,10 +946,17 @@ def GenerateForwardDefinition(self, is_inplaced): else: function_name = GetIntermediateAPIFunctionName(function_name) - forward_call_str = f"{indent}auto api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});" + api_out_type = "auto" + if is_inplaced and len(forward_outputs_position_map) == 1: + api_out_type = "auto&" + forward_call_str = f"{indent}{api_out_type} api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});" num_outputs = len(forward_outputs_position_map.keys()) - len( intermediate_outputs) + # Check Nan and Inf + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(function_name, + "api_result") + # Get Outputs get_outputs_str = "" for name, (rtype, pos) in forward_outputs_position_map.items(): @@ -924,11 +974,20 @@ def GenerateForwardDefinition(self, is_inplaced): returns_list[pos] = f"{name}" if IsPlainTensorType(rtype): - returns_type_list[pos] = "paddle::experimental::Tensor" + if is_inplaced and forward_inplace_map and name in forward_inplace_map.values( + ): + returns_type_list[pos] = "paddle::experimental::Tensor&" + else: + returns_type_list[pos] = "paddle::experimental::Tensor" else: assert IsVectorTensorType(rtype) - returns_type_list[ - pos] = "std::vector" + if is_inplaced and forward_inplace_map and name in forward_inplace_map.values( + ): + returns_type_list[ + pos] = "std::vector&" + else: + returns_type_list[ + pos] = "std::vector" if num_outputs == 1: returns_str = returns_list[0] @@ -937,24 +996,33 @@ def GenerateForwardDefinition(self, is_inplaced): returns_type_str = ", ".join(returns_type_list) returns_type_str = f"std::tuple<{returns_type_str}>" returns_str = ", ".join(returns_list) - returns_str = f"std::make_tuple({returns_str})" + returns_str = f"{returns_type_str}{{{returns_str}}}" # Node Creation Pre-Processing # 1. 
Get Input AutoGradMeta inputs_autograd_meta_list = [] compute_require_grad_args_list = ["trace_backward"] for name, (ttype, pos) in forward_inputs_position_map.items(): - input_autograd_meta_name = GetAutoGradMetaName(name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) + # Has corresponding grad output + has_corresponding_grad_output = False + for _, (_, corresponding_pos, + _) in backward_grad_outputs_map.items(): + if pos == corresponding_pos: + has_corresponding_grad_output = True + if has_corresponding_grad_output or ( + name in forward_inplace_map and + forward_api_name not in inplace_check_blacklist): + input_autograd_meta_name = GetAutoGradMetaName(name) + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append(input_autograd_meta_name) inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) compute_require_grad_args_str = ",".join(compute_require_grad_args_list) @@ -987,16 +1055,19 @@ def GenerateForwardDefinition(self, is_inplaced): check_inplace_str = "" bump_inplace_version_str = "" if is_inplaced: - for inplace_name in inplace_map.keys(): - inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name) - check_inplace_str += CHECK_INPLACE_TEMPLATE.format( - inplace_name, inplace_autograd_meta_name) + for inplace_name in forward_inplace_map.keys(): + if forward_api_name not in inplace_check_blacklist: + inplace_autograd_meta_name = GetAutoGradMetaName( + inplace_name) + check_inplace_str += CHECK_INPLACE_TEMPLATE.format( + inplace_name, inplace_autograd_meta_name) bump_inplace_version_str += BUMP_INPLACE_VERSION_TEMPLATE.format( inplace_name, inplace_name) + # Node Creation self.GenerateNodeCreationCodes() - node_creation_str = self.node_creation_str + dygraph_event_str = f"{indent}paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);\n" forward_function_name = GetDygraphForwardFunctionName(forward_api_name) @@ -1020,13 +1091,14 @@ def GenerateForwardDefinition(self, is_inplaced): amp_tensors_vector_optional_list_str, amp_get_dst_dtype_str, amp_autocast_list_str, amp_call_str) + # Generate forward_definition_str and forward_declaration_str self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, dygraph_event_str, amp_logic_str, inputs_autograd_meta_str, - forward_function_name, 
forward_call_str, get_outputs_str, - outputs_autograd_meta_str, compute_require_grad_args_str, - check_inplace_str, bump_inplace_version_str, node_creation_str, - returns_str) + forward_function_name, forward_call_str, check_nan_inf_str, + get_outputs_str, outputs_autograd_meta_str, + compute_require_grad_args_str, check_inplace_str, + bump_inplace_version_str, node_creation_str, returns_str) self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n" def GenerateInplacedForwardDygraphFunctions(self): @@ -1036,8 +1108,8 @@ def GenerateInplacedForwardDygraphFunctions(self): if forward_api_name != "sum" and "inplace" in forward_api_contents.keys( ): - # Node Definition Generation - self.GenerateForwardDefinition(is_inplaced=True) + # Function Definition and Declaration Generation + self.GenerateForwardDefinitionAndDeclaration(is_inplaced=True) self.UpdateCoreOpsInformation(is_inplaced=True) def UpdateCoreOpsInformation(self, is_inplaced): @@ -1058,6 +1130,7 @@ def UpdateCoreOpsInformation(self, is_inplaced): final_state_fwd_api_name] = ["" for i in range(num_args)] core_ops_args_type_info[ final_state_fwd_api_name] = ["" for i in range(num_args)] + for name, (ttype, pos) in forward_inputs_position_map.items(): core_ops_args_info[final_state_fwd_api_name][pos] = name if IsPlainTensorType(ttype): @@ -1079,7 +1152,9 @@ def run(self): ##################### ## Code Generation ## ##################### - self.GenerateForwardDefinition(is_inplaced=False) + + # Definition And Declaration + self.GenerateForwardDefinitionAndDeclaration(is_inplaced=False) self.UpdateCoreOpsInformation(is_inplaced=False) @@ -1139,8 +1214,10 @@ def GenerateHigherOrderNodeCreationCode(self): grad_api_contents = self.grad_api_contents next_grad_api_contents = self.next_grad_api_contents - grad_node_creation_str = "" + next_grad_node_creation_str = "" + next_grad_node_out_list = [] if next_grad_api_contents: + # Fake forward_api_contents and backward_api_contents forward_api_contents = grad_api_contents forward_api_contents['api'] = forward_api_contents['backward_api'] backward_api_contents = next_grad_api_contents @@ -1149,11 +1226,12 @@ def GenerateHigherOrderNodeCreationCode(self): forward_api_contents, backward_api_contents, namespace) next_node_generator.run() next_node_generator.GenerateNodeCreationCodes() - grad_node_creation_str = next_node_generator.node_creation_str + next_grad_node_creation_str = next_node_generator.node_creation_str + next_grad_node_out_list = next_node_generator.grad_node_out_list self.RecordGrad2NextGradNameMapping(next_node_generator) - return grad_node_creation_str + return next_grad_node_creation_str, next_grad_node_out_list def GenerateNodeDeclaration(self): forward_op_name = self.forward_api_name @@ -1161,7 +1239,7 @@ def GenerateNodeDeclaration(self): backward_attrs_list = self.backward_attrs_list no_need_buffers = self.no_need_buffers - # SetTensorWrapper Methods & TensorWrapper Members + # SetTensorWrapper Methods & TensorWrapper Members & ClearTensorWrappers set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" clear_tensor_wrapper_str = "" @@ -1214,7 +1292,8 @@ def GenerateNodeDeclaration(self): set_attribute_methods_str, tensor_wrapper_members_str, attribute_members_str) - def GenerateNodeDefinition(self, grad_node_creation_str): + def GenerateNodeDefinition(self, next_grad_node_creation_str, + next_grad_node_out_list): namespace = self.namespace forward_api_name = self.forward_api_name backward_api_name = 
self.backward_api_name @@ -1222,6 +1301,7 @@ def GenerateNodeDefinition(self, grad_node_creation_str): backward_grad_inputs_map = self.backward_grad_inputs_map backward_grad_outputs_map = self.backward_grad_outputs_map backward_attrs_list = self.backward_attrs_list + backward_inplace_map = self.backward_inplace_map indent = GetIndent(1) # Construct grad_api function args @@ -1234,8 +1314,19 @@ def GenerateNodeDefinition(self, grad_node_creation_str): # Fill Grad Ins with Zero fill_zero_str = "" if backward_api_name in ops_to_fill_zero_for_empty_grads: - fill_zero_str = f"{indent}egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n" + fill_zero_str = f"{indent}const auto& input_metas = this->InputMeta();\n" + for name, (ttype, fwd_position, + grad_api_position) in backward_grad_inputs_map.items(): + if name in self.optional_inputs: + if IsPlainTensorType(ttype): + fill_zero_str += f"{indent}egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[{fwd_position}][0], input_metas[{fwd_position}][0]);\n" + else: + if IsPlainTensorType(ttype): + fill_zero_str += f"{indent}egr::EagerUtils::FillZeroForEmptyGradInput(&grads[{fwd_position}][0], input_metas[{fwd_position}][0]);\n" + else: + fill_zero_str += f"{indent}egr::EagerUtils::FillZeroForEmptyGradInput(&grads[{fwd_position}], input_metas[{fwd_position}]);\n" + inplace_grad_input_str = "" # Grad Ins from TensorWrappers for name, (_, is_fwd_input, grad_api_position), in backward_forward_inputs_map.items(): @@ -1244,6 +1335,14 @@ def GenerateNodeDefinition(self, grad_node_creation_str): is_optional = (name in self.optional_inputs) tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});" + if backward_inplace_map and name in backward_inplace_map.keys(): + tensor_wrapper_intermidiate_tensor_str = f"(&this->{tensor_wrapper_name})->get_intermidiate_tensor()" + tensor_wrapper_recover_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( + transformed_tensor_name, transformed_tensor_name, name, + transformed_tensor_name, transformed_tensor_name, + transformed_tensor_name, transformed_tensor_name, + tensor_wrapper_intermidiate_tensor_str) + inplace_grad_input_str = transformed_tensor_name if is_optional: tensor_wrapper_recover_str += "\n" + CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, @@ -1266,6 +1365,16 @@ def GenerateNodeDefinition(self, grad_node_creation_str): if IsPlainTensorType(ttype): get_tensor_str = f"{indent}auto& {transformed_tensor_name} = hooked_grads[{fwd_position}][0];" + # Inplace in backward op + if backward_inplace_map and name in backward_inplace_map.keys(): + grads_tensor_str = f"grads[{fwd_position}][0]" + get_tensor_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( + transformed_tensor_name, transformed_tensor_name, name, + transformed_tensor_name, transformed_tensor_name, + transformed_tensor_name, transformed_tensor_name, + grads_tensor_str) + inplace_grad_input_str = transformed_tensor_name + if is_optional: get_tensor_str += "\n" + CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, @@ -1290,72 +1399,106 @@ def GenerateNodeDefinition(self, grad_node_creation_str): get_grad_in_args_list.append(get_attr_str) get_grad_in_args_str = "\n".join(get_grad_in_args_list) - grad_api_args_str = ", ".join(grad_api_args) # Grad Function Call String + slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) grad_api_namespace = 
f"paddle::experimental::{namespace}" - grad_function_call_str = f"{indent}auto grad_api_result = {grad_api_namespace}{backward_api_name}({grad_api_args_str});" + grad_function_call_str = f""" + const auto& out_metas = OutputMeta(); + paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs}); + for (int i = 0; i < {slot_num_bwd_outputs}; ++i) {{ + returns[i].resize(out_metas[i].size()); + }} +""" - # Get Grad Outputs - get_outputs_str = "" - num_outputs = len(backward_grad_outputs_map.keys()) + # Grad Outputs + out_index = -1 for name, (ttype, fwd_position, grad_api_position) in backward_grad_outputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) + out_index = out_index + 1 + grad_api_args.append(f"api_output_{out_index}") + + if IsPlainTensorType(ttype): + inplace_for_grad_outs_str = "" + if backward_inplace_map and name in backward_inplace_map.values( + ): + inplace_for_grad_outs_str = f""" +{indent}if (api_output_{out_index} != nullptr && can_be_inplaced) {{ +{indent} egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); +{indent}}}""" + + grad_function_call_str += f""" + auto* api_output_{out_index} = (out_metas[{fwd_position}].empty() || out_metas[{fwd_position}][0].IsStopGradient()) ? nullptr : &returns[{fwd_position}][0];{inplace_for_grad_outs_str}""" - if num_outputs == 1: - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result;" else: - if IsPlainTensorType(ttype): - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result[{grad_api_position}][0];" - else: - assert IsVectorTensorType(ttype) - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result[{grad_api_position}];" - get_outputs_str += get_tensor_str + "\n" + assert IsVectorTensorType(ttype) + grad_function_call_str += f""" + std::vector api_output_{out_index}; + api_output_{out_index}.reserve(returns[{fwd_position}].size()); + for (size_t i = 0; i < returns[{fwd_position}].size(); ++i) {{ + if (out_metas[{fwd_position}].empty() || out_metas[{fwd_position}][i].IsStopGradient()) {{ + api_output_{out_index}.push_back(nullptr); + }} else {{ + api_output_{out_index}.push_back(&returns[{fwd_position}][i]); + }} + }}""" + + grad_api_args_str = ", ".join(grad_api_args) + + grad_function_call_str = grad_function_call_str + f""" +{indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str});""" + + # Check Nan and Inf + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(backward_api_name, + "returns") # Prepare for Node Creation if Necessary inputs_autograd_meta_str = "" outputs_autograd_meta_str = "" compute_require_grad_str = "" - if len(grad_node_creation_str) > 0: - # 1. Get Input AutoGradMeta + if len(next_grad_node_creation_str) > 0: + # 1. 
Get Grad Input AutoGradMeta inputs_autograd_meta_list = [] compute_require_grad_args_list = ["trace_backward"] for name, (ttype, pos, grad_api_position) in backward_grad_inputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) - - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + if transformed_tensor_name in next_grad_node_out_list: + input_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + transformed_tensor_name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append( + input_autograd_meta_name) # 2. 
Get TensorWrapper AutoGradMeta for name, (ttype, _, pos), in backward_forward_inputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) - - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + if transformed_tensor_name in next_grad_node_out_list: + input_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + transformed_tensor_name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append( + input_autograd_meta_name) - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) compute_require_grad_args_str = ",".join( compute_require_grad_args_list) @@ -1363,28 +1506,26 @@ def GenerateNodeDefinition(self, grad_node_creation_str): # 3. Get Output AutoGradMeta outputs_autograd_meta_list = [] num_fwd_outputs = len(backward_grad_outputs_map.keys()) - for name, (rtype, pos, _) in backward_grad_outputs_map.items(): + for name, (rtype, pos, + grad_api_position) in backward_grad_outputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) output_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) output_autograd_meta_vec_name = GetAutoGradMetaVectorName( transformed_tensor_name) - if num_fwd_outputs == 1: - if IsPlainTensorType(rtype): - output_autograd_meta = f"{indent}egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});" - else: - assert IsVectorTensorType(rtype) - output_autograd_meta = f"{indent}std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});\n" - output_autograd_meta += f"{indent}std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + if IsPlainTensorType(rtype): + output_autograd_meta = f""" + auto& {transformed_tensor_name} = returns[{pos}][0]; + egr::AutogradMeta* {output_autograd_meta_name} = returns[{pos}][0].initialized() ? 
egr::EagerUtils::autograd_meta(&{transformed_tensor_name}) : nullptr;""" + else: - # Tuple api_result - if IsPlainTensorType(rtype): - output_autograd_meta = f"{indent}egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});" - else: - assert IsVectorTensorType(rtype) - output_autograd_meta = f"{indent}std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});\n" - output_autograd_meta += f"{indent}std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + assert IsVectorTensorType(rtype) + output_autograd_meta = f""" + auto& {transformed_tensor_name} = returns[{pos}]; + std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); + std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name}; +""" outputs_autograd_meta_list.append(output_autograd_meta) outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) @@ -1392,30 +1533,16 @@ def GenerateNodeDefinition(self, grad_node_creation_str): compute_require_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n" compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});" - # Construct grad_api returns - slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) - returns_str = f"{indent}paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs});\n" - for name, (ttype, fwd_position, - grad_api_position) in backward_grad_outputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - - # Rearrange output order accordingly - if IsPlainTensorType(ttype): - returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n" - else: - assert IsVectorTensorType(ttype) - returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n" - - returns_str += f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" + returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" grad_node_name = GetGradNodeName(forward_api_name) self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( grad_node_name, fill_zero_str, get_grad_in_args_str, grad_node_name, - grad_function_call_str, get_outputs_str, inputs_autograd_meta_str, + grad_function_call_str, check_nan_inf_str, inputs_autograd_meta_str, outputs_autograd_meta_str, compute_require_grad_str, - grad_node_creation_str, returns_str) + next_grad_node_creation_str, returns_str) def run(self): super().run() @@ -1426,26 +1553,29 @@ def run(self): ## Code Generation ## ##################### # Higher-order GradNode generation - grad_node_creation_str = self.GenerateHigherOrderNodeCreationCode() + next_grad_node_creation_str, next_grad_node_out_list = self.GenerateHigherOrderNodeCreationCode( + ) self.GenerateNodeDeclaration() - self.GenerateNodeDefinition(grad_node_creation_str) + self.GenerateNodeDefinition(next_grad_node_creation_str, + next_grad_node_out_list) -class DygraphYamlGenerator(YamlGeneratorBase): +class DygraphForwardAndNodesGenerator(GeneratorBase): def __init__(self, api_yaml_path, backward_yaml_path): - # Parent members: + # Parent members: # self.namespace # self.api_yaml_path # self.forward_api_list - YamlGeneratorBase.__init__(self, api_yaml_path) + GeneratorBase.__init__(self, api_yaml_path) 
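        # Each generator instance handles a single (api_yaml, backward_yaml) pair; the
        # driver at the bottom of this file concatenates the four generated *_str members
        # across all pairs before writing the node and forward source files.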
self.backward_yaml_path = backward_yaml_path self.grad_api_dict = {} - self.forward_definition_str = "" self.forward_declaration_str = "" + self.forward_definition_str = "" + self.node_declaration_str = "" self.node_definition_str = "" @@ -1485,6 +1615,7 @@ def GenerateCode(self): self.forward_definition_str += function_generator.forward_definition_str + "\n" self.forward_declaration_str += function_generator.forward_declaration_str + "\n" + # Generate Dygraph GradNode Function while True: next_grad_api_contents = self.GetBackwardAPIContents( backward_api_contents) @@ -1578,20 +1709,23 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): # Generate per Dygraph API node_declaration_str = "" node_definition_str = "" - forward_definition_str = "" + forward_declaration_str = "" + forward_definition_str = "" for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] backward_yaml_path = backward_yaml_paths[i] - generator = DygraphYamlGenerator(api_yaml_path, backward_yaml_path) + generator = DygraphForwardAndNodesGenerator(api_yaml_path, + backward_yaml_path) generator.run() node_declaration_str += generator.node_declaration_str + "\n" node_definition_str += generator.node_definition_str + "\n" - forward_definition_str += generator.forward_definition_str + "\n" + forward_declaration_str += generator.forward_declaration_str + "\n" + forward_definition_str += generator.forward_definition_str + "\n" # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index b86685c205a5c..c02400299dfa6 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -15,7 +15,7 @@ import os import argparse import logging -from codegen_utils import FunctionGeneratorBase, YamlGeneratorBase +from codegen_utils import FunctionGeneratorBase, GeneratorBase from codegen_utils import yaml_types_mapping from codegen_utils import ReadFwdFile, IsVectorTensorType, GetForwardFunctionName from codegen_utils import ParseYamlForward, GetInplacedFunctionName @@ -100,7 +100,8 @@ def FindParsingFunctionFromAttributeType(atype): // Set Device ID {} - auto out = {}({}); + // Call dygraph function + decltype({}({})) out = {}({}); PyEval_RestoreThread(tstate); tstate = nullptr; @@ -258,7 +259,7 @@ def __init__(self, forward_api_contents, namespace): #self.optional_inputs #self.no_need_buffers #self.intermediate_outputs - #self.inplace_map + #self.forward_inplace_map FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) self.is_forward_only = True @@ -274,7 +275,7 @@ def CollectIsForwardOnly(self): def GeneratePythonCFunction(self): namespace = self.namespace - inplace_map = self.inplace_map + forward_inplace_map = self.forward_inplace_map forward_api_name = self.forward_api_name orig_forward_attrs_list = self.orig_forward_attrs_list forward_inputs_position_map = self.forward_inputs_position_map @@ -328,7 +329,7 @@ def GeneratePythonCFunction(self): dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) - # Generate Python-C Function Definitions + # Generate Python-C Function Definitions if is_forward_only: fwd_function_name = FUNCTION_NAME_TEMPLATE.format( "paddle::experimental::", namespace, forward_api_name) @@ -341,20 +342,24 @@ def 
GeneratePythonCFunction(self): # Generate Record Event for performance profiling pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( "pythonc_record_event", forward_api_name, "pybind_imperative_func") + + # Generate Python-C Function Definetion self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( forward_api_name, pythonc_record_event_str, forward_api_name, get_eager_tensor_str, parse_attributes_str, set_device_str, - fwd_function_name, dygraph_function_call_str, return_str) + fwd_function_name, dygraph_function_call_str, fwd_function_name, + dygraph_function_call_str, return_str) # Set prefix of forward_api_name to avoid conflicts prefix = self.namespace.strip("::") forward_api_name_prefix = "" if prefix == "" else prefix + "_" + # Generate Python-C Function Registration self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( forward_api_name_prefix, forward_api_name, namespace, forward_api_name, forward_api_name) - if inplace_map: + if forward_inplace_map: inplaced_forward_api_name = GetInplacedFunctionName( self.forward_api_name) if is_forward_only: @@ -367,26 +372,37 @@ def GeneratePythonCFunction(self): GetForwardFunctionName(inplaced_forward_api_name)) assert len( - inplace_map - ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}" - for inplace_input, inplace_output in inplace_map.items(): + forward_inplace_map + ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(forward_inplace_map)}" + for inplace_input, inplace_output in forward_inplace_map.items(): return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format( inplaced_forward_api_name, inplace_input, inplaced_forward_api_name, inplace_output) break - self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format( + # Generate Python-C Function Definetion + python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( inplaced_forward_api_name, pythonc_record_event_str, inplaced_forward_api_name, get_eager_tensor_str, parse_attributes_str, set_device_str, inplaced_fwd_function_name, dygraph_function_call_str, + inplaced_fwd_function_name, dygraph_function_call_str, return_str) - # Generate Python-C Function Registration - self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format( + python_c_inplace_func_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( forward_api_name_prefix, inplaced_forward_api_name, namespace, inplaced_forward_api_name, inplaced_forward_api_name) + # self.forward_api_name ending with '_' means it only has inplace api + if self.forward_api_name[-1] == '_': + self.python_c_function_str = python_c_inplace_func_str + # Generate Python-C Function Registration + self.python_c_function_reg_str = python_c_inplace_func_reg_str + else: + self.python_c_function_str += python_c_inplace_func_str + # Generate Python-C Function Registration + self.python_c_function_reg_str += "\n," + python_c_inplace_func_reg_str + def run(self): # Initialized is_forward_only self.CollectIsForwardOnly() @@ -394,8 +410,8 @@ def run(self): # Initialized optional_inputs self.ParseDispensable() - # Initialized inplace_map - self.ParseInplaceInfo() + # Initialized forward_inplace_map + self.ParseForwardInplaceInfo() # Initialized orig_forward_inputs_list, orig_forward_returns_list, orig_forward_attrs_list self.CollectOriginalForwardInfo() @@ -412,17 +428,17 @@ def run(self): return True -class PythonCYamlGenerator(YamlGeneratorBase): +class PythonCGenerator(GeneratorBase): def __init__(self, path): # 
Parent members: # self.namespace # self.api_yaml_path # self.forward_api_list - YamlGeneratorBase.__init__(self, api_yaml_path) + GeneratorBase.__init__(self, api_yaml_path) # Generated Result - self.python_c_functions_reg_str = "" self.python_c_functions_str = "" + self.python_c_functions_reg_str = "" def GeneratePythonCFunctions(self): namespace = self.namespace @@ -434,8 +450,8 @@ def GeneratePythonCFunctions(self): status = f_generator.run() if status == True: - self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" self.python_c_functions_str += f_generator.python_c_function_str + "\n" + self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" def AttachNamespace(self): namespace = self.namespace @@ -507,11 +523,11 @@ def GeneratePythonCFile(filepath, python_c_str): for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] - y_generator = PythonCYamlGenerator(api_yaml_path) - y_generator.run() + py_c_generator = PythonCGenerator(api_yaml_path) + py_c_generator.run() - generated_python_c_functions += y_generator.python_c_functions_str + "\n" - generated_python_c_registration += y_generator.python_c_functions_reg_str + "\n" + generated_python_c_functions += py_c_generator.python_c_functions_str + "\n" + generated_python_c_registration += py_c_generator.python_c_functions_reg_str + "\n" python_c_str = GeneratePythonCWrappers(generated_python_c_functions, generated_python_c_registration) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 7a4e7f81611d1..63b899f6d6b62 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -66,68 +66,69 @@ class GeneralGrad { "stop_gradient=True.", msg, i)); if (is_no_grad_vars) { - (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta; + (no_grad_var_nodes_inputmeta_map_)[target_node] = auto_grad_meta; } else { // normal input - (input_target_nodes_inputmeta_map)[target_node] = auto_grad_meta; + (input_target_nodes_inputmeta_map_)[target_node] = auto_grad_meta; } } } } - // Purify potential_startup_nodes, remove nodes those are the same as + // Purify potential_startup_nodes_, remove nodes those are the same as // input_target_nodes void PurifyPotentialStartUpNodes() { VLOG(6) << "Running in PurifyPotentialStartUpNodes"; - if (input_target_nodes_inputmeta_map.empty()) return; + if (input_target_nodes_inputmeta_map_.empty()) return; std::unordered_set potential_startup_nodes_to_be_erased; - for (auto startup_op : potential_startup_nodes) { - auto iter = input_target_nodes_inputmeta_map.find(startup_op); - if (iter != input_target_nodes_inputmeta_map.end()) { + for (auto startup_op : potential_startup_nodes_) { + auto iter = input_target_nodes_inputmeta_map_.find(startup_op); + if (iter != input_target_nodes_inputmeta_map_.end()) { potential_startup_nodes_to_be_erased.emplace(iter->first); } } if (!potential_startup_nodes_to_be_erased.empty()) { for (auto nodes : potential_startup_nodes_to_be_erased) { - potential_startup_nodes.erase(nodes); + potential_startup_nodes_.erase(nodes); } } } // Remove some nodes those doesn't need to be - // stored in potential_stop_nodes、potential_startup_nodes + // stored in potential_stop_nodes_、potential_startup_nodes_ void UpdateGraphInfo() { - // Updated potential_sotp_nodes by depending_nodes, + // Updated potential_sotp_nodes by depending_nodes_, // make sure the path from root to target_node is ok - std::unordered_set _startup_ops; + std::unordered_set startup_ops; VLOG(6) << "Running in 
UpdateGraphInfo"; std::queue queue; - for (auto& target_nodes_inputmeta_pair : input_target_nodes_inputmeta_map) { + for (auto& target_nodes_inputmeta_pair : + input_target_nodes_inputmeta_map_) { queue.emplace(target_nodes_inputmeta_pair.first); } while (!queue.empty()) { auto* target_node = queue.front(); queue.pop(); - if (!(depending_nodes)[target_node].empty()) { - auto precedding_nodes = (depending_nodes)[target_node]; + if (!(depending_nodes_)[target_node].empty()) { + auto precedding_nodes = (depending_nodes_)[target_node]; for (auto pre_nodes : precedding_nodes) { queue.emplace(pre_nodes); - if (potential_stop_nodes.find(pre_nodes) != - potential_stop_nodes.end()) { - potential_stop_nodes.erase(pre_nodes); + if (potential_stop_nodes_.find(pre_nodes) != + potential_stop_nodes_.end()) { + potential_stop_nodes_.erase(pre_nodes); } } } else { // startup_ops have no precedding nodes - VLOG(6) << "Emplace _startup_ops"; - _startup_ops.emplace(target_node); + VLOG(6) << "Emplace startup_ops"; + startup_ops.emplace(target_node); } } - // Purify potential_startup_nodes again, remove some + // Purify potential_startup_nodes_ again, remove some // potential startup_nodes that unreach to input target nodes - if (!_startup_ops.empty()) { + if (!startup_ops.empty()) { std::unordered_set potential_startup_nodes_to_be_erased; - for (auto node : potential_startup_nodes) { - if (_startup_ops.count(node) == 0) { + for (auto node : potential_startup_nodes_) { + if (startup_ops.count(node) == 0) { VLOG(6) << "Set up potential_startup_nodes_to_be_erased"; potential_startup_nodes_to_be_erased.emplace(node); } @@ -135,14 +136,14 @@ class GeneralGrad { if (!potential_startup_nodes_to_be_erased.empty()) { for (auto node : potential_startup_nodes_to_be_erased) { VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; - potential_startup_nodes.erase(node); + potential_startup_nodes_.erase(node); } } } } // Get Graph Info Betweent input target GradNode and outputs, - // record depending_nodes、potential_stop_nodes、potential_startup_nodes + // record depending_nodes_、potential_stop_nodes_、potential_startup_nodes_ void GetGraphInfoBetweenTargets(const std::queue& init_queue) { VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; @@ -164,9 +165,9 @@ class GeneralGrad { visited.insert(node); // Check node is target_nodes or not, if node is not target_node, - // all the next_node will be marked in potential_stop_nodes + // all the next_node will be marked in potential_stop_nodes_ bool is_potential_stop_nodes = - input_target_nodes_inputmeta_map.count(node); + input_target_nodes_inputmeta_map_.count(node); // Find and append next nodes const paddle::small_vector, @@ -186,40 +187,41 @@ class GeneralGrad { // all the next_nodes of current node will be inserted to // potential_stop_node if (is_potential_stop_nodes) { - potential_stop_nodes.emplace(next_node); + potential_stop_nodes_.emplace(next_node); } // Update in_degree - if (!node_in_degree_map.count(next_node)) + if (!node_in_degree_map.count(next_node)) { node_in_degree_map[next_node] = 0; + } node_in_degree_map[next_node]++; // Record depending relationship - (depending_nodes)[next_node].emplace(node); + (depending_nodes_)[next_node].emplace(node); queue.push(next_node); } } } // Update Graph Info, remove some nodes in - // potential_stop_nodes、potential_startup_nodes、 + // potential_stop_nodes_、potential_startup_nodes_、 UpdateGraphInfo(); } void ModifyReadyQueue(std::queue* queue) { std::queue tmp_queue; - for (auto nodes : potential_startup_nodes) { + 
for (auto nodes : potential_startup_nodes_) { tmp_queue.emplace(nodes); } tmp_queue.swap(*queue); } - // Set result for input target grad_var when potential_startup_nodes is empty + // Set result for input target grad_var when potential_startup_nodes_ is empty void SetResultForInputTargetVar( const std::unordered_map>& node_input_buffers_dict) { - if (potential_startup_nodes.size() == 0) { - for (auto input_target_node : *GetInPutTargetNodesInputMetaMap()) { + if (potential_startup_nodes_.size() == 0) { + for (auto input_target_node : *GetInputTargetNodesInputMetaMap()) { // out rank_info of forward op auto rank_info = input_target_node.second->OutRankInfo(); auto iter = node_input_buffers_dict.find(input_target_node.first); @@ -227,7 +229,7 @@ class GeneralGrad { auto& target_result = (iter->second)->Buffers()[rank_info.first][rank_info.second]; // save the target result - results_map[input_target_node.first] = target_result; + results_map_[input_target_node.first] = target_result; } } } @@ -236,8 +238,8 @@ class GeneralGrad { // Set input target grad_var from node_input_buffer by inputmeta void SetResultForInputTargetVar(GradTensorHolder input_buffers, GradNodeBase* node) { - auto iter = GetInPutTargetNodesInputMetaMap()->find(node); - if (iter != GetInPutTargetNodesInputMetaMap()->end()) { + auto iter = GetInputTargetNodesInputMetaMap()->find(node); + if (iter != GetInputTargetNodesInputMetaMap()->end()) { VLOG(6) << "Get target result by by inputmeta"; // out rank_info of forward op auto rank_info = (iter->second)->OutRankInfo(); @@ -245,7 +247,7 @@ class GeneralGrad { auto& target_result = input_buffers.Buffers()[rank_info.first][rank_info.second]; // save the target result - results_map[node] = target_result; + results_map_[node] = target_result; } } @@ -271,8 +273,8 @@ class GeneralGrad { "input"; } - auto iter = results_map.find(target_node); - if (iter != results_map.end()) { + auto iter = results_map_.find(target_node); + if (iter != results_map_.end()) { // set StopGradient = !create_graph AutogradMeta* tensor_auto_grad_meta = EagerUtils::autograd_meta(&(iter->second)); @@ -303,12 +305,12 @@ class GeneralGrad { GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */); // Get inputs's GradNodes and InputMeta Info GetTargetNodesInfo(inputs, false /* is_no_grad_vars */); - // Purify potential_startup_ops, remove those nodes that are the same as + // Purify potentialstartup_ops, remove those nodes that are the same as // input_target_nodes PurifyPotentialStartUpNodes(); // Get Graph Info Betweent input target gradnode and outputs - // Record the depending_nodes and - // potential_stop_nodes、potential_startup_nodes + // Record the depending_nodes_ and + // potential_stop_nodes_、potential_startup_nodes_ GetGraphInfoBetweenTargets(*queue); // Reset queue. Queue is empty only when // 1.input equals to output. 2.input can not reach to output. 
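ModifyReadyQueue above rebuilds the ready queue from potential_startup_nodes_ through a temporary queue plus swap, since std::queue offers neither clear() nor range assignment. A generic sketch of that idiom, not Paddle-specific:

    #include <queue>
    #include <unordered_set>

    // Replace the contents of *queue with the elements of `sources`.
    template <typename T>
    void ResetQueue(const std::unordered_set<T>& sources, std::queue<T>* queue) {
      std::queue<T> tmp;
      for (const auto& item : sources) {
        tmp.emplace(item);
      }
      tmp.swap(*queue);  // O(1); the old contents are discarded along with tmp
    }
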
@@ -318,34 +320,34 @@ class GeneralGrad { } bool IsPotentialStopNodes(GradNodeBase* node) { - return potential_stop_nodes.count(node); + return potential_stop_nodes_.count(node); } std::unordered_map* GetNoGradVarNodesInputMetaMap() { - return &no_grad_var_nodes_inputmeta_map; + return &no_grad_var_nodes_inputmeta_map_; } std::unordered_map* - GetInPutTargetNodesInputMetaMap() { - return &input_target_nodes_inputmeta_map; + GetInputTargetNodesInputMetaMap() { + return &input_target_nodes_inputmeta_map_; } std::unordered_set* GetPotentialStopNodes() { - return &potential_stop_nodes; + return &potential_stop_nodes_; } std::unordered_set* GetPotentialStartupNodes() { - return &potential_startup_nodes; + return &potential_startup_nodes_; } void Clear() { - no_grad_var_nodes_inputmeta_map.clear(); - input_target_nodes_inputmeta_map.clear(); - potential_startup_nodes.clear(); - potential_stop_nodes.clear(); - depending_nodes.clear(); - results_map.clear(); + no_grad_var_nodes_inputmeta_map_.clear(); + input_target_nodes_inputmeta_map_.clear(); + potential_startup_nodes_.clear(); + potential_stop_nodes_.clear(); + depending_nodes_.clear(); + results_map_.clear(); copied_grad_nodes_.clear(); orig_to_copied_node_mapping_.clear(); } @@ -426,18 +428,18 @@ class GeneralGrad { static GeneralGrad* general_grad_; // no_grad_vars's GradNode and GradNode's InputMeta. std::unordered_map - no_grad_var_nodes_inputmeta_map; + no_grad_var_nodes_inputmeta_map_; // inputs's GradNode and GradNode's InputMeta. std::unordered_map - input_target_nodes_inputmeta_map; + input_target_nodes_inputmeta_map_; // Record all the potential startup_nodes, will be changed. - std::unordered_set potential_startup_nodes; + std::unordered_set potential_startup_nodes_; // Record all the potential stop nodes, will be changed. 
- std::unordered_set potential_stop_nodes; + std::unordered_set potential_stop_nodes_; std::unordered_map /* pre nodes */> - depending_nodes; - std::unordered_map results_map; + depending_nodes_; + std::unordered_map results_map_; std::vector> copied_grad_nodes_; std::unordered_map> @@ -619,7 +621,7 @@ std::vector RunBackward( // GradTensorHolder will initialize another tensor with same tensortype, // datatype and dims but filled with 1.0 node_input_buffers_dict[grad_node]->CopyValueFromTensor( - input_info.first, input_info.second, tensor, true /*fill_one=true*/); + input_info.first, input_info.second, tensor, /*fill_one=*/true); } // Prepare queue, potential startup_nodes @@ -657,7 +659,7 @@ std::vector RunBackward( VLOG(6) << "Running GradNode:" << node->name(); paddle::platform::RecordEvent node_record_event( - std::string((*node).name()) + " grad_node", + std::string((*node).name()), paddle::platform::TracerEventType::Operator, 1); if (queue.size() > 1 && node_in_degree_map[node] != 0) { @@ -667,14 +669,15 @@ std::vector RunBackward( queue.pop(); // Run node: This is where Hook happens - PADDLE_ENFORCE( - node_input_buffers_dict.count(node), + auto node_input_buffer_iter = node_input_buffers_dict.find(node); + PADDLE_ENFORCE_NE( + node_input_buffer_iter, node_input_buffers_dict.end(), paddle::platform::errors::Fatal( "Unable to find next node in the GradTensorHolder \n" "Trying to run Node without configuring its GradTensorHolder.")); std::unique_ptr node_input_buffer = - std::move(node_input_buffers_dict[node]); + std::move(node_input_buffer_iter->second); // Set input target grad_var from node_input_buffer by inputmeta if (!inputs.empty() && is_general_grad) { @@ -695,8 +698,6 @@ std::vector RunBackward( } } - VLOG(6) << "Running GradNode:" << node->name(); - // Check input EnforceGradNodeHasInput(node); @@ -715,8 +716,7 @@ std::vector RunBackward( } // TODO(jiabin): Should we erase it or find a more efficient way. 
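The RunBackward hunk above folds a count() check, an operator[] access and an erase(key) into a single find(): the iterator is validated with PADDLE_ENFORCE_NE, its mapped GradTensorHolder is moved out, and the entry is later erased through that same iterator, so the map is hashed once instead of three times. A generic sketch of the pattern, assuming an owning-pointer map:

    #include <cassert>
    #include <memory>
    #include <unordered_map>

    template <typename Key, typename Value>
    std::unique_ptr<Value> TakeAndErase(
        std::unordered_map<Key, std::unique_ptr<Value>>* dict, const Key& key) {
      auto iter = dict->find(key);  // single hash lookup
      assert(iter != dict->end());  // stands in for the PADDLE_ENFORCE_NE check
      std::unique_ptr<Value> value = std::move(iter->second);
      dict->erase(iter);            // erase by iterator, no second lookup
      return value;
    }
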
- - node_input_buffers_dict.erase(node); + node_input_buffers_dict.erase(node_input_buffer_iter); // Prepare GradTensorHolder for next node const paddle::small_vector, kSlotSmallVectorSize>& @@ -736,8 +736,7 @@ std::vector RunBackward( } auto edge_rank = edge.GetEdgeRankInfo(); // Since we make edge has as same rank as bwd outputs, we indexing them - // with - // the same rank(i, j) + // with the same rank(i, j) auto next_node_shared = edge.GetMutableGradNode(); // Next node could be nullptr if it is leaf tensor with no diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 2bb86a86e8348..abdd8cadeed4c 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -15,10 +15,151 @@ #include "paddle/fluid/eager/custom_operator/custom_operator_node.h" #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/core/dense_tensor.h" namespace egr { + +static void ConstructFwdAndBwdMap( + const std::vector& vec_map, + const std::string& op_type) { + auto& in_out_map = egr::Controller::Instance().GetCustomEdgesSlotMap(); + if (in_out_map.find(op_type) != in_out_map.end()) { + if (in_out_map[op_type].size() == 2) { + VLOG(7) << "Find Exist CustomEdgesSlotMap Skip >>>> "; + return; + } + } + + VLOG(7) << "Construct DoubleGrad's CustomEdgesSlotMap "; + auto inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[1]); + auto outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[1]); + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); + auto grad_outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]); + auto grad_inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]); + auto grad_attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[2]); + std::vector> res(5); + in_out_map[op_type].push_back(res); + // Prepare pos map for grad_outputs + VLOG(7) << "Prepare pos map for grad_outputs"; + PADDLE_ENFORCE_LE( + grad_outputs_names.size(), inputs_names.size(), + paddle::platform::errors::InvalidArgument( + "Grad outputs num should be less equal than forward inputs num.")); + for (size_t i = 0; i < grad_outputs_names.size(); i++) { + auto end = grad_outputs_names[i].find("@GRAD@GRAD"); + if (end != std::string::npos) { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end + 5) == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " inputs: " << inputs_names[j] + << " related to No." << i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][1][0][j] = i; + } + } + } else { + size_t end_n = grad_outputs_names[i].find("@GRAD@NEW"); + if (end_n != std::string::npos) { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end_n) == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " inputs: " << inputs_names[j] + << " related to No." 
<< i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][1][0][j] = i; + } + } + } else { + size_t end_one_grad = grad_outputs_names[i].find("@GRAD"); + if (end_one_grad != std::string::npos) { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end_one_grad) == + inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " inputs: " << inputs_names[j] + << " related to No." << i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][1][0][j] = i; + } + } + } else { + PADDLE_THROW(paddle::platform::errors::NotFound( + "All Grad outputs should be end of @GRAD@GRAD or @GRAD@NEW or " + "@GRAD and we got %s is not one of them, " + "please check your op and change to fit the rule.", + grad_outputs_names[i])); + } + } + } + } + // Prepare pos map for grad_inputs + for (size_t i = 0; i < grad_inputs_names.size(); i++) { + size_t end = grad_inputs_names[i].find("@GRAD@GRAD"); + if (end != std::string::npos) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i].substr(0, end + 5) == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " outputs: " << outputs_names[j] + << " related to No." << i + << " grad_inputs's grad: " << grad_inputs_names[i]; + in_out_map[op_type][1][1][j] = i; + } + } + } else { + if (std::find(outputs_names.begin(), outputs_names.end(), + grad_inputs_names[i]) != outputs_names.end()) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i] == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " outputs: " << outputs_names[j] + << " related to No." << i + << " grad_inputs fwd outputs: " << grad_inputs_names[i]; + in_out_map[op_type][1][2][j] = i; + } + } + } else { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_inputs_names[i] == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " inputs: " << inputs_names[j] + << " related to No." << i + << " grad_inputs fwd inputs: " << grad_inputs_names[i]; + in_out_map[op_type][1][3][j] = i; + } + } + } + } + } + + // Prepare pos map for grad attrs_ + for (size_t i = 0; i < grad_attrs_names.size(); i++) { + auto end = + std::find(attrs_names.begin(), attrs_names.end(), grad_attrs_names[i]); + PADDLE_ENFORCE_NE(end, attrs_names.end(), + paddle::platform::errors::NotFound( + "All Grad attrs should be one of forward attrs and " + "we got %s is not one of them, please check your " + "op and change to fit the rule.", + grad_attrs_names[i])); + for (size_t j = 0; j < attrs_names.size(); j++) { + if (grad_attrs_names[i] == attrs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " attrs: " << attrs_names[j] + << " related to No." 
<< i + << " grad_attrs: " << grad_attrs_names[i]; + in_out_map[op_type][1][4][j] = i; + } + } + } +} + paddle::small_vector, kSlotSmallVectorSize> RunCustomOpNode::operator()( @@ -38,10 +179,11 @@ RunCustomOpNode::operator()( tmp_ins(grad_inputs_name.size()); VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); - for (size_t i = 0; i < grads.size(); i++) { - if (map[1].find(i) != map[1].end()) { - VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; - tmp_ins[map[1][i]] = grads[i]; + auto hooked_grads = ApplyGradientHooks(grads); + for (size_t i = 0; i < hooked_grads.size(); i++) { + if (map[0][1].find(i) != map[0][1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[0][1][i]; + tmp_ins[map[0][1][i]] = hooked_grads[i]; } } @@ -69,28 +211,218 @@ RunCustomOpNode::operator()( tmp_outs(grad_outputs_names.size()); VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); for (size_t i = 0; i < OutputMeta().size(); i++) { - if (map[0].find(i) != map[0].end()) { + if (map[0][0].find(i) != map[0][0].end()) { VLOG(7) << "Insert grad outputs: " << i << " with size: " << OutputMeta()[i].size() - << " to tmp_outputs: " << map[0][i]; + << " to tmp_outputs: " << map[0][0][i]; for (size_t j = 0; j < OutputMeta()[i].size(); j++) { outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ std::make_shared( phi::DataType::UNDEFINED), egr::Controller::Instance().GenerateUniqueName( "custom_tmp_grad")); + egr::EagerUtils::autograd_meta(&(outs[i][j])); } - tmp_outs[map[0][i]] = outs[i]; + tmp_outs[map[0][0][i]] = outs[i]; } } for (size_t i = 0; i < tmp_outs.size(); i++) { VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); ctx.EmplaceBackOutputs(tmp_outs[i]); } - VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad"; (*paddle::framework::OpMetaInfoHelper::GetKernelFn( kernel_map.at(op_type_)[1]))(&ctx); + + VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; + std::vector> ins_auto_grad_metas; + std::vector> outs_auto_grad_metas; + VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); + ins_auto_grad_metas.resize(ctx.InputRange().size()); + VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); + outs_auto_grad_metas.resize(ctx.OutputRange().size()); + + for (size_t i = 0; i < ctx.InputRange().size(); i++) { + ins_auto_grad_metas[i] = + egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( + ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); + } + + for (size_t i = 0; i < ctx.OutputRange().size(); i++) { + outs_auto_grad_metas[i] = + egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + } + bool require_any_grad = false; + bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + require_any_grad = + require_any_grad || egr::EagerUtils::ComputeRequireGrad( + trace_backward, &(ins_auto_grad_metas[i])); + } + + if (require_any_grad) { + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + const auto& vec_map = meta_info_map.at(op_type_); + paddle::platform::RecordEvent node_creation_record_event( + "Custom Op " + op_type_ + " double_grad node_creation", + paddle::platform::TracerEventType::OperatorInner, 1); + VLOG(6) << " Construct Grad for Custom Op: 
" << op_type_; + ConstructFwdAndBwdMap(vec_map, op_type_); + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + } + auto grad_node = std::make_shared( + outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type_); + + auto slot_map = + egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + // Prepare Grad outputs + size_t no_grad_cnt = 0; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + const std::vector& in_tensors = + ctx.InputsBetween(ctx.InputRangeAt(i).first, + ctx.InputRangeAt(i).second); + + if (slot_map[1][0].find(i) != slot_map[1][0].end()) { + grad_node->SetGradOutMeta(in_tensors, slot_map[1][0][i]); + } else { + grad_node->SetGradOutMeta(in_tensors, + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + no_grad_cnt++; + } + } + + // Prepare Grad inputs with grad of fwd outputs + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + const std::vector& out_tensors = + ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, + ctx.OutputRangeAt(i).second); + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); + egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); + grad_node->SetGradInMeta(out_tensors, i); + egr::EagerUtils::CheckAndRetainGrad(out_tensors); + } + + // Prepare Grad inputs with fwd outputs + for (auto it = slot_map[1][2].begin(); it != slot_map[1][2].end(); it++) { + VLOG(7) << "Prepare fwd_outs: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_outs[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first, + ctx.OutputRangeAt(it->first).second)); + } + + // Prepare Grad inputs with fwd inputs + for (auto it = slot_map[1][3].begin(); it != slot_map[1][3].end(); it++) { + VLOG(7) << "Prepare fwd_ins: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_ins[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.InputsBetween(ctx.InputRangeAt(it->first).first, + ctx.InputRangeAt(it->first).second)); + } + + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type_)[2]); + std::vector attrs(attrs_names.size()); + // Prepare attrs for Grad node + for (auto it = slot_map[1][4].begin(); it != slot_map[1][4].end(); it++) { + VLOG(7) << "Prepare fwd attrs: " << it->first + << " to grad_attrs: " << it->second; + attrs[it->second] = attrs_[it->first]; + } + grad_node->SetAttrs(attrs); + } + + return outs; +} + +paddle::small_vector, + kSlotSmallVectorSize> +RunCustomOpDoubleGradNode::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, + bool create_graph, bool is_new_grad) { // NOLINT + paddle::CustomOpKernelContext ctx; + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + const auto& vec_map = meta_info_map.at(op_type_); + auto grad_inputs_name = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]); + auto grad_outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + paddle::small_vector, + kSlotSmallVectorSize> + tmp_ins(grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + + auto hooked_grads = ApplyGradientHooks(grads); + + for (size_t i = 0; i < 
hooked_grads.size(); i++) { + if (map[1][1].find(i) != map[1][1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][1][i]; + tmp_ins[map[1][1][i]] = hooked_grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + paddle::small_vector, + kSlotSmallVectorSize> + outs(OutputMeta().size()); + paddle::small_vector, + kSlotSmallVectorSize> + tmp_outs(grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); + + for (const auto& name : grad_outputs_names) { + VLOG(6) << "Prepare Grad outputs name is: " << name; + } + + for (size_t i = 0; i < OutputMeta().size(); i++) { + if (map[1][0].find(i) != map[1][0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << OutputMeta()[i].size() + << " to tmp_outputs: " << map[1][0][i]; + for (size_t j = 0; j < OutputMeta()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[1][0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << name(); + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[2]))(&ctx); + return outs; } } // namespace egr diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index 4801088e51ba5..feea23730676e 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -67,7 +67,11 @@ class RunCustomOpNode : public GradNodeBase { return res; } - void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + void ClearTensorWrappers() override { + fwd_outs.clear(); + fwd_ins.clear(); + grads2grad_in_map.clear(); + } void SetAttrs(const std::vector& attr) { attrs_ = attr; } @@ -87,4 +91,75 @@ class RunCustomOpNode : public GradNodeBase { std::string op_type_{""}; }; +class RunCustomOpDoubleGradNode : public GradNodeBase { + public: + // Constructor: configure fwd input tensors to grad node + explicit RunCustomOpDoubleGradNode(size_t bwd_in_slot_num, + size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpDoubleGradNode for op: " << op_type; + } + + ~RunCustomOpDoubleGradNode() override { + VLOG(6) << "Destruct RunCustomOpDoubleGradNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()( // NOLINT + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) // NOLINT + override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpDoubleGradNode: %s_grad_grad", + op_type_); + } 
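For context on the slot maps this node consumes: index 2 of the OpMetaInfo vector is the user-registered double-grad kernel, and ConstructFwdAndBwdMap above matches its @GRAD@GRAD, @GRAD@NEW and @GRAD names back to the first-order grad op's slots. A rough sketch of the kind of registration that produces those names, assuming the public custom-operator extension API and its double-grad macro; the op name and trivial kernels are illustrative only:

    #include <vector>
    #include "paddle/extension.h"

    // Identity op: grad and double grad just pass gradients through.
    std::vector<paddle::Tensor> IdForward(const paddle::Tensor& x) { return {x}; }
    std::vector<paddle::Tensor> IdBackward(const paddle::Tensor& grad_out) {
      return {grad_out};
    }
    std::vector<paddle::Tensor> IdDoubleBackward(const paddle::Tensor& ddx) {
      return {ddx};
    }

    PD_BUILD_OP(custom_identity)
        .Inputs({"X"})
        .Outputs({"Out"})
        .SetKernelFn(PD_KERNEL(IdForward));

    PD_BUILD_GRAD_OP(custom_identity)
        .Inputs({paddle::Grad("Out")})                 // "Out@GRAD"
        .Outputs({paddle::Grad("X")})                  // "X@GRAD"
        .SetKernelFn(PD_KERNEL(IdBackward));

    PD_BUILD_DOUBLE_GRAD_OP(custom_identity)
        .Inputs({paddle::Grad(paddle::Grad("X"))})     // "X@GRAD@GRAD"
        .Outputs({paddle::Grad(paddle::Grad("Out"))})  // "Out@GRAD@GRAD"
        .SetKernelFn(PD_KERNEL(IdDoubleBackward));
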
+ + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover()); + } + return res; + } + + void ClearTensorWrappers() override { + fwd_outs.clear(); + fwd_ins.clear(); + grads2grad_in_map.clear(); + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + std::shared_ptr Copy() const override { + auto copied_node = std::shared_ptr( + new RunCustomOpDoubleGradNode(*this)); + return copied_node; + } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + } // namespace egr diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index b11acae566d74..dd9881fcd5f0f 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -21,24 +21,176 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/compat/convert_utils.h" + +namespace egr { + +/** + * VariableCompatTensor class is used by Eager mode for now. It's painful to + * do this in Eager Mode, the better choice is to design the special Tensor + * directly in phi and use it in paddle::experimental::Tensor. + * However, we have some special operators, and they use special input variable + * type, such as vector, unordered_map, these type cannot + * cover by DenseTensor or SparseTensor. So, we have to provide a compatible + * Tensor type like variable to support these special input type. We should + * remove this as soon as we finish the ResourceTensor in phi. + * + * Note: Keep this class as clean as possible. + * This class should only support method declared in framework::Variable and + * necessary overridden methods. + * + * Note: This class is only used to support types that cannot be supported by + * the phi Tensor system temporarily. You CANNOT use this class to handle types + * such as DenseTensor, SelectedRows, etc. + **/ +class VariableCompatTensor + : public phi::TensorBase, + public phi::TypeInfoTraits { + public: + template + const T& Get() const { + static_assert( + paddle::framework::IsRegisteredVarType(), + "Not registered type. 
Please register T inside var_type_traits.h"); + PADDLE_ENFORCE_NOT_NULL(holder_, paddle::platform::errors::NotFound( + "Variable is not initialized.")); + PADDLE_ENFORCE_EQ( + holder_->Type(), paddle::framework::VarTypeTrait::kId, + paddle::platform::errors::InvalidArgument( + "The Variable type must be %s, but the type it holds is %s.", + paddle::framework::ToTypeName( + paddle::framework::VarTypeTrait::kId), + paddle::framework::ToTypeName(holder_->Type()))); + return *static_cast(holder_->Ptr()); + } + + bool IsInitialized() const { return holder_ != nullptr; } + + template + T* GetMutable() { + if (!holder_) { + holder_.reset(new PlaceholderImpl()); + } else { + PADDLE_ENFORCE_EQ( + holder_->Type(), paddle::framework::VarTypeTrait::kId, + paddle::platform::errors::InvalidArgument( + "The Variable type must be %s, but the type it holds is %s.", + paddle::framework::ToTypeName( + paddle::framework::VarTypeTrait::kId), + paddle::framework::ToTypeName(holder_->Type()))); + } + return static_cast(holder_->Ptr()); + } + + template + bool IsType() const { + return holder_ && + holder_->Type() == paddle::framework::VarTypeTrait::kId; + } + + void Clear() { holder_.reset(); } + + int Type() const { + PADDLE_ENFORCE_NOT_NULL(holder_, paddle::platform::errors::NotFound( + "Variable is not initialized.")); + return holder_->Type(); + } + + // necessary overridden methods + + static const char* name() { return "VariableCompatTensor"; } + + ~VariableCompatTensor() override = default; + + int64_t numel() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `numel` method.")); + } + + const phi::DDim& dims() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `dims` method.")); + } + + phi::DataType dtype() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `dtype` method.")); + } + + phi::DataLayout layout() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `layout` method.")); + } + + const phi::Place& place() const override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `place` method.")); + } + + bool valid() const override { return IsInitialized(); } + + bool initialized() const override { return IsInitialized(); } + + void* AllocateFrom(phi::Allocator* allocator, phi::DataType dtype, + size_t requested_size = 0) override { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor does not support `AllocateFrom` method.")); + } + + private: + struct Placeholder { + virtual ~Placeholder() PADDLE_MAY_THROW {} + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + + void* ptr_; + int type_; + }; + + // Placeholder hides type T, so it doesn't appear as a template + // parameter of Variable. + template + struct PlaceholderImpl : public Placeholder { + static_assert( + paddle::framework::IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() { + this->Init(&obj_, paddle::framework::VarTypeTrait::kId); + } + + private: + T obj_; + }; + + // pointers to a PlaceholderImpl object indeed. 
+ std::shared_ptr holder_; +}; + +inline bool IsVariableCompatTensor(const paddle::experimental::Tensor& tensor) { + return VariableCompatTensor::classof(tensor.impl().get()); +} + /** * This class is used by Eager mode for now. It's painful to do this in Eager - * Mode, the better - * choice is to use paddle::experimental::Tensor directly. However, we have a - * punch of nested kernel code, and - * they use paddle::framework::Variable in inner logic code. So, we have to - * provide variable in - * paddle::framework::ExecutionContext to support it. We should remove this as - * soon as we finish our latest - * Phi Lib, and use paddle::experimental::Tensor instead. + * Mode, the better choice is to use paddle::experimental::Tensor directly. + * However, we have a punch of nested kernel code, and they use + * paddle::framework::Variable in inner logic code. So, we have to provide + * variable in paddle::framework::ExecutionContext to support it. We should + * remove this as soon as we finish our latest Phi Lib, and use + * paddle::experimental::Tensor instead. * * Note: Keep this class as clean as possible. * This class should only support method declared in * paddle::experimental::Tensor with access method of * paddle::framework::Variable no more members are acceptable. * **/ - -namespace egr { class EagerVariable final { public: /* Default constructor and name constructor should only be used for contruct @@ -54,6 +206,14 @@ class EagerVariable final { ConstructVariableFromTensor(tensor); } else if (tensor.is_selected_rows()) { ConstructVariableFromTensor(tensor); + } else if (IsVariableCompatTensor(tensor) && + static_cast(tensor.impl().get()) + ->IsType()) { + ConstructVariableFromCompatTensor(tensor); + } else if (IsVariableCompatTensor(tensor) && + static_cast(tensor.impl().get()) + ->IsType()) { + ConstructVariableFromCompatTensor(tensor); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unrecognized egr::EagerVariable type, only " @@ -119,6 +279,22 @@ class EagerVariable final { *framework_tensor = *tensor_dense; } + template + void ConstructVariableFromCompatTensor( + const paddle::experimental::Tensor& tensor) { + auto* framework_holder = var_.GetMutable(); + // Contruct framework::Tensor from egr::EagerVariable + auto* compat_tensor = + static_cast(tensor.impl().get()); + PADDLE_ENFORCE_NOT_NULL(compat_tensor, + paddle::platform::errors::Fatal( + "Tensor %s holds empty impl, this should not " + "happend since we should " + "treat all kinds of tensor as what they are.", + tensor.name())); + *framework_holder = compat_tensor->Get(); + } + private: std::string name_{""}; paddle::framework::Variable var_; diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 610b177829e2f..af387bb3238d1 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -36,6 +36,31 @@ **/ namespace egr { +static void CheckTensor(const paddle::experimental::Tensor& pre, + const paddle::experimental::Tensor& post) { + if (!pre.initialized() && post.initialized()) { + PADDLE_THROW(paddle::platform::errors::PermissionDenied( + "The tensor in before and after hook are not consistent")); + } + if (pre.initialized() && post.initialized()) { + VLOG(4) << paddle::framework::DataType2String(pre.dtype()) << " " + << paddle::framework::DataType2String(post.dtype()); + PADDLE_ENFORCE_EQ( + pre.dtype(), post.dtype(), + paddle::platform::errors::PermissionDenied( + "The dtype of tensor before(%s) and after(%s) hook are not " + "consistent", + 
paddle::framework::DataType2String(pre.dtype()), + paddle::framework::DataType2String(post.dtype()))); + PADDLE_ENFORCE_EQ( + pre.place(), post.place(), + paddle::platform::errors::PermissionDenied( + "The place of tensor before(%s) and after(%s) " + "hook are not consistent", + pre.place().DebugString(), post.place().DebugString())); + } +} + GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); @@ -193,6 +218,8 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, // Set Stop_gradient if (fwd_in_meta) { meta.SetStopGradient(fwd_in_meta->StopGradient()); + } else { + meta.SetStopGradient(true); } // Set Adj Edges if (fwd_in_meta && !fwd_in_meta->StopGradient()) { @@ -271,7 +298,7 @@ void GradNodeBase::SetGradOutMeta( // Only Copy Meta phi::DenseTensor* dense_tensor = static_cast(fwd_in_tensor.impl().get()); - PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + PADDLE_ENFORCE_NE(dense_tensor->dtype(), phi::DataType::UNDEFINED, paddle::platform::errors::Fatal( "Attempting to copy DenseTensorMeta " "with phi::DataType::UNDEFINED," diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 6fdee203c196c..747e98b846616 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -30,32 +30,23 @@ namespace egr { * The GradNodeBase will be held in autograd_meta, and it is also a member of * Edge, which indicates the edge of backward graph. * - * TODO:(yangzhanlue) GradNodeBase will also in charge of get the correct input + * TODO(yangzhanlue): GradNodeBase will also in charge of get the correct input * from GradOpDescMaker to GradNodeBase. * - * NOTE:GradNodeBase has a method named run, this method should be overrided by - * the - * specific derived class, it will prepare backward inputs and double backward's - * depends. Then, it will call C++ API of backward kernel functions to finish - * backward computation. + * NOTE: GradNodeBase has a method named run, this method should be overrided by + * the specific derived class, it will prepare backward inputs and double + * backward's depends. Then, it will call C++ API of backward kernel functions + * to finish backward computation. * - * NOTE:GradNodeBase holds its own inputs and Outputs + * NOTE: GradNodeBase holds its own inputs and Outputs * * Edge is defined to descripe depend of backward, an Edge is what linked - * between two - * node, it should contain a Node and rank of this Node (this is used to - * indicate which - * input of grad this edge belong). - * */ + * between two node, it should contain a Node and rank of this Node (this is + * used to indicate which input of grad this edge belong). + **/ class AutogradMeta; class GradNodeBase; -/** - * GradSlotMeta is used to Record Forward Tensor info to backward, since paddle - * has lots of operators - * whose backward logic is depends on if it has some specific inputs or outputs. - * So, we need a meta info - * to record it's needs. - * **/ + class Edge { public: // Default constructor for Edges in order to construct it for AutogradMeta @@ -64,8 +55,7 @@ class Edge { // In real use cases we should create Edge from grad node and input rank which // indicate which edge it is. // Since we have slot design in operators we will have to locate an edge with - // slot - // and rank. + // slot and rank. 
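To make the slot/rank wording above concrete: an Edge bundles the downstream GradNodeBase with the (in_slot_id, in_rank) coordinates of the input it feeds, which is exactly what the constructor shown next takes. A hedged sketch; the wrapper function and the hard-coded slot 0 are illustrative only:

    #include <memory>
    #include "paddle/fluid/eager/grad_node_info.h"

    // `node` would be some concrete GradNodeBase subclass produced by the code
    // generator; the Edge itself only records where its gradient should land.
    egr::Edge ConnectToSlotZero(const std::shared_ptr<egr::GradNodeBase>& node) {
      return egr::Edge(node, /*in_slot_id=*/0, /*in_rank=*/0);
    }
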
Edge(const std::shared_ptr& grad_node, size_t in_slot_id, size_t in_rank) : in_slot_id_(in_slot_id), in_rank_(in_rank), grad_node_(grad_node) {} @@ -120,6 +110,12 @@ class Edge { size_t in_rank_; std::shared_ptr grad_node_{nullptr}; }; + +/** + * GradSlotMeta is used to Record Forward Tensor info to backward, since paddle + * has lots of operators whose backward logic is depends on if it has some + * specific inputs or outputs. So, we need a meta info to record it's needs. + **/ class GradSlotMeta { public: GradSlotMeta() = default; @@ -171,16 +167,13 @@ class GradNodeBase { /** * operator() designed to contian the real backward execution logic, it should - * be - * overrided by derived class defined for each operator. It accepts a vector - * of - * Tensor which contains grads input of current operator + * be overrided by derived class defined for each operator. It accepts a + * vector of Tensor which contains grads input of current operator * * Note: why we need backward inputs and outputs construct as vector of vector * of paddle::experimental::Tensor? * Since all of paddle op composite in form of {"Slot name ", vector}, - * so, vector of vector - * is better choice to fit this format. + * so, vector of vector is better choice to fit this format. * **/ virtual paddle::small_vector, kSlotSmallVectorSize> @@ -294,36 +287,12 @@ class GradNodeBase { /* slot id */ size_t, /* rank */ size_t, /* hook */ std::shared_ptr>> gradient_hooks_; + int64_t next_hook_id_{0}; // We handle complex to real conversion only if any complex GradIn is involved bool need_complex_to_real_ = false; - int64_t next_hook_id_{0}; + bool is_tensor_wrappers_cleared_ = false; }; -inline void CheckTensor(const paddle::experimental::Tensor& pre, - const paddle::experimental::Tensor& post) { - if (!pre.initialized() && post.initialized()) { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "The tensor in before and after hook are not consistent")); - } - if (pre.initialized() && post.initialized()) { - VLOG(4) << paddle::framework::DataType2String(pre.dtype()) << " " - << paddle::framework::DataType2String(post.dtype()); - PADDLE_ENFORCE_EQ( - pre.dtype(), post.dtype(), - paddle::platform::errors::PermissionDenied( - "The dtype of tensor before(%s) and after(%s) hook are not " - "consistent", - paddle::framework::DataType2String(pre.dtype()), - paddle::framework::DataType2String(post.dtype()))); - PADDLE_ENFORCE_EQ( - pre.place(), post.place(), - paddle::platform::errors::PermissionDenied( - "The place of tensor before(%s) and after(%s) " - "hook are not consistent", - pre.place().DebugString(), post.place().DebugString())); - } -} - } // namespace egr diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc new file mode 100644 index 0000000000000..d1c5983a3702f --- /dev/null +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
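The new nan_inf_utils translation unit that follows presumably backs the check_nan_inf_str slot added to the generated grad functions earlier in this patch: each overload fans out over tuples or vectors of tensors and checks the underlying DenseTensor (or SelectedRows value) for NaN/Inf. A small usage sketch against the declarations added in nan_inf_utils.h; the api-name string and wrapper function are arbitrary:

    #include <vector>
    #include "paddle/fluid/eager/nan_inf_utils.h"

    // Check a single output tensor and a vector of grads after some eager call.
    void DebugCheck(const paddle::experimental::Tensor& out,
                    const std::vector<paddle::experimental::Tensor>& grads) {
      egr::CheckTensorHasNanOrInf("my_api", out);    // single-tensor overload
      egr::CheckTensorHasNanOrInf("my_api", grads);  // loops the check per tensor
    }
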
+ +#include "paddle/fluid/eager/nan_inf_utils.h" + +#include "paddle/fluid/framework/details/nan_inf_utils_detail.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace egr { + +void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { + if (tensor.initialized()) { + auto& tensor_name = tensor.name(); + const phi::DenseTensor* dense_tensor{nullptr}; + if (tensor.is_dense_tensor()) { + dense_tensor = static_cast(tensor.impl().get()); + } else if (tensor.is_selected_rows()) { + dense_tensor = &( + static_cast(tensor.impl().get())->value()); + } else { + VLOG(10) << "Only DenseTensor or SelectedRows need to check, " + << tensor_name << " is no need."; + return; + } + + auto& place = dense_tensor->place(); + if (paddle::platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::framework::details::tensor_check< + paddle::platform::CUDADeviceContext>(api_name, tensor_name, + *dense_tensor, place); +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.", + tensor_name)); +#endif + return; + } + paddle::framework::details::tensor_check< + paddle::platform::CPUDeviceContext>(api_name, tensor_name, + *dense_tensor, place); + } +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTwoTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfThreeTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfFourTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<3>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfFiveTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<3>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<4>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfSixTensors& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<3>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<4>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<5>(tensors)); +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const std::vector& tensors) { + for (auto& tensor : tensors) { + CheckTensorHasNanOrInf(api_name, tensor); + } +} + +void CheckTensorHasNanOrInf( + const std::string& api_name, + const paddle::small_vector, + egr::kSlotSmallVectorSize>& tensors) { + for (auto& tensor_vector : tensors) { + CheckTensorHasNanOrInf(api_name, tensor_vector); + } +} + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTensorAndVector& tensors) { + CheckTensorHasNanOrInf(api_name, 
std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); +} + +} // namespace egr diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h new file mode 100644 index 0000000000000..a411504fa4900 --- /dev/null +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/eager/type_defs.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/utils/small_vector.h" + +namespace egr { + +using paddle::experimental::Tensor; +using TupleOfTwoTensors = std::tuple; +using TupleOfThreeTensors = std::tuple; +using TupleOfFourTensors = std::tuple; +using TupleOfFiveTensors = std::tuple; +using TupleOfSixTensors = + std::tuple; +using TupleOfTensorAndVector = std::tuple>; + +void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTwoTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfThreeTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfFourTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfFiveTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfSixTensors& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const std::vector& tensors); + +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTensorAndVector& tensors); + +void CheckTensorHasNanOrInf( + const std::string& api_name, + const paddle::small_vector, + egr::kSlotSmallVectorSize>& tensors); + +} // namespace egr diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc index fad4fd50a5e3e..a00b292fe0915 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.cc +++ b/paddle/fluid/eager/pylayer/py_layer_node.cc @@ -106,8 +106,6 @@ GradNodePyLayer::operator()( pybind11::detail::error_string().c_str())); } - outputs_ = outputs; - VLOG(6) << "PyLayer backward function finish..."; PyObject* outputs_tuple = nullptr; @@ -165,6 +163,9 @@ GradNodePyLayer::operator()( if (!PyTuple_Check(outputs)) { Py_XDECREF(outputs_tuple); } + Py_XDECREF(outputs); + Py_XDECREF(ctx_); + ctx_ = nullptr; return grad_out; } diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index b477d7a9ad996..c1a8c6e626b4f 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -32,10 +32,7 @@ class GradNodePyLayer : public GradNodeBase { ctx_ = ctx; } - ~GradNodePyLayer() override { - Py_DECREF(ctx_); - Py_XDECREF(outputs_); - }; + ~GradNodePyLayer() override { Py_XDECREF(ctx_); }; virtual paddle::small_vector, kSlotSmallVectorSize> @@ -50,9 +47,6 @@ class GradNodePyLayer : public 
GradNodeBase { return "GradNodePyLayer_" + std::string(Py_TYPE(ctx_)->tp_name); } - // for paddle.grad get result - PyObject* GetMutableOutputs() { return outputs_; } - void SaveForwardOutputsMeta( const std::vector>& outputs_tensor) { @@ -81,7 +75,6 @@ class GradNodePyLayer : public GradNodeBase { private: PyObject* ctx_{nullptr}; - PyObject* outputs_{nullptr}; std::vector> forward_outputs_meta_; std::vector> forward_outputs_place_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index f13fcfa990057..495f7f2e42c59 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -34,7 +34,6 @@ class TensorWrapper { public: TensorWrapper() = default; explicit TensorWrapper(const paddle::experimental::Tensor& tensor, - bool full_reserved = false, bool no_need_buffer = false) { // set inplace_version_snapshot_ according to tensor's current inplace // version. @@ -46,32 +45,12 @@ class TensorWrapper { } /** - * Normally, we should fully reserved all non-output or non-leaf fwd tensor - * here. And for fwd output tensor, we should not reserve its autogradmeta, - * to avoid recursive depends on GradNodeBase + * Normally, we should only save data and part of autograd_meta of fwd + * tensor, and should not reserve its original grad_node, + * to avoid recursive and additional depends on GradNodeBase * **/ - full_reserved_ = full_reserved; + auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); no_need_buffer_ = no_need_buffer; - if (full_reserved_) { - VLOG(6) << "Fully reserved tensor: " << tensor.name(); - intermidiate_tensor_ = tensor; - if (no_need_buffer_) { - if (phi::DenseTensor::classof(tensor.impl().get())) { - // Only Copy Meta - phi::DenseTensor* dense_tensor = - static_cast(tensor.impl().get()); - auto tw_dense_tensor = - std::make_shared(*dense_tensor); - tw_dense_tensor->clear(); - intermidiate_tensor_.set_impl(tw_dense_tensor); - } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Unrecognized tensor type for no_need_buffer feature")); - } - } - return; - } - // shallow copy tensor_impl here if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { @@ -88,10 +67,12 @@ class TensorWrapper { } else { intermidiate_tensor_.set_impl(tensor.impl()); } - // TODO(jiabin): This may has server performance issue - intermidiate_tensor_.set_name(tensor.name() + "@Saved"); - auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); + if (VLOG_IS_ON(7)) { + // TODO(jiabin): This may has server performance issue + intermidiate_tensor_.set_name(tensor.name() + "@Saved"); + } + if (tensor_autograd_meta) { auto autograd_meta = std::make_shared(*tensor_autograd_meta); @@ -111,31 +92,37 @@ class TensorWrapper { check_inplace_version(); - // if it's full_reserved just return the full copy of tensor - if (full_reserved_) { - return intermidiate_tensor_; + paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; + + std::shared_ptr new_grad_node = weak_grad_node_.lock(); + if (new_grad_node) { + VLOG(3) << "Recovered TensorWrapper with GradNode " + << new_grad_node->name() << " addr: " << new_grad_node.get(); } else { - paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; + VLOG(3) << "Recovered TensorWrapper with Empty GradNode"; + } + auto* intermediate_autograd_meta = + EagerUtils::nullable_autograd_meta(intermidiate_tensor_); - std::shared_ptr new_grad_node = weak_grad_node_.lock(); - if (new_grad_node) { - VLOG(3) << "Recovered TensorWrapper 
with GradNode " - << new_grad_node->name() << " addr: " << new_grad_node.get(); - } else { - VLOG(3) << "Recovered TensorWrapper with Empth GradNode"; - } - auto* intermediate_autograd_meta = - EagerUtils::unsafe_autograd_meta(intermidiate_tensor_); + if (intermediate_autograd_meta) { auto p_ab_autograd_meta = std::make_shared(*intermediate_autograd_meta); if (new_grad_node) { p_ab_autograd_meta->SetGradNode(new_grad_node); } recovered_tensor.set_autograd_meta(p_ab_autograd_meta); - return recovered_tensor; } + + return recovered_tensor; + } + + paddle::experimental::Tensor get_intermidiate_tensor() { + return intermidiate_tensor_; } + void clear() { intermidiate_tensor_.reset(); } + + private: void check_inplace_version() { if (no_need_buffer_) { VLOG(6) << "There's no need to check inplace_version because " @@ -170,10 +157,7 @@ class TensorWrapper { } } - void clear() { intermidiate_tensor_.reset(); } - private: - bool full_reserved_ = false; bool no_need_buffer_ = false; paddle::experimental::Tensor intermidiate_tensor_; std::weak_ptr weak_grad_node_; diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index a9a50a3621767..edbb441f27a08 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -233,3 +233,88 @@ TEST(EagerVariable, DataLayout) { layout = paddle::imperative::GetDataLayout(eager_var); CHECK_EQ(layout, paddle::experimental::DataLayout::NCHW); } + +TEST(VariableCompatTensor, MemberFunction) { + egr::VariableCompatTensor var_tensor; + // test GetMutable and Get + var_tensor.GetMutable(); + auto& vocab = var_tensor.Get(); + EXPECT_EQ(vocab.size(), 0UL); + bool caught_exception = false; + try { + var_tensor.GetMutable(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("The Variable type must be") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + // test Type and IsType + EXPECT_TRUE(var_tensor.IsType()); + EXPECT_EQ(var_tensor.Type(), + static_cast(paddle::framework::proto::VarType::VOCAB)); + // test valid and initialized + EXPECT_TRUE(var_tensor.IsInitialized()); + EXPECT_TRUE(var_tensor.valid()); + EXPECT_TRUE(var_tensor.initialized()); + // test name + EXPECT_EQ(var_tensor.name(), "VariableCompatTensor"); + // test other throw error methods + caught_exception = false; + try { + var_tensor.numel(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("numel") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.dims(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("dims") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.dtype(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("dtype") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.layout(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("layout") != std::string::npos); + } + 
EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.place(); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("place") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + caught_exception = false; + try { + var_tensor.AllocateFrom(nullptr, phi::DataType::UNDEFINED); + } catch (paddle::platform::EnforceNotMet& error) { + caught_exception = true; + std::string ex_msg = error.what(); + EXPECT_TRUE(ex_msg.find("AllocateFrom") != std::string::npos); + } + EXPECT_TRUE(caught_exception); + // test Clear + var_tensor.Clear(); + EXPECT_FALSE(var_tensor.IsInitialized()); +} diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc index 5f563edee39f1..28c3472f90d03 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc @@ -40,9 +40,11 @@ TEST(TensorWrapper, Basic) { auto auto_grad0 = std::make_shared(edge0); et1.set_autograd_meta(auto_grad0); et1.set_name("et1"); - auto tw0 = egr::TensorWrapper(et1, true); + auto tw0 = egr::TensorWrapper(et1); auto recover_et1 = tw0.recover(); - CHECK_EQ(recover_et1.name(), std::string("et1")); + if (VLOG_IS_ON(7)) { + CHECK_EQ(recover_et1.name(), std::string("et1@saved")); + } CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).first, egr::EagerUtils::OutRankInfo(et1).first); CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).second, @@ -68,13 +70,15 @@ TEST(TensorWrapper, Basic) { et2.set_autograd_meta(auto_grad1); auto tw1 = egr::TensorWrapper(et2, false); auto recover_et2 = tw1.recover(); - CHECK_EQ(recover_et2.name(), std::string("et2@Saved")); + if (VLOG_IS_ON(7)) { + CHECK_EQ(recover_et2.name(), std::string("et2@Saved")); + } CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).first, egr::EagerUtils::OutRankInfo(et2).first); CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).second, egr::EagerUtils::OutRankInfo(et2).second); // Test Raw recover paddle::experimental::Tensor et3; - auto tw2 = egr::TensorWrapper(et3, true); + auto tw2 = egr::TensorWrapper(et3); CHECK(tw2.recover().initialized() == false); } diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index 5a09ffd6a1e5f..719ef6673c07d 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -1,6 +1,7 @@ cc_test(test_egr_task_tensor_utils SRCS tensor_utils_test.cc DEPS ${eager_deps}) cc_test(test_egr_task_eager_utils SRCS eager_utils_test.cc DEPS ${eager_deps}) cc_test(test_egr_task_forward_autograd SRCS forward_autograd_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_nan_inf_utils SRCS nan_inf_utils_test.cc DEPS eager_nan_inf_utils) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node) diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 2d69380cf78d9..1f8fdb7de0c17 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -50,9 +50,7 @@ paddle::experimental::Tensor hook_function( auto place = t_dense->place(); size_t bytes_size = 
phi::product(t_dense->dims()) * SizeOf(t_dense->dtype()); auto ret_dense = std::make_shared( - phi::make_intrusive( - paddle::memory::Alloc(place, bytes_size)), - std::move(ret_meta)); + paddle::memory::Alloc(place, bytes_size), std::move(ret_meta)); float* t_ptr = t_dense->mutable_data(place); float* ret_ptr = ret_dense->mutable_data(place); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 855fe526c10c8..d7b887b28bde8 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -46,9 +46,7 @@ paddle::experimental::Tensor hook_function( auto place = t_dense->place(); size_t bytes_size = phi::product(t_dense->dims()) * SizeOf(t_dense->dtype()); auto ret_dense = std::make_shared( - phi::make_intrusive( - paddle::memory::Alloc(place, bytes_size)), - std::move(ret_meta)); + paddle::memory::Alloc(place, bytes_size), std::move(ret_meta)); float* t_ptr = t_dense->mutable_data(place); float* ret_ptr = ret_dense->mutable_data(place); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 8524be7800bfd..c4d4ff9110682 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -46,9 +46,7 @@ paddle::experimental::Tensor hook_function( auto place = t_dense->place(); size_t bytes_size = phi::product(t_dense->dims()) * SizeOf(t_dense->dtype()); auto ret_dense = std::make_shared( - phi::make_intrusive( - paddle::memory::Alloc(place, bytes_size)), - std::move(ret_meta)); + paddle::memory::Alloc(place, bytes_size), std::move(ret_meta)); float* t_ptr = t_dense->mutable_data(place); float* ret_ptr = ret_dense->mutable_data(place); diff --git a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc new file mode 100644 index 0000000000000..be0563fbeedb4 --- /dev/null +++ b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
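The hook_function hunks above (fwd_bwd_joint_test, hook_test, hook_test_intermidiate) all drop the phi::make_intrusive wrapper and hand the allocation returned by paddle::memory::Alloc directly to the phi::DenseTensor constructor. A minimal sketch of that simplified pattern follows; the dims, dtype, and helper name are illustrative assumptions, and only the constructor call mirrors the diff:

```cpp
// Sketch: build a DenseTensor from a raw allocation plus meta, as the updated
// hook_function helpers do. Shape and dtype here are made up for illustration.
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"

std::shared_ptr<phi::DenseTensor> MakeBuffer(const phi::Place& place) {
  phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::make_ddim({4, 4}));
  size_t bytes = 4 * 4 * sizeof(float);
  // The Allocation from paddle::memory::Alloc is passed straight to DenseTensor;
  // no phi::make_intrusive wrapper is needed anymore.
  return std::make_shared<phi::DenseTensor>(paddle::memory::Alloc(place, bytes),
                                            std::move(meta));
}
```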
+ +#include +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/strings_api.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(strings_empty, CPU, ALL_LAYOUT); + +namespace egr { + +#define CHECK_NAN_INF(tensors) \ + { \ + bool caught_exception = false; \ + try { \ + CheckTensorHasNanOrInf("nan_inf_test", tensors); \ + } catch (paddle::platform::EnforceNotMet & error) { \ + caught_exception = true; \ + std::string ex_msg = error.what(); \ + EXPECT_TRUE(ex_msg.find("There are `nan` or `inf` in tensor") != \ + std::string::npos); \ + } \ + EXPECT_TRUE(caught_exception); \ + } + +#define CHECK_NO_NAN_INF(tensors) \ + { \ + bool caught_exception = false; \ + try { \ + CheckTensorHasNanOrInf("nan_inf_test", tensors); \ + } catch (paddle::platform::EnforceNotMet & error) { \ + caught_exception = true; \ + std::string ex_msg = error.what(); \ + EXPECT_TRUE(ex_msg.find("There are `nan` or `inf` in tensor") != \ + std::string::npos); \ + } \ + EXPECT_FALSE(caught_exception); \ + } + +TEST(NanInfUtils, Functions) { + // test all methods + auto tensor = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + CHECK_NAN_INF(tensor); + auto tensor1 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto two_tensors = std::make_tuple(tensor, tensor1); + CHECK_NAN_INF(two_tensors); + auto tensor2 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto three_tensors = std::make_tuple(tensor, tensor1, tensor2); + CHECK_NAN_INF(three_tensors); + auto tensor3 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto four_tensors = std::make_tuple(tensor, tensor1, tensor2, tensor3); + CHECK_NAN_INF(four_tensors); + auto tensor4 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto five_tensors = + std::make_tuple(tensor, tensor1, tensor2, tensor3, tensor4); + CHECK_NAN_INF(five_tensors); + auto tensor5 = paddle::experimental::full( + {3, 4}, std::numeric_limits::quiet_NaN(), phi::DataType::FLOAT64); + auto six_tensors = + std::make_tuple(tensor, tensor1, tensor2, tensor3, tensor4, tensor5); + CHECK_NAN_INF(six_tensors); + std::vector tensor_vec; + tensor_vec.emplace_back(tensor); + tensor_vec.emplace_back(tensor1); + CHECK_NAN_INF(tensor_vec); + paddle::small_vector, + egr::kSlotSmallVectorSize> + small_vec; + small_vec.emplace_back(tensor_vec); + CHECK_NAN_INF(small_vec); + // test selected_rows + paddle::experimental::Tensor tensor_sr; + auto sr = std::make_shared(); + *sr->mutable_value() = + *(static_cast(tensor.impl().get())); + tensor_sr.set_impl(sr); + CHECK_NAN_INF(tensor_sr); + // test other tensor + auto tensor_str = paddle::experimental::strings::empty({3, 4}); + CHECK_NO_NAN_INF(tensor_str); +} + +} // namespace egr diff --git a/paddle/fluid/eager/type_defs.h b/paddle/fluid/eager/type_defs.h new file mode 100644 index 0000000000000..c57e718f1df3b --- /dev/null +++ b/paddle/fluid/eager/type_defs.h @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace egr { + +constexpr size_t kSlotSmallVectorSize = 15U; + +} // namespace egr diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 033af5c496c98..d22f4316d5604 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -157,7 +157,7 @@ void EagerUtils::SetHistory(std::vector* autograd_metas, if (autograd_meta->GradNode()) { VLOG(7) << "Should not set grad node twice, original node is:" << autograd_meta->GradNode()->name() - << "current is: " << grad_node->name(); + << " current is: " << grad_node->name(); } autograd_meta->SetGradNode(grad_node); } @@ -271,6 +271,33 @@ void EagerUtils::HandleViewBetweenInputAndOutput( } } +void EagerUtils::HandleViewBetweenInputAndOutput( + const paddle::experimental::Tensor& input_tensor, + paddle::experimental::Tensor* view_output_tensor) { + PADDLE_ENFORCE_EQ( + input_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", input_tensor.name())); + + if (input_tensor.is_dense_tensor()) { + auto input_dense_tensor = + std::dynamic_pointer_cast(input_tensor.impl()); + if (view_output_tensor->impl() == nullptr) { + view_output_tensor->set_impl(std::make_shared()); + } + auto view_output_dense_tensor = + std::dynamic_pointer_cast(view_output_tensor->impl()); + view_output_dense_tensor->ShareBufferWith(*input_dense_tensor); + view_output_dense_tensor->ShareInplaceVersionCounterWith( + *input_dense_tensor); + + VLOG(3) << "Perform View between Output Tensor(" + << view_output_tensor->name() << ") and Input Tensor(" + << input_tensor.name() + << "), share allocation and inplace version."; + } +} + std::vector EagerUtils::GetOutputs( const std::vector>& outs) { std::vector res; @@ -454,16 +481,48 @@ void EagerUtils::FillZeroForEmptyGradInputs( grad_in_meta.HasTensorMeta(), paddle::platform::errors::Fatal( "Unable to fill empty grad inputs due to empty GradSlotMeta")); - const auto& tensor_meta = grad_in_meta.GetTensorMeta(); - phi::Place place = grad_in_meta.GetPlace(); - auto tensor_with_zero = paddle::experimental::full( - phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, place); + phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, + grad_in_meta.GetPlace()); grad.set_impl(tensor_with_zero.impl()); } } } } +void EagerUtils::FillZeroForEmptyGradInput( + paddle::experimental::Tensor* in_grad, const GradSlotMeta& grad_in_meta) { + if (!in_grad->initialized()) { + PADDLE_ENFORCE( + grad_in_meta.HasTensorMeta(), + paddle::platform::errors::Fatal( + "Unable to fill empty grad inputs due to empty GradSlotMeta")); + const auto& tensor_meta = grad_in_meta.GetTensorMeta(); + auto tensor_with_zero = + paddle::experimental::full(phi::vectorize(tensor_meta.dims), 0.0, + tensor_meta.dtype, grad_in_meta.GetPlace()); + in_grad->set_impl(tensor_with_zero.impl()); + } +} + +void EagerUtils::FillZeroForEmptyOptionalGradInput( + paddle::experimental::Tensor* in_grad, const 
GradSlotMeta& grad_in_meta) { + if (!in_grad->initialized() && grad_in_meta.HasTensorMeta()) { + const auto& tensor_meta = grad_in_meta.GetTensorMeta(); + auto tensor_with_zero = + paddle::experimental::full(phi::vectorize(tensor_meta.dims), 0.0, + tensor_meta.dtype, grad_in_meta.GetPlace()); + in_grad->set_impl(tensor_with_zero.impl()); + } +} + +void EagerUtils::FillZeroForEmptyGradInput( + std::vector* in_grads, + const std::vector& grad_in_metas) { + for (size_t i = 0; i < in_grads->size(); i++) { + FillZeroForEmptyGradInput(&in_grads->at(i), grad_in_metas[i]); + } +} + } // namespace egr diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index ef2b1baac661b..7f5864ec887ca 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -172,6 +172,9 @@ class EagerUtils { static void HandleViewBetweenInputAndOutput( const std::shared_ptr& input_var, const std::shared_ptr& view_output_var); + static void HandleViewBetweenInputAndOutput( + const paddle::experimental::Tensor& input_tensor, + paddle::experimental::Tensor* view_output_tensor); // TensorWrapper Utils static paddle::experimental::Tensor RecoverTensorWrapper(TensorWrapper* tw); @@ -238,6 +241,13 @@ class EagerUtils { kSlotSmallVectorSize>* out_grads, const paddle::small_vector, kSlotSmallVectorSize>& grad_out_metas); + static void FillZeroForEmptyGradInput(paddle::experimental::Tensor* in_grad, + const GradSlotMeta& grad_in_meta); + static void FillZeroForEmptyOptionalGradInput( + paddle::experimental::Tensor* in_grad, const GradSlotMeta& grad_in_meta); + static void FillZeroForEmptyGradInput( + std::vector* in_grads, + const std::vector& grad_in_metas); }; } // namespace egr diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index b4ae9949f2c6e..0c762ab2e77e5 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -120,6 +120,24 @@ void DatasetImpl::SetDataFeedDesc(const std::string& data_feed_desc_str) { &data_feed_desc_); } +template +std::vector DatasetImpl::GetSlots() { + auto multi_slot_desc = data_feed_desc_.multi_slot_desc(); + use_slots_.clear(); + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + if (slot.type() == "uint64" || slot.type() == "uint32") { + use_slots_.push_back(slot.name()); + } + } + std::cout << "dataset use slots: "; + for (auto s : use_slots_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; + return use_slots_; +} + template void DatasetImpl::SetChannelNum(int channel_num) { channel_num_ = channel_num; @@ -302,12 +320,11 @@ static int compute_thread_batch_nccl( thread_avg_batch_num = static_cast(offset.size() / thr_num); #ifdef PADDLE_WITH_GLOO auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); - if (!gloo_wrapper->IsInitialized()) { - VLOG(0) << "GLOO is not inited"; - gloo_wrapper->Init(); - } - if (gloo_wrapper->Size() > 1) { + if (!gloo_wrapper->IsInitialized()) { + VLOG(0) << "GLOO is not inited"; + gloo_wrapper->Init(); + } // adjust batch num per thread for NCCL std::vector thread_avg_batch_num_vec(1, thread_avg_batch_num); std::vector total_instance_num_vec(1, total_instance_num); diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 1947c669e9bb0..3d096eaebe344 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -159,6 +159,8 @@ class Dataset { // set fleet send sleep seconds virtual void SetFleetSendSleepSeconds(int seconds) = 
0; + virtual std::vector GetSlots() = 0; + protected: virtual int ReceiveFromClient(int msg_type, int client_id, const std::string& msg) = 0; @@ -246,6 +248,7 @@ class DatasetImpl : public Dataset { bool discard_remaining_ins = false); virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); + virtual std::vector GetSlots(); /* for enable_heterps_ virtual void EnableHeterps(bool enable_heterps) { enable_heterps_ = enable_heterps; @@ -321,6 +324,7 @@ class DatasetImpl : public Dataset { int64_t global_index_ = 0; std::vector> consume_task_pool_; std::vector input_records_; // only for paddleboxdatafeed + std::vector use_slots_; bool enable_heterps_ = false; }; diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 14b5662b24aeb..c4ea6a3c6bc66 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -18,6 +18,10 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/transform.h" +#if defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/device_wrapper.h" +#endif + namespace paddle { namespace framework { @@ -28,6 +32,49 @@ struct CastDataTypeFunctor { } }; +#if defined(PADDLE_WITH_XPU) + +template +static void XPUCastData(const framework::Tensor& in, framework::Tensor* out, + const platform::XPUDeviceContext* dev_ctx) { + using XPUInTDType = typename XPUTypeTrait::Type; + using XPUOutTDType = typename XPUTypeTrait::Type; + int r = xpu::cast_v2( + dev_ctx->x_context(), + reinterpret_cast(in.data()), + reinterpret_cast(out->mutable_data(in.place())), + in.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + dev_ctx->Wait(); +} + +template +static void XPUTransDataType( + const framework::Tensor& in, framework::Tensor* out, + const paddle::framework::proto::VarType::Type& dst_type, + const platform::DeviceContext* ctx) { + auto* context = static_cast(ctx); + +#define XPUCastCallback(cpp_type, proto_type) \ + do { \ + if (dst_type == proto_type) { \ + XPUCastData(in, out, context); \ + } \ + } while (0) + + if (dst_type == proto::VarType::FP32 || dst_type == proto::VarType::FP16 || + dst_type == proto::VarType::BOOL || dst_type == proto::VarType::INT16 || + dst_type == proto::VarType::INT32 || dst_type == proto::VarType::INT64) { + _ForEachDataType_(XPUCastCallback); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(dst_type))); + } +} + +#endif + template struct CastDataType { CastDataType(const framework::Tensor& in, framework::Tensor* out, @@ -88,6 +135,34 @@ void TransDataType(const Tensor& in, auto dst_type = type; auto ctx = pool.Get(in.place()); +#if defined(PADDLE_WITH_XPU) + switch (src_type) { + case proto::VarType::FP16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::BOOL: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT64: + XPUTransDataType(in, out, dst_type, ctx); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(src_type))); + } + +#else
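// Note: on XPU builds the branch above performs the cast and the switch below is
// compiled out. Each XPUTransDataType call expands XPUCastCallback over every
// registered (cpp_type, proto_type) pair via _ForEachDataType_, so the matching
// dst_type is dispatched to XPUCastData, which runs xpu::cast_v2 on the XPU
// context and then waits for the device to finish.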
+ switch (src_type) { case proto::VarType::FP16: framework::VisitDataType(dst_type, @@ -123,6 +198,7 @@ void TransDataType(const Tensor& in, "Data type (%s) is not supported when casting data type.", DataTypeToString(src_type))); } +#endif } void TransComplexToReal(const proto::VarType::Type& dst_type, @@ -131,7 +207,6 @@ void TransComplexToReal(const proto::VarType::Type& dst_type, auto& pool = platform::DeviceContextPool::Instance(); auto* ctx = pool.Get(in.place()); out->Resize(in.dims()); - // complex -> real switch (src_type) { case proto::VarType::COMPLEX64: diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index da72215653e75..e6790de92d054 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/nan_inf_utils_detail.h" #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/scope.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index f6a97160d8271..7cf11f7829da9 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -19,7 +19,9 @@ #include #include #include + #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 08bac5d63323b..5668ab31f36b6 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -16,7 +16,7 @@ #include -#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/place.h" namespace phi { diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index d198eb1459288..7e63c5ffb9a44 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -75,7 +75,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { in_var_handles.size(), places_.size(), platform::errors::PreconditionNotMet( "The number of input variables should be equal to the number of " - "places, but got the number of input variables is %zu and the the " + "places, but got the number of input variables is %zu and the " "number of places is %zu.", in_var_handles.size(), places_.size())); PADDLE_ENFORCE_EQ( @@ -83,7 +83,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { platform::errors::PreconditionNotMet( "The number of input variables should be equal to the number of " "output variables, but got the number of input variables is %zu and " - "the the number of output variables is %zu.", + "the number of output variables is %zu.", in_var_handles.size(), out_var_handles.size())); std::vector ins; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 4fddfca5d805a..37ec4666a30d6 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -69,7 +69,7 @@ void 
FleetWrapper::InitWorker(const std::string& dist_desc, int node_num, int index) { #ifdef PADDLE_WITH_PSLIB if (!is_initialized_) { - VLOG(3) << "Going to init worker"; + VLOG(0) << "Going to init worker"; pslib_ptr_ = std::shared_ptr( new paddle::distributed::PSlib()); pslib_ptr_->init_worker(dist_desc, @@ -126,7 +126,7 @@ void FleetWrapper::GatherServers(const std::vector& host_sign_list, void FleetWrapper::GatherClients(const std::vector& host_sign_list) { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "Going to gather client ips"; + VLOG(0) << "Going to gather client ips"; size_t len = host_sign_list.size(); pslib_ptr_->gather_clients(const_cast(host_sign_list.data()), len); #endif @@ -142,7 +142,7 @@ std::vector FleetWrapper::GetClientsInfo() { void FleetWrapper::CreateClient2ClientConnection() { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "Going to create client2client connection"; + VLOG(0) << "Going to create client2client connection"; pslib_ptr_->create_client2client_connection(client2client_request_timeout_ms_, client2client_connect_timeout_ms_, client2client_max_retry_); @@ -1054,7 +1054,8 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync( int slot_offset = 0; int grad_dim = 0; // don't worry, user do not have to care about all these flags - if (accesor == "DownpourCtrAccessor") { + if (accesor == "DownpourCtrAccessor" || + accesor == "DownpourCtrDymfAccessor") { dump_slot = true; slot_offset = 1; grad_dim = fea_dim - 2; diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 3fdcf2379cb54..823b60c5ef1f2 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -95,24 +95,6 @@ class HeterContext { } void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; } uint32_t ShardNum() { return shard_num_; } - void init(int shard_num, int device_num) { - shard_num_ = shard_num; - feature_keys_.resize(shard_num_); - value_ptr_.resize(shard_num_); - device_task_ptr_.resize(shard_num_); - device_task_keys_.resize(shard_num_); - for (size_t i = 0; i < device_task_ptr_.size(); i++) { - device_task_ptr_[i].resize(device_num); - device_task_keys_[i].resize(device_num); - } - - device_values_.resize(device_num); - device_keys_.resize(device_num); - mutex_.resize(device_num); - for (size_t i = 0; i < mutex_.size(); ++i) { - mutex_[i] = new std::mutex(); - } - } void init(int shard_num, int device_num, int dim_num) { shard_num_ = shard_num; @@ -129,11 +111,6 @@ class HeterContext { for (size_t i = 0; i < feature_dim_keys_.size(); i++) { feature_dim_keys_[i].resize(dim_num); value_dim_ptr_[i].resize(dim_num); - if (i == 0) { - for (int j = 0; j < dim_num; j++) { - feature_dim_keys_[i][j].push_back(0); - } - } } device_values_.resize(device_num); device_dim_values_.resize(device_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 51456457d0606..d62fc1c084962 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -13,11 +13,10 @@ IF(WITH_GPU) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table hashtable_kernel) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table_inl.cu DEPS heter_comm table hashtable_kernel) nv_library(graph_sampler SRCS graph_sampler_inl.h DEPS graph_gpu_ps) - - 
nv_test(test_cpu_query SRCS test_cpu_query.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) - nv_library(graph_gpu_wrapper SRCS graph_gpu_wrapper.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) + nv_library(graph_gpu_wrapper SRCS graph_gpu_wrapper.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS} graph_gpu_ps) + nv_test(test_cpu_query SRCS test_cpu_query.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS} graph_gpu_ps graph_gpu_wrapper) #ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) #target_link_libraries(test_sample_rate heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index b633394e7a811..cb7f3a40d6720 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -32,17 +32,33 @@ struct FeatureValue { float lr; float lr_g2sum; int mf_size; - float mf[MF_DIM + 1]; + int mf_dim; uint64_t cpu_ptr; + float mf[0]; friend std::ostream& operator<<(std::ostream& out, FeatureValue& val) { out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot - << " lr: " << val.lr << " mf_size: " << val.mf_size << " mf:"; - for (int i = 0; i < val.mf_size; ++i) { + << " lr: " << val.lr << " mf_dim: " << val.mf_dim + << "cpuptr: " << val.cpu_ptr << " mf_size: " << val.mf_size << " mf:"; + for (int i = 0; i < val.mf_dim + 1; ++i) { out << " " << val.mf[i]; } return out; } + __device__ __forceinline__ void operator=(const FeatureValue& in) { + delta_score = in.delta_score; + show = in.show; + clk = in.clk; + slot = in.slot; + lr = in.lr; + lr_g2sum = in.lr_g2sum; + mf_size = in.mf_size; + mf_dim = in.mf_dim; + cpu_ptr = in.cpu_ptr; + for (int i = 0; i < mf_dim + 1; i++) { + mf[i] = in.mf[i]; + } + } }; struct FeaturePushValue { @@ -50,20 +66,33 @@ struct FeaturePushValue { float clk; int slot; float lr_g; - float mf_g[MF_DIM]; + int mf_dim; + float mf_g[0]; - // __device__ __forceinline__ FeaturePushValue - // operator+(const FeaturePushValue& a) const { - // FeaturePushValue out; - // out.slot = a.slot; - // out.show = a.show + show; - // out.clk = a.clk + clk; - // out.lr_g = a.lr_g + lr_g; - // for (int i = 0; i < MF_DIM; ++i) { - // out.mf_g[i] = a.mf_g[i] + mf_g[i]; - // } - // return out; - // } + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = a.lr_g + lr_g; + // out.mf_g = a.mf_g; + for (int i = 0; i < out.mf_dim; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; + } + __device__ __forceinline__ void operator=(const FeaturePushValue& in) { + show = in.show; + clk = in.clk; + slot = in.slot; + lr_g = in.lr_g; + mf_dim = in.mf_dim; + for (int i = 0; i < mf_dim; i++) { + mf_g[i] = in.mf_g[i]; + } + } }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index e7601edb0ca07..19c355c671a38 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ 
b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -24,7 +24,7 @@ namespace paddle { namespace framework { struct GpuPsGraphNode { int64_t node_id; - unsigned int neighbor_size, neighbor_offset; + int64_t neighbor_size, neighbor_offset; // this node's neighbor is stored on [neighbor_offset,neighbor_offset + // neighbor_size) of int64_t *neighbor_list; }; @@ -32,17 +32,17 @@ struct GpuPsGraphNode { struct GpuPsCommGraph { int64_t *neighbor_list; GpuPsGraphNode *node_list; - unsigned int neighbor_size, node_size; + int64_t neighbor_size, node_size; // the size of neighbor array and graph_node_list array GpuPsCommGraph() : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - unsigned int neighbor_size_, unsigned int node_size_) + int64_t neighbor_size_, int64_t node_size_) : neighbor_list(neighbor_list_), node_list(node_list_), neighbor_size(neighbor_size_), node_size(node_size_) {} - void init_on_cpu(unsigned int neighbor_size, unsigned int node_size) { + void init_on_cpu(int64_t neighbor_size, int64_t node_size) { this->neighbor_size = neighbor_size; this->node_size = node_size; this->neighbor_list = new int64_t[neighbor_size]; @@ -208,12 +208,43 @@ struct NeighborSampleResult { delete[] ac_size; VLOG(0) << " ------------------"; } - NeighborSampleResult(){}; - ~NeighborSampleResult() { - // if (val != NULL) cudaFree(val); - // if (actual_sample_size != NULL) cudaFree(actual_sample_size); - // if (offset != NULL) cudaFree(offset); + std::vector get_sampled_graph(NeighborSampleQuery q) { + std::vector graph; + int64_t *sample_keys = new int64_t[q.len]; + std::string key_str; + cudaMemcpy(sample_keys, q.key, q.len * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int64_t *res = new int64_t[sample_size * key_size]; + cudaMemcpy(res, val, sample_size * key_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int *ac_size = new int[key_size]; + cudaMemcpy(ac_size, actual_sample_size, key_size * sizeof(int), + cudaMemcpyDeviceToHost); // 3, 1, 3 + int total_sample_size = 0; + for (int i = 0; i < key_size; i++) { + total_sample_size += ac_size[i]; + } + int64_t *res2 = new int64_t[total_sample_size]; // r + cudaMemcpy(res2, actual_val, total_sample_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); // r + + int start = 0; + for (int i = 0; i < key_size; i++) { + graph.push_back(sample_keys[i]); + graph.push_back(ac_size[i]); + for (int j = 0; j < ac_size[i]; j++) { + graph.push_back(res2[start + j]); + } + start += ac_size[i]; // r + } + delete[] res; + delete[] res2; // r + delete[] ac_size; + delete[] sample_keys; + return graph; } + NeighborSampleResult(){}; + ~NeighborSampleResult() {} }; struct NodeQueryResult { diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 8a0088114e2ec..ae57c2ebe932f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -23,15 +23,17 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource, int topo_aware) - : HeterComm(1, resource) { + : HeterComm(1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); gpu_num = resource_->total_device(); + memset(global_device_map, -1, sizeof(global_device_map)); for (int i = 0; i < gpu_num; i++) { 
gpu_graph_list.push_back(GpuPsCommGraph()); + global_device_map[resource_->dev_id(i)] = i; sample_status.push_back(NULL); tables_.push_back(NULL); } @@ -98,27 +100,20 @@ class GpuPsGraphTable : public HeterComm { NeighborSampleResult graph_neighbor_sample_v2(int gpu_id, int64_t *key, int sample_size, int len, bool cpu_query_switch); + void init_sample_status(); + void free_sample_status(); NodeQueryResult query_node_list(int gpu_id, int start, int query_size); void clear_graph_info(); + void display_sample_res(void *key, void *val, int len, int sample_len); void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, int sample_size, int *h_left, int *h_right, int64_t *src_sample_res, int *actual_sample_size); - // void move_neighbor_sample_result_to_source_gpu( - // int gpu_id, int gpu_num, int *h_left, int *h_right, - // int64_t *src_sample_res, thrust::host_vector &total_sample_size); - // void move_neighbor_sample_size_to_source_gpu(int gpu_id, int gpu_num, - // int *h_left, int *h_right, - // int *actual_sample_size, - // int *total_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); - // int load(const std::string &path, const std::string ¶m); - // virtual int32_t end_graph_sampling() { - // return cpu_graph_table->end_graph_sampling(); - // } int gpu_num; std::vector gpu_graph_list; + int global_device_map[32]; std::vector sample_status; const int parallel_sample_size = 1; const int dim_y = 256; @@ -130,5 +125,5 @@ class GpuPsGraphTable : public HeterComm { }; } }; -#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" +//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu similarity index 76% rename from paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h rename to paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 605019cb607fc..72b9cae41c0fd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -18,7 +18,7 @@ #include #pragma once #ifdef PADDLE_WITH_HETERPS -//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { /* @@ -32,23 +32,48 @@ sample_result is to save the neighbor sampling result, its size is len * sample_size; */ -__global__ void get_cpu_id_index(int64_t* key, unsigned int* val, +__global__ void get_cpu_id_index(int64_t* key, int* actual_sample_size, int64_t* cpu_key, int* sum, int* index, int len) { CUDA_KERNEL_LOOP(i, len) { - if (val[i] == ((unsigned int)-1)) { + if (actual_sample_size[i] == -1) { int old = atomicAdd(sum, 1); cpu_key[old] = key[i]; index[old] = i; + // printf("old %d i-%d key:%lld\n",old,i,key[i]); } } } +__global__ void get_actual_gpu_ac(int* gpu_ac, int number_on_cpu) { + CUDA_KERNEL_LOOP(i, number_on_cpu) { gpu_ac[i] /= sizeof(int64_t); } +} + +template +__global__ void copy_buffer_ac_to_final_place( + int64_t* gpu_buffer, int* gpu_ac, int64_t* val, int* actual_sample_size, + int* index, int* cumsum_gpu_ac, int number_on_cpu, int sample_size) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int i = blockIdx.x * TILE_SIZE + threadIdx.y; + const int last_idx = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, number_on_cpu); + while (i < 
last_idx) { + actual_sample_size[index[i]] = gpu_ac[i]; + for (int j = threadIdx.x; j < gpu_ac[i]; j += WARP_SIZE) { + val[index[i] * sample_size + j] = gpu_buffer[cumsum_gpu_ac[i] + j]; + } + i += BLOCK_WARPS; + } +} + template __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, - unsigned int* node_index, + int64_t* node_index, int* actual_size, int64_t* res, - int sample_len, int n) { + int sample_len, int n, + int default_value) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -58,13 +83,13 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, curand_init(blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); while (i < last_idx) { - if (node_index[i] == (unsigned int)(-1)) { - actual_size[i] = 0; + if (node_index[i] == -1) { + actual_size[i] = default_value; i += BLOCK_WARPS; continue; } - int neighbor_len = graph.node_list[node_index[i]].neighbor_size; - int data_offset = graph.node_list[node_index[i]].neighbor_offset; + int neighbor_len = (int)graph.node_list[node_index[i]].neighbor_size; + int64_t data_offset = graph.node_list[node_index[i]].neighbor_offset; int offset = i * sample_len; int64_t* data = graph.neighbor_list; if (neighbor_len <= sample_len) { @@ -86,7 +111,7 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, } __syncwarp(); for (int j = threadIdx.x; j < sample_len; j += WARP_SIZE) { - const int perm_idx = res[offset + j] + data_offset; + const int64_t perm_idx = res[offset + j] + data_offset; res[offset + j] = data[perm_idx]; } actual_size[i] = sample_len; @@ -96,23 +121,22 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, } __global__ void neighbor_sample_example(GpuPsCommGraph graph, - unsigned int* node_index, - int* actual_size, int64_t* res, - int sample_len, int* sample_status, - int n, int from) { + int64_t* node_index, int* actual_size, + int64_t* res, int sample_len, + int* sample_status, int n, int from) { int id = blockIdx.x * blockDim.y + threadIdx.y; if (id < n) { - if (node_index[id] == (unsigned int)(-1)) { + if (node_index[id] == -1) { actual_size[id] = 0; return; } curandState rng; curand_init(blockIdx.x, threadIdx.x, threadIdx.y, &rng); - int index = threadIdx.x; - int offset = id * sample_len; + int64_t index = threadIdx.x; + int64_t offset = id * sample_len; int64_t* data = graph.neighbor_list; - int data_offset = graph.node_list[node_index[id]].neighbor_offset; - int neighbor_len = graph.node_list[node_index[id]].neighbor_size; + int64_t data_offset = graph.node_list[node_index[id]].neighbor_offset; + int64_t neighbor_len = graph.node_list[node_index[id]].neighbor_size; int ac_len; if (sample_len > neighbor_len) ac_len = neighbor_len; @@ -220,6 +244,29 @@ int GpuPsGraphTable::init_cpu_table( that's what fill_dvals does. 
*/ +void GpuPsGraphTable::display_sample_res(void* key, void* val, int len, + int sample_len) { + char key_buffer[len * sizeof(int64_t)]; + char val_buffer[sample_len * sizeof(int64_t) * len + + (len + len % 2) * sizeof(int) + len * sizeof(int64_t)]; + cudaMemcpy(key_buffer, key, sizeof(int64_t) * len, cudaMemcpyDeviceToHost); + cudaMemcpy(val_buffer, val, + sample_len * sizeof(int64_t) * len + + (len + len % 2) * sizeof(int) + len * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int64_t* sample_val = (int64_t*)(val_buffer + (len + len % 2) * sizeof(int) + + len * sizeof(int64_t)); + for (int i = 0; i < len; i++) { + printf("key %lld\n", *(int64_t*)(key_buffer + i * sizeof(int64_t))); + printf("index %lld\n", *(int64_t*)(val_buffer + i * sizeof(int64_t))); + int ac_size = *(int*)(val_buffer + i * sizeof(int) + len * sizeof(int64_t)); + printf("sampled %d neigbhors\n", ac_size); + for (int j = 0; j < ac_size; j++) { + printf("%lld ", sample_val[i * sample_len + j]); + } + printf("\n"); + } +} void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( int start_index, int gpu_num, int sample_size, int* h_left, int* h_right, int64_t* src_sample_res, int* actual_sample_size) { @@ -229,7 +276,7 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( continue; } shard_len[i] = h_right[i] - h_left[i] + 1; - int cur_step = path_[start_index][i].nodes_.size() - 1; + int cur_step = (int)path_[start_index][i].nodes_.size() - 1; for (int j = cur_step; j > 0; j--) { cudaMemcpyAsync(path_[start_index][i].nodes_[j - 1].val_storage, path_[start_index][i].nodes_[j].val_storage, @@ -240,12 +287,12 @@ void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( auto& node = path_[start_index][i].nodes_.front(); cudaMemcpyAsync( reinterpret_cast(src_sample_res + h_left[i] * sample_size), - node.val_storage + sizeof(int64_t) * shard_len[i], - node.val_bytes_len - sizeof(int64_t) * shard_len[i], cudaMemcpyDefault, + node.val_storage + sizeof(int64_t) * shard_len[i] + + sizeof(int) * (shard_len[i] + shard_len[i] % 2), + sizeof(int64_t) * shard_len[i] * sample_size, cudaMemcpyDefault, node.out_stream); - // resource_->remote_stream(i, start_index)); cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), - node.val_storage + sizeof(int) * shard_len[i], + node.val_storage + sizeof(int64_t) * shard_len[i], sizeof(int) * shard_len[i], cudaMemcpyDefault, node.out_stream); } @@ -440,19 +487,19 @@ void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { // platform::CUDADeviceGuard guard(i); gpu_graph_list[i] = GpuPsCommGraph(); sample_status[i] = NULL; - tables_[i] = new Table(std::max((unsigned int)1, g.node_size) / load_factor_); + tables_[i] = new Table(std::max((int64_t)1, g.node_size) / load_factor_); if (g.node_size > 0) { std::vector keys; - std::vector offset; + std::vector offset; cudaMalloc((void**)&gpu_graph_list[i].node_list, g.node_size * sizeof(GpuPsGraphNode)); cudaMemcpy(gpu_graph_list[i].node_list, g.node_list, g.node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); - for (unsigned int j = 0; j < g.node_size; j++) { + for (int64_t j = 0; j < g.node_size; j++) { keys.push_back(g.node_list[j].node_id); offset.push_back(j); } - build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8); + build_ps(i, (uint64_t*)keys.data(), offset.data(), keys.size(), 1024, 8); gpu_graph_list[i].node_size = g.node_size; } else { build_ps(i, NULL, NULL, 0, 1024, 8); @@ -460,12 +507,15 @@ void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { 
gpu_graph_list[i].node_size = 0; } if (g.neighbor_size) { - int* addr; - cudaMalloc((void**)&addr, g.neighbor_size * sizeof(int)); - cudaMemset(addr, 0, g.neighbor_size * sizeof(int)); - sample_status[i] = addr; - cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, - g.neighbor_size * sizeof(int64_t)); + cudaError_t cudaStatus = + cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, + g.neighbor_size * sizeof(int64_t)); + PADDLE_ENFORCE_EQ(cudaStatus, cudaSuccess, + platform::errors::InvalidArgument( + "ailed to allocate memory for graph on gpu ")); + VLOG(0) << "sucessfully allocate " << g.neighbor_size * sizeof(int64_t) + << " bytes of memory for graph-edges on gpu " + << resource_->dev_id(i); cudaMemcpy(gpu_graph_list[i].neighbor_list, g.neighbor_list, g.neighbor_size * sizeof(int64_t), cudaMemcpyHostToDevice); gpu_graph_list[i].neighbor_size = g.neighbor_size; @@ -474,6 +524,27 @@ void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { gpu_graph_list[i].neighbor_size = 0; } } + +void GpuPsGraphTable::init_sample_status() { + for (int i = 0; i < gpu_num; i++) { + if (gpu_graph_list[i].neighbor_size) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + int* addr; + cudaMalloc((void**)&addr, gpu_graph_list[i].neighbor_size * sizeof(int)); + cudaMemset(addr, 0, gpu_graph_list[i].neighbor_size * sizeof(int)); + sample_status[i] = addr; + } + } +} + +void GpuPsGraphTable::free_sample_status() { + for (int i = 0; i < gpu_num; i++) { + if (sample_status[i] != NULL) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + cudaFree(sample_status[i]); + } + } +} void GpuPsGraphTable::build_graph_from_cpu( std::vector& cpu_graph_list) { VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " @@ -485,26 +556,24 @@ void GpuPsGraphTable::build_graph_from_cpu( clear_graph_info(); for (int i = 0; i < cpu_graph_list.size(); i++) { platform::CUDADeviceGuard guard(resource_->dev_id(i)); - // platform::CUDADeviceGuard guard(i); gpu_graph_list[i] = GpuPsCommGraph(); sample_status[i] = NULL; - // auto table = - // new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); - tables_[i] = new Table( - std::max((unsigned int)1, cpu_graph_list[i].node_size) / load_factor_); + tables_[i] = new Table(std::max((int64_t)1, cpu_graph_list[i].node_size) / + load_factor_); if (cpu_graph_list[i].node_size > 0) { std::vector keys; - std::vector offset; + std::vector offset; cudaMalloc((void**)&gpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); - for (unsigned int j = 0; j < cpu_graph_list[i].node_size; j++) { + for (int64_t j = 0; j < cpu_graph_list[i].node_size; j++) { keys.push_back(cpu_graph_list[i].node_list[j].node_id); offset.push_back(j); } - build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8); + build_ps(i, (uint64_t*)(keys.data()), offset.data(), keys.size(), 1024, + 8); gpu_graph_list[i].node_size = cpu_graph_list[i].node_size; } else { build_ps(i, NULL, NULL, 0, 1024, 8); @@ -512,12 +581,9 @@ void GpuPsGraphTable::build_graph_from_cpu( gpu_graph_list[i].node_size = 0; } if (cpu_graph_list[i].neighbor_size) { - int* addr; - cudaMalloc((void**)&addr, cpu_graph_list[i].neighbor_size * sizeof(int)); - cudaMemset(addr, 0, cpu_graph_list[i].neighbor_size * sizeof(int)); - sample_status[i] = addr; cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, cpu_graph_list[i].neighbor_size * 
sizeof(int64_t)); + cudaMemcpy(gpu_graph_list[i].neighbor_list, cpu_graph_list[i].neighbor_list, cpu_graph_list[i].neighbor_size * sizeof(int64_t), @@ -533,8 +599,8 @@ void GpuPsGraphTable::build_graph_from_cpu( NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch) { - return graph_neighbor_sample_v2(q.gpu_id, q.key, q.sample_size, q.len, - cpu_switch); + return graph_neighbor_sample_v2(global_device_map[q.gpu_id], q.key, + q.sample_size, q.len, cpu_switch); } NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, int64_t* key, @@ -571,12 +637,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, } platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - // cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); - // cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); int* actual_sample_size = result.actual_sample_size; int64_t* val = result.val; int total_gpu = resource_->total_device(); - // int dev_id = resource_->dev_id(gpu_id); auto stream = resource_->local_stream(gpu_id, 0); int grid_size = (len - 1) / block_size_ + 1; @@ -603,11 +666,9 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, int* d_shard_actual_sample_size_ptr = reinterpret_cast(d_shard_actual_sample_size->ptr()); - split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); + split_input_to_shard((uint64_t*)(key), d_idx_ptr, len, d_left_ptr, + d_right_ptr, gpu_id); - // fill_shard_key<<>>(d_shard_keys_ptr, - // key, - // d_idx_ptr, len); heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, key, d_idx_ptr, len, stream); cudaStreamSynchronize(stream); @@ -643,95 +704,48 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, of alloc_mem_i, actual_sample_size_of_x equals ((int *)alloc_mem_i)[shard_len + x] */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), - shard_len * (1 + sample_size) * sizeof(int64_t)); - auto& node = path_[gpu_id][i].nodes_[0]; - cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int), - node.in_stream); + shard_len * (1 + sample_size) * sizeof(int64_t) + + sizeof(int) * (shard_len + shard_len % 2)); + // auto& node = path_[gpu_id][i].nodes_[0]; } - // auto end1 = std::chrono::steady_clock::now(); - // auto tt = std::chrono::duration_cast(end1 - - // start1); - // VLOG(0)<< "create storage time " << tt.count() << " us"; - walk_to_dest(gpu_id, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + walk_to_dest(gpu_id, total_gpu, h_left, h_right, + (uint64_t*)(d_shard_keys_ptr), NULL); for (int i = 0; i < total_gpu; ++i) { if (h_left[i] == -1) { continue; } + int shard_len = h_left[i] == -1 ? 
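The create_storage call above sizes each remote shard's value buffer with the same packed layout: one int64 index per key, a padded int actual-size array, and sample_size int64 neighbor slots per key. A small helper (hypothetical name, illustration only) spells out the arithmetic:

inline size_t neighbor_sample_val_bytes(int shard_len, int sample_size) {
  return shard_len * (1 + sample_size) * sizeof(int64_t) +  // index + sampled ids
         sizeof(int) * (shard_len + shard_len % 2);         // actual sizes, padded to even
}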
0 : h_right[i] - h_left[i] + 1; auto& node = path_[gpu_id][i].nodes_.back(); + cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int64_t), + node.in_stream); cudaStreamSynchronize(node.in_stream); platform::CUDADeviceGuard guard(resource_->dev_id(i)); - // platform::CUDADeviceGuard guard(i); - // use the key-value map to update alloc_mem_i[0,shard_len) - // tables_[i]->rwlock_->RDLock(); - tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), + tables_[i]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); // node.in_stream); - int shard_len = h_right[i] - h_left[i] + 1; auto graph = gpu_graph_list[i]; - unsigned int* id_array = reinterpret_cast(node.val_storage); + int64_t* id_array = reinterpret_cast(node.val_storage); int* actual_size_array = (int*)(id_array + shard_len); - int64_t* sample_array = (int64_t*)(actual_size_array + shard_len); - int sample_grid_size = (shard_len - 1) / dim_y + 1; - dim3 block(parallel_sample_size, dim_y); - dim3 grid(sample_grid_size); - // int sample_grid_size = shard_len / block_size_ + 1; - // VLOG(0)<<"in sample grid_size = "< user_feature_name = {"a", "b", "c", "d"}; -std::vector item_feature_name = {"a"}; -std::vector user_feature_dtype = {"float32", "int32", "string", - "string"}; -std::vector item_feature_dtype = {"float32"}; -std::vector user_feature_shape = {1, 2, 1, 1}; -std::vector item_feature_shape = {1}; -void prepare_file(char file_name[]) { - std::ofstream ofile; - ofile.open(file_name); - - for (auto x : nodes) { - ofile << x << std::endl; - } - ofile.close(); -} +std::shared_ptr GraphGpuWrapper::s_instance_(nullptr); void GraphGpuWrapper::set_device(std::vector ids) { for (auto device_id : ids) { device_id_mapping.push_back(device_id); @@ -205,96 +172,35 @@ void GraphGpuWrapper::upload_batch(int idx, // g->build_graph_from_cpu(vec); } -void GraphGpuWrapper::initialize() { - std::vector device_id_mapping; - for (int i = 0; i < 2; i++) device_id_mapping.push_back(i); - int gpu_num = device_id_mapping.size(); - ::paddle::distributed::GraphParameter table_proto; - table_proto.add_edge_types("u2u"); - table_proto.add_node_types("user"); - table_proto.add_node_types("item"); - ::paddle::distributed::GraphFeature *g_f = table_proto.add_graph_feature(); - - for (int i = 0; i < user_feature_name.size(); i++) { - g_f->add_name(user_feature_name[i]); - g_f->add_dtype(user_feature_dtype[i]); - g_f->add_shape(user_feature_shape[i]); - } - ::paddle::distributed::GraphFeature *g_f1 = table_proto.add_graph_feature(); - for (int i = 0; i < item_feature_name.size(); i++) { - g_f1->add_name(item_feature_name[i]); - g_f1->add_dtype(item_feature_dtype[i]); - g_f1->add_shape(item_feature_shape[i]); - } - prepare_file(node_file_name); - table_proto.set_shard_num(24); - - std::shared_ptr resource = - std::make_shared(device_id_mapping); - resource->enable_p2p(); - GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1); - g->init_cpu_table(table_proto); - graph_table = (char *)g; - g->cpu_graph_table->Load(node_file_name, "nuser"); - g->cpu_graph_table->Load(node_file_name, "nitem"); - std::remove(node_file_name); - std::vector vec; - std::vector node_ids; - node_ids.push_back(37); - node_ids.push_back(96); - std::vector> node_feat(2, - std::vector(2)); - std::vector feature_names; - feature_names.push_back(std::string("c")); - feature_names.push_back(std::string("d")); - g->cpu_graph_table->get_node_feat(0, node_ids, 
feature_names, node_feat); - VLOG(0) << "get_node_feat: " << node_feat[0][0]; - VLOG(0) << "get_node_feat: " << node_feat[0][1]; - VLOG(0) << "get_node_feat: " << node_feat[1][0]; - VLOG(0) << "get_node_feat: " << node_feat[1][1]; - int n = 10; - std::vector ids0, ids1; - for (int i = 0; i < n; i++) { - g->cpu_graph_table->add_comm_edge(0, i, (i + 1) % n); - g->cpu_graph_table->add_comm_edge(0, i, (i - 1 + n) % n); - if (i % 2 == 0) ids0.push_back(i); - } - g->cpu_graph_table->build_sampler(0); - ids1.push_back(5); - vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(0, ids0)); - vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(0, ids1)); - vec[0].display_on_cpu(); - vec[1].display_on_cpu(); - g->build_graph_from_cpu(vec); -} -void GraphGpuWrapper::test() { - int64_t cpu_key[3] = {0, 1, 2}; - void *key; - platform::CUDADeviceGuard guard(0); - cudaMalloc((void **)&key, 3 * sizeof(int64_t)); - cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); - auto neighbor_sample_res = - ((GpuPsGraphTable *)graph_table) - ->graph_neighbor_sample(0, (int64_t *)key, 2, 3); - int64_t *res = new int64_t[7]; - cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t), - cudaMemcpyDeviceToHost); - int *actual_sample_size = new int[3]; - cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, - 3 * sizeof(int), - cudaMemcpyDeviceToHost); // 3, 1, 3 +// void GraphGpuWrapper::test() { +// int64_t cpu_key[3] = {0, 1, 2}; +// void *key; +// platform::CUDADeviceGuard guard(0); +// cudaMalloc((void **)&key, 3 * sizeof(int64_t)); +// cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); +// auto neighbor_sample_res = +// ((GpuPsGraphTable *)graph_table) +// ->graph_neighbor_sample(0, (int64_t *)key, 2, 3); +// int64_t *res = new int64_t[7]; +// cudaMemcpy(res, neighbor_sample_res.val, 3 * 2 * sizeof(int64_t), +// cudaMemcpyDeviceToHost); +// int *actual_sample_size = new int[3]; +// cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, +// 3 * sizeof(int), +// cudaMemcpyDeviceToHost); // 3, 1, 3 - //{0,9} or {9,0} is expected for key 0 - //{0,2} or {2,0} is expected for key 1 - //{1,3} or {3,1} is expected for key 2 - for (int i = 0; i < 3; i++) { - VLOG(0) << "actual sample size for " << i << " is " - << actual_sample_size[i]; - for (int j = 0; j < actual_sample_size[i]; j++) { - VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + j]; - } - } -} +// //{0,9} or {9,0} is expected for key 0 +// //{0,2} or {2,0} is expected for key 1 +// //{1,3} or {3,1} is expected for key 2 +// for (int i = 0; i < 3; i++) { +// VLOG(0) << "actual sample size for " << i << " is " +// << actual_sample_size[i]; +// for (int j = 0; j < actual_sample_size[i]; j++) { +// VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + +// j]; +// } +// } +// } NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch) { return ((GpuPsGraphTable *)graph_table) @@ -314,7 +220,6 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( auto neighbor_sample_res = ((GpuPsGraphTable *)graph_table) ->graph_neighbor_sample(gpu_id, cuda_key, sample_size, key.size()); - int *actual_sample_size = new int[key.size()]; cudaMemcpy(actual_sample_size, neighbor_sample_res.actual_sample_size, key.size() * sizeof(int), @@ -323,7 +228,6 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( for (int i = 0; i < key.size(); i++) { cumsum += actual_sample_size[i]; } - /* VLOG(0) << "cumsum " << cumsum; */ std::vector 
cpu_key, res; cpu_key.resize(key.size() * sample_size); @@ -340,16 +244,32 @@ std::vector GraphGpuWrapper::graph_neighbor_sample( /* for(int i = 0;i < res.size();i ++) { */ /* VLOG(0) << i << " " << res[i]; */ /* } */ - + delete[] actual_sample_size; cudaFree(cuda_key); return res; } +void GraphGpuWrapper::init_sample_status() { + ((GpuPsGraphTable *)graph_table)->init_sample_status(); +} + +void GraphGpuWrapper::free_sample_status() { + ((GpuPsGraphTable *)graph_table)->free_sample_status(); +} NodeQueryResult GraphGpuWrapper::query_node_list(int gpu_id, int start, int query_size) { return ((GpuPsGraphTable *)graph_table) ->query_node_list(gpu_id, start, query_size); } +void GraphGpuWrapper::load_node_weight(int type_id, int idx, std::string path) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->load_node_weight(type_id, idx, path); +} + +void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->export_partition_files(idx, file_path); +} #endif } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index b638311304773..a34e752fc7ea7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include #include @@ -22,10 +23,13 @@ namespace framework { #ifdef PADDLE_WITH_HETERPS class GraphGpuWrapper { public: - static GraphGpuWrapper* GetInstance() { - static GraphGpuWrapper wrapper; - return &wrapper; + static std::shared_ptr GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::GraphGpuWrapper()); + } + return s_instance_; } + static std::shared_ptr s_instance_; void initialize(); void test(); void set_device(std::vector ids); @@ -39,6 +43,8 @@ class GraphGpuWrapper { void load_node_file(std::string name, std::string filepath); int32_t load_next_partition(int idx); int32_t get_partition_num(int idx); + void load_node_weight(int type_id, int idx, std::string path); + void export_partition_files(int idx, std::string file_path); std::vector get_partition(int idx, int num); void make_partitions(int idx, int64_t byte_size, int device_len); void make_complementary_graph(int idx, int64_t byte_size); @@ -53,6 +59,8 @@ class GraphGpuWrapper { std::vector& key, int sample_size); + void init_sample_status(); + void free_sample_status(); std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; std::vector> table_feat_mapping; @@ -62,7 +70,7 @@ class GraphGpuWrapper { ::paddle::distributed::GraphParameter table_proto; std::vector device_id_mapping; int search_level = 1; - char* graph_table; + void* graph_table; }; #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index b860ea5d39cb5..234aa15ebf74d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -41,9 +41,7 @@ limitations under the License. 
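Usage sketch for the reworked wrapper interface above; the device ids, sample size, and int64_t element type are assumptions, since this flattened diff elides the template arguments. GetInstance() now hands out a shared_ptr to a lazily created singleton instead of a pointer to a function-local static, and the sample-status helpers simply forward to the underlying GpuPsGraphTable. The lazy reset is not synchronized, so the first GetInstance() call should happen before any concurrent use.

auto wrapper = paddle::framework::GraphGpuWrapper::GetInstance();
wrapper->set_device({0, 1});     // physical GPU ids (assumed std::vector<int>)
wrapper->init_sample_status();   // optional per-edge status buffers
std::vector<int64_t> keys = {37, 96};
auto neighbors =
    wrapper->graph_neighbor_sample(/*gpu_id=*/0, keys, /*sample_size=*/5);
wrapper->free_sample_status();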
*/ #include "xpu/kernel/simd.h" #endif -#if defined(PADDLE_WITH_XPU_KP) #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" -#endif namespace paddle { namespace framework { @@ -120,8 +118,8 @@ class HashTable { StreamType stream); template - void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index, - StreamType stream); + void insert(const KeyType* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, StreamType stream); template void get(const KeyType* d_keys, ValType* d_vals, size_t len, @@ -132,10 +130,8 @@ class HashTable { void show(); -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config); void set_embedx_sgd(const OptimizerConfig& optimizer_config); -#endif template void dump_to_cpu(int devid, StreamType stream); @@ -178,9 +174,10 @@ class HashTable { TableContainer* container_; #elif defined(PADDLE_WITH_XPU_KP) XPUCacheArray* container_; - OptimizerConfig* xpu_optimizer_config_; - OptimizerConfig cpu_optimizer_config_; #endif + OptimizerConfig* device_optimizer_config_; + OptimizerConfig host_optimizer_config_; + int BLOCK_SIZE_{256}; float LOAD_FACTOR{0.75f}; size_t capacity_; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 87b62c6d380a4..57741c2c19b1c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -50,7 +50,8 @@ __global__ void insert_kernel(Table* table, template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, - size_t len, char* pool, int start_index) { + size_t len, char* pool, size_t feature_value_size, + int start_index) { ReplaceOp op; thrust::pair kv; @@ -58,7 +59,8 @@ __global__ void insert_kernel(Table* table, if (i < len) { kv.first = keys[i]; - kv.second = (Table::mapped_type)(pool + (start_index + i) * 80); + uint64_t offset = uint64_t(start_index + i) * feature_value_size; + kv.second = (Table::mapped_type)(pool + offset); auto it = table->insert(kv, op); assert(it != table->end() && "error: insert fails: table is full"); } @@ -81,20 +83,53 @@ __global__ void search_kernel(Table* table, template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, - char* const vals, size_t len, + char* vals, size_t len, size_t pull_feature_value_size) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + // return; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - *(FeatureValue*)(vals + i * pull_feature_value_size) = *(it->second); + uint64_t offset = i * pull_feature_value_size; + FeatureValue* cur = (FeatureValue*)(vals + offset); + FeatureValue& input = *(FeatureValue*)(it->second); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; + } + } else { + if (keys[i] != 0) { + printf("warning::pull miss key: %d", keys[i]); + } + FeatureValue* cur = (FeatureValue*)(vals + i * pull_feature_value_size); + cur->delta_score = 0; + cur->show = 0; + cur->clk = 0; + cur->slot = -1; + cur->lr = 0; + cur->lr_g2sum = 0; + cur->mf_size = 0; + cur->mf_dim = 8; + cur->cpu_ptr; + for (int j = 0; j < cur->mf_dim + 1; j++) { + 
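Usage sketch for the widened pool insert above, with hypothetical buffer names and sizes: the pool stride is no longer the hard-coded 80 bytes, so the caller passes the aligned per-entry footprint and insert_kernel addresses slot i at pool + (start_index + i) * feature_value_size.

size_t feature_value_size =  // aligned record footprint; TYPEALIGN is added in heter_comm.h below
    TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim + 1));
table->insert(d_keys, key_count, d_hbm_pool, feature_value_size,
              /*start_index=*/0, stream);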
cur->mf[j] = 0; + } } } } template __global__ void update_kernel(Table* table, + const OptimizerConfig& optimizer_config, const typename Table::key_type* const keys, const GradType* const grads, size_t len, Sgd sgd) { @@ -102,13 +137,14 @@ __global__ void update_kernel(Table* table, if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - sgd.update_value((it.getter())->second, grads[i]); + sgd.update_value(optimizer_config, (it.getter())->second, grads[i]); } } } template __global__ void dy_mf_update_kernel(Table* table, + const OptimizerConfig& optimizer_config, const typename Table::key_type* const keys, const char* const grads, size_t len, Sgd sgd, size_t grad_value_size) { @@ -117,9 +153,11 @@ __global__ void dy_mf_update_kernel(Table* table, auto it = table->find(keys[i]); if (it != table->end()) { FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); - sgd.dy_mf_update_value((it.getter())->second, *cur); + sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); } else { - printf("yxf::push miss key: %d", keys[i]); + if (keys[i] != 0) { + printf("warning::push miss key: %d", keys[i]); + } } } } @@ -127,6 +165,9 @@ __global__ void dy_mf_update_kernel(Table* table, template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); + cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig)); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); rwlock_.reset(new phi::RWLock); } @@ -135,6 +176,22 @@ HashTable::~HashTable() { delete container_; } +template +void HashTable::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { + host_optimizer_config_.set_sparse_sgd(optimizer_config); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); +} + +template +void HashTable::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { + host_optimizer_config_.set_embedx_sgd(optimizer_config); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); +} + template void HashTable::show() { container_->print(); @@ -180,7 +237,8 @@ void HashTable::insert(const KeyType* d_keys, template template void HashTable::insert(const KeyType* d_keys, size_t len, - char* pool, size_t start_index, + char* pool, size_t feature_value_size, + size_t start_index, StreamType stream) { if (len == 0) { return; @@ -189,8 +247,8 @@ void HashTable::insert(const KeyType* d_keys, size_t len, return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - insert_kernel<<>>(container_, d_keys, len, - pool, start_index); + insert_kernel<<>>( + container_, d_keys, len, pool, feature_value_size, start_index); } template @@ -279,8 +337,8 @@ void HashTable::update(const KeyType* d_keys, return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - update_kernel<<>>(container_, d_keys, - d_grads, len, sgd); + update_kernel<<>>( + container_, *device_optimizer_config_, d_keys, d_grads, len, sgd); } template @@ -293,11 +351,18 @@ void HashTable::update(const KeyType* d_keys, } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_update_kernel<<>>( - container_, d_keys, d_grads, len, sgd, push_grad_value_size_); + container_, *device_optimizer_config_, d_keys, d_grads, len, sgd, + push_grad_value_size_); } template class HashTable; +template class HashTable; template class HashTable; +template class HashTable; +template class 
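Sketch of the host-to-device optimizer-config mirror introduced above, written as a hypothetical free function: the host copy is the source of truth, and every setter pushes the whole struct into the device-resident copy that the update kernels read, so new bounds and learning rates take effect without re-templating the table on the config.

void push_optimizer_config(const OptimizerConfig& host_cfg,
                           OptimizerConfig* device_cfg /* cudaMalloc'ed */) {
  cudaMemcpy(device_cfg, &host_cfg, sizeof(OptimizerConfig),
             cudaMemcpyHostToDevice);
}
// set_sparse_sgd / set_embedx_sgd update host_optimizer_config_ first and then
// perform exactly this copy into device_optimizer_config_.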
HashTable; +template class HashTable; +template class HashTable; +template class HashTable; template class HashTable; template class HashTable; @@ -306,14 +371,25 @@ template void HashTable::get< paddle::framework::FeatureValue* d_vals, size_t len, cudaStream_t stream); +template void +HashTable::get( + const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); + template void HashTable::get(const long* d_keys, int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get(const long* d_keys, + long* d_vals, size_t len, + cudaStream_t stream); template void HashTable::get( const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream); // template void // HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t @@ -324,11 +400,23 @@ template void HashTable::insert< const paddle::framework::FeatureValue* d_vals, size_t len, cudaStream_t stream); +template void HashTable:: + insert(const unsigned long* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, + cudaStream_t stream); + template void HashTable::insert(const long* d_keys, const int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::insert(const long* d_keys, + const long* d_vals, + size_t len, + cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, const int* d_vals, size_t len, + cudaStream_t stream); template void HashTable::insert( const long* d_keys, const unsigned long* d_vals, size_t len, cudaStream_t stream); @@ -337,10 +425,9 @@ template void HashTable::insert( const long* d_keys, const unsigned int* d_vals, size_t len, cudaStream_t stream); -// template void HashTable::insert< -// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, -// size_t start_index, cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, const long* d_vals, size_t len, + cudaStream_t stream); template void HashTable:: dump_to_cpu(int devid, cudaStream_t stream); @@ -356,6 +443,16 @@ template void HashTable::update< sgd, cudaStream_t stream); +template void +HashTable::update< + Optimizer, + cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t len, + Optimizer + sgd, + cudaStream_t stream); + // template void HashTable::update< // Optimizer -__global__ void update_kernel(OptimizerConfig& optimizer_config, Table& table, +__global__ void update_kernel(Table& table, OptimizerConfig& optimizer_config, const KeyType* const keys, const GradType* const grads, long long len) { int cid = core_id(); @@ -202,12 +202,9 @@ HashTable::HashTable(size_t capacity) { sizeof(XPUCacheArray)); xpu_memcpy((void*)container_, &tmp_container, sizeof(XPUCacheArray), XPU_HOST_TO_DEVICE); - - OptimizerConfig tmp_opt_config; - xpu_malloc(reinterpret_cast(&xpu_optimizer_config_), + xpu_malloc(reinterpret_cast(&device_optimizer_config_), sizeof(OptimizerConfig)); - - xpu_memcpy((void*)xpu_optimizer_config_, &tmp_opt_config, + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); rwlock_.reset(new phi::RWLock); @@ -216,7 +213,7 @@ HashTable::HashTable(size_t capacity) { template 
HashTable::~HashTable() { xpu_free((void*)container_); - xpu_free((void*)xpu_optimizer_config_); + xpu_free((void*)device_optimizer_config_); } template @@ -227,28 +224,16 @@ void HashTable::show() { template void HashTable::set_sparse_sgd( const OptimizerConfig& optimizer_config) { - cpu_optimizer_config_.nonclk_coeff = optimizer_config.nonclk_coeff; - cpu_optimizer_config_.clk_coeff = optimizer_config.clk_coeff; - cpu_optimizer_config_.min_bound = optimizer_config.min_bound; - cpu_optimizer_config_.max_bound = optimizer_config.max_bound; - cpu_optimizer_config_.learning_rate = optimizer_config.learning_rate; - cpu_optimizer_config_.initial_g2sum = optimizer_config.initial_g2sum; - cpu_optimizer_config_.initial_range = optimizer_config.initial_range; - xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_, + host_optimizer_config_.set_sparse_sgd(optimizer_config); + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); } template void HashTable::set_embedx_sgd( const OptimizerConfig& optimizer_config) { - cpu_optimizer_config_.mf_create_thresholds = - optimizer_config.mf_create_thresholds; - cpu_optimizer_config_.mf_learning_rate = optimizer_config.mf_learning_rate; - cpu_optimizer_config_.mf_initial_g2sum = optimizer_config.mf_initial_g2sum; - cpu_optimizer_config_.mf_initial_range = optimizer_config.mf_initial_range; - cpu_optimizer_config_.mf_min_bound = optimizer_config.mf_min_bound; - cpu_optimizer_config_.mf_max_bound = optimizer_config.mf_max_bound; - xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_, + host_optimizer_config_.set_embedx_sgd(optimizer_config); + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); } @@ -306,7 +291,7 @@ void HashTable::update(const KeyType* d_keys, long long c_len = (long long)len; update_kernel, GradType><<<4, 64, stream>>>( - *xpu_optimizer_config_, *container_, d_keys, d_grads, c_len); + *container_, *device_optimizer_config_, d_keys, d_grads, c_len); } template diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 6379f7ee91264..815f06b0824e6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -15,10 +15,13 @@ limitations under the License. */ #pragma once #include #include +#include "cub/cub.cuh" +#include "cub/util_allocator.cuh" #if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/timer.h" #include "thrust/pair.h" #elif defined(PADDLE_WITH_XPU_KP) // #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" @@ -38,6 +41,9 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) + template class HeterComm { public: @@ -50,9 +56,13 @@ class HeterComm { int* left, int* right, int gpu_num); void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len); // NOLINT + void dynamic_merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, + size_t len, int& uniq_len); void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len, size_t chunk_size, int stream_num); + void build_ps(int num, KeyType* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, int stream_num); void dump(); void show_one_table(int gpu_num); int get_index_by_devid(int devid); @@ -65,10 +75,8 @@ class HeterComm { void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len); #endif -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config); void set_embedx_sgd(const OptimizerConfig& optimizer_config); -#endif int log2i(int x); @@ -98,6 +106,11 @@ class HeterComm { nccl_inter_comms_ = inter_comms; node_size_ = comm_size; } + + void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + multi_mf_dim_ = multi_mf_dim; + max_mf_dim_ = max_mf_dim; + } #endif bool need_transfer(int send_id, int receive_id) { @@ -116,8 +129,8 @@ class HeterComm { char* key_storage; char* val_storage; int sync; - int key_bytes_len; - int val_bytes_len; + size_t key_bytes_len; + size_t val_bytes_len; int dev_num; }; @@ -208,12 +221,18 @@ class HeterComm { void destroy_storage(int start_index, int end_index); void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, GradType* src_val); + void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, + KeyType* src_key, char* src_val, size_t val_size); void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val); + void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, + char* src_val, size_t val_size); protected: using Table = HashTable; + using PtrTable = HashTable; std::vector tables_; + std::vector ptr_tables_; std::shared_ptr resource_; std::vector> path_; float load_factor_{0.75}; @@ -223,6 +242,7 @@ class HeterComm { private: int topo_aware_{0}; std::vector storage_; + DynamicGradMerger merger_; int feanum_{1800 * 2048}; int multi_node_{0}; int node_size_; @@ -230,6 +250,8 @@ class HeterComm { #if defined(PADDLE_WITH_CUDA) std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; + int multi_mf_dim_{8}; + int max_mf_dim_ = 8; std::vector> allocators_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 098adc2bdeb88..64b177abb8638 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS #include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_XPU_KP @@ -22,20 +23,32 @@ limitations under the License. 
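A worked example of the TYPEALIGN macro defined above: it rounds a byte length up to the next multiple of ALIGNVAL (a power of two), for instance TYPEALIGN(8, 41) == 48 and TYPEALIGN(8, 48) == 48. The patch uses it to size the variable-width records stored in flat char pools so every slot stays 8-byte aligned:

size_t val_type_size =
    TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1));
size_t grad_type_size =
    TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float)));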
*/ namespace paddle { namespace framework { - template HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { + VLOG(1) << "Construct new HeterComm"; resource_ = resource; storage_.resize(resource_->total_device()); + multi_mf_dim_ = resource->multi_mf(); for (int i = 0; i < resource_->total_device(); ++i) { #if defined(PADDLE_WITH_CUDA) platform::CUDADeviceGuard guard(resource_->dev_id(i)); allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT #endif - auto table = new Table(capacity / load_factor_); - tables_.push_back(table); + if (!multi_mf_dim_) { + auto table = new Table(capacity / load_factor_); + tables_.push_back(table); + } else { + max_mf_dim_ = resource_->max_mf_dim(); + size_t val_type_size = TYPEALIGN( + 8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + size_t grad_type_size = TYPEALIGN( + 8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto ptr_table = new PtrTable(capacity / load_factor_); + ptr_table->set_feature_value_size(val_type_size, grad_type_size); + ptr_tables_.push_back(ptr_table); + } if (multi_node_) { storage_[i].init(feanum_, resource_->dev_id(i)); } @@ -193,9 +206,10 @@ void HeterComm::walk_to_dest(int start_index, memory_copy(dst_place, node.key_storage, src_place, reinterpret_cast(src_key + h_left[i]), node.key_bytes_len, node.in_stream); -#if defined(PADDLE_WITH_CUDA) // adapt for gpu-graph - cudaMemsetAsync(node.val_storage, -1, node.val_bytes_len, node.in_stream); -#endif + // #if defined(PADDLE_WITH_CUDA) // adapt for gpu-graph + // cudaMemsetAsync(node.val_storage, -1, node.val_bytes_len, + // node.in_stream); + // #endif if (need_copy_val) { memory_copy(dst_place, node.val_storage, src_place, @@ -237,95 +251,132 @@ void HeterComm::walk_to_dest(int start_index, } template -void HeterComm::walk_to_src(int start_index, - int num, int* h_left, - int* h_right, - ValType* src_val) { +void HeterComm::walk_to_dest( + int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, + char* src_val, size_t val_size) { + int need_copy_val = 0; + if (src_val) { + need_copy_val = 1; + } std::queue que; + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + int size = path_[start_index][i].nodes_.size(); + auto& node = path_[start_index][i].nodes_[0]; + CopyTask t(&path_[start_index][i], 0); + que.push(t); + cudaMemcpyAsync(node.key_storage, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, cudaMemcpyDefault, node.in_stream); + if (need_copy_val) { + cudaMemcpyAsync(node.val_storage, + src_val + uint64_t(h_left[i]) * uint64_t(val_size), + node.val_bytes_len, cudaMemcpyDefault, node.in_stream); + } + } + while (!que.empty()) { + CopyTask& cur_task = que.front(); + que.pop(); + if (cur_task.path->nodes_[cur_task.step].sync) { + cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream); + } + if (cur_task.step != cur_task.path->nodes_.size() - 1) { + int cur_step = cur_task.step; + CopyTask c(cur_task.path, cur_step + 1); + que.push(c); + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); + if (need_copy_val) { + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, + 
cur_task.path->nodes_[cur_step + 1].in_stream); + } + } + } +} - for (int i = 0; i < num; i++) { +template +void HeterComm::walk_to_src( + int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, + size_t val_size) { + std::queue que; + for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } int cur_step = path_[start_index][i].nodes_.size() - 1; auto& node = path_[start_index][i].nodes_[cur_step]; - - auto src_dev_id = resource_->dev_id(i); - auto src_place = DevPlace(src_dev_id); - if (cur_step == 0) { - auto dst_dev_id = resource_->dev_id(start_index); - auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, reinterpret_cast(src_val + h_left[i]), - src_place, node.val_storage, node.val_bytes_len, - node.out_stream); + cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); } else { CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); - - auto dst_dev_id = - resource_->dev_id(path_[start_index][i].nodes_[cur_step - 1].dev_num); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, - path_[start_index][i].nodes_[cur_step - 1].val_storage, - src_place, node.val_storage, - path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, - path_[start_index][i].nodes_[cur_step - 1].out_stream); + cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, + node.val_storage, + path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + path_[start_index][i].nodes_[cur_step - 1].out_stream); } } - while (!que.empty()) { CopyTask& cur_task = que.front(); que.pop(); int cur_step = cur_task.step; if (cur_task.path->nodes_[cur_step].sync) { - sync_stream(cur_task.path->nodes_[cur_step].out_stream); + cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); } - - auto src_dev_id = - resource_->dev_id(cur_task.path->nodes_[cur_step].dev_num); - auto src_place = DevPlace(src_dev_id); - if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - - auto dst_dev_id = - resource_->dev_id(cur_task.path->nodes_[cur_step - 1].dev_num); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, cur_task.path->nodes_[cur_step - 1].val_storage, - src_place, cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, - cur_task.path->nodes_[cur_step - 1].out_stream); - + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().dev_num; - - auto dst_dev_id = resource_->dev_id(end_index); - auto dst_place = DevPlace(dst_dev_id); - - memory_copy(dst_place, - reinterpret_cast(src_val + h_left[end_index]), - src_place, cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, - cur_task.path->nodes_[cur_step].out_stream); + cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream); } } } template HeterComm::~HeterComm() { - for (auto& table : tables_) { - delete table; - table = nullptr; + if (!multi_mf_dim_) { + for (auto& table : tables_) { + delete table; + table = nullptr; + } + } else { + for (auto& 
table : ptr_tables_) { + delete table; + table = nullptr; + } + for (auto& table : tables_) { + delete table; + table = nullptr; + } } } template -void HeterComm::show_one_table(int num) { - tables_[num]->show(); +void HeterComm::show_one_table(int gpu_num) { + if (!multi_mf_dim_) { + tables_[gpu_num]->show(); + } } template @@ -342,7 +393,6 @@ int HeterComm::get_index_by_devid(int devid) { return resource_->get_index_by_devid(devid); } -#if defined(PADDLE_WITH_XPU_KP) template void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { @@ -358,7 +408,6 @@ void HeterComm::set_embedx_sgd( table->set_embedx_sgd(optimizer_config); } } -#endif template void HeterComm::build_ps( @@ -419,59 +468,179 @@ void HeterComm::build_ps( } } +template +void HeterComm::build_ps(int num, KeyType* h_keys, + char* pool, size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { + if (len <= 0) { + return; + } + int dev_id = resource_->dev_id(num); + + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + + // use hbm pool + std::vector d_key_bufs; + + ppStream streams[stream_num]; // NOLINT + for (int i = 0; i < stream_num; ++i) { + create_stream(&(streams[i])); + auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); + d_key_bufs.push_back(std::move(d_k_buf)); + } + + int cur_len = 0; + int cur_stream = 0; + + while (cur_len < len) { + cur_stream = cur_stream % stream_num; + auto cur_use_stream = streams[cur_stream]; +#if defined(PADDLE_WITH_XPU_KP) + cur_use_stream = 0; +#endif + int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; + + auto dst_place = place; + auto src_place = platform::CPUPlace(); + + memory_copy( + dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); + ptr_tables_[num]->insert( + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, + pool, feature_value_size, cur_len, cur_use_stream); + cur_stream += 1; + cur_len += tmp_len; + } + for (int i = 0; i < stream_num; ++i) { + sync_stream(streams[i]); + destroy_stream(streams[i]); + } +} + template void HeterComm::merge_grad( int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len) { // NOLINT - int dev_id = resource_->dev_id(dev_num); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); - size_t temp_storage_bytes; - auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->sort_pairs(NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); temp_storage_bytes = 0; - auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - heter_comm_kernel_->reduce_by_key(NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); - if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, 
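Usage sketch for the pool-based build_ps above, with hypothetical names and sizes: keys are streamed to the device in chunk_size batches across stream_num streams, while the values never move because they already live in the preallocated HBM pool and are addressed by feature_value_size strides (computed with TYPEALIGN as shown earlier).

comm->build_ps(/*num=*/dev_index, h_keys.data(), hbm_pool_ptr, h_keys.size(),
               feature_value_size, /*chunk_size=*/500000, /*stream_num=*/8);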
temp_storage_bytes); } - heter_comm_kernel_->reduce_by_key( d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); - auto dst_place = platform::CPUPlace(); auto src_place = place; memory_copy(dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), stream); - sync_stream(stream); } +template +void HeterComm::dynamic_merge_grad( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int& uniq_len) { + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + size_t temp_storage_bytes; + + // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + + auto d_merge_grads = memory::Alloc(place, len * grad_value_size); + GradType* d_merge_grads_ptr = + reinterpret_cast(d_merge_grads->ptr()); + + auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); + uint32_t* d_fea_num_info_ptr = + reinterpret_cast(d_fea_num_info->ptr()); + uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; + uint32_t* d_idx = (uint32_t*)&d_index[len]; + int* d_merged_size = (int*)&d_idx[len]; + int grid_size = (len - 1) / block_size_ + 1; + heter_comm_kernel_->fill_idx(d_idx, len, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, + 0, 8 * sizeof(KeyType), stream)); + void* d_buff = NULL; + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, + d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, + d_merged_size, len, stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, + d_fea_num_info_ptr, d_merged_size, len, stream)); + + cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), + cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + assert(d_merged_size > 0); + uint32_t* d_offset = (uint32_t*)&d_index[len]; + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, + stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, + uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + heter_comm_kernel_->merge_gradient( + d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, + (char*)d_merge_grads_ptr, uniq_len, grad_value_size, merger_, 
stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, + grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +} + template void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, @@ -530,8 +699,6 @@ void HeterComm::pull_sparse(int num, AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(num, 0); - // int grid_size = (len - 1) / block_size_ + 1; - int h_left[total_device]; // NOLINT int h_right[total_device]; // NOLINT @@ -563,10 +730,11 @@ void HeterComm::pull_sparse(int num, auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - + size_t val_type_size = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(ValType)); + auto d_shard_vals = memory::Alloc(place, len * val_type_size); ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); @@ -590,9 +758,8 @@ void HeterComm::pull_sparse(int num, continue; } create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(ValType)); + shard_len * val_type_size); } - walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); for (int i = 0; i < total_device; ++i) { @@ -601,14 +768,11 @@ void HeterComm::pull_sparse(int num, } auto& node = path_[num][i].nodes_.back(); sync_stream(node.in_stream); - AnyDeviceGuard guard(resource_->dev_id(i)); - - tables_[i]->rwlock_->RDLock(); - tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + ptr_tables_[i]->rwlock_->RDLock(); + ptr_tables_[i]->get(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num)); } for (int i = 0; i < total_device; ++i) { @@ -616,21 +780,18 @@ void HeterComm::pull_sparse(int num, if (h_left[i] == -1) { continue; } - tables_[i]->rwlock_->UNLock(); + ptr_tables_[i]->rwlock_->UNLock(); } - - walk_to_src(num, total_device, h_left, h_right, d_shard_vals_ptr); - + walk_to_src(num, total_device, h_left, h_right, + reinterpret_cast(d_shard_vals_ptr), val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } - - heter_comm_kernel_->fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, - stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, + val_type_size, stream); sync_stream(stream); - for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -654,6 +815,8 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -692,21 +855,33 @@ void HeterComm::push_sparse(int dev_num, auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - 
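Standalone sketch of the dedupe scheme that dynamic_merge_grad above is built on, shown on bare uint64 keys with hypothetical buffer names: sort (key, original position) pairs, run-length encode the sorted keys into per-unique-key counts, exclusive-scan the counts into start offsets, and then let a merge kernel fold every gradient that maps to the same key. The real code reuses a single temp-storage allocation across stages and scans exactly *d_num_unique counts after copying that value back to the host; this sketch allocates per stage to stay short.

#include <cub/cub.cuh>

void dedupe_plan(const uint64_t* d_keys, uint64_t* d_sorted_keys,
                 const uint32_t* d_idx, uint32_t* d_index,
                 uint64_t* d_unique_keys, uint32_t* d_counts,
                 uint32_t* d_offsets, int* d_num_unique, int len,
                 cudaStream_t stream) {
  auto run = [&](auto stage) {  // query temp size, allocate, run, free
    void* tmp = nullptr;
    size_t bytes = 0;
    stage(nullptr, bytes);
    cudaMalloc(&tmp, bytes);
    stage(tmp, bytes);
    cudaFree(tmp);
  };
  run([&](void* t, size_t& b) {  // 1) sort (key, original position) pairs
    cub::DeviceRadixSort::SortPairs(t, b, d_keys, d_sorted_keys, d_idx, d_index,
                                    len, 0, 8 * sizeof(uint64_t), stream);
  });
  run([&](void* t, size_t& b) {  // 2) one count per unique key
    cub::DeviceRunLengthEncode::Encode(t, b, d_sorted_keys, d_unique_keys,
                                       d_counts, d_num_unique, len, stream);
  });
  run([&](void* t, size_t& b) {  // 3) counts -> start offsets
    // len is an upper bound here; dynamic_merge_grad scans exactly uniq_len.
    cub::DeviceScan::ExclusiveSum(t, b, d_counts, d_offsets, len, stream);
  });
  // 4) merge_gradient then reduces grads[d_index[off .. off + count)] into the
  //    output slot of each unique key, which is what DynamicGradMerger does.
}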
auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); - GradType* d_shard_grads_ptr = - reinterpret_cast(d_shard_grads->ptr()); + + GradType* d_shard_grads_ptr; + if (!multi_mf_dim_) { + auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } else { + auto d_shard_grads = memory::Alloc(place, len * grad_value_size); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } int uniq_len = len; - merge_grad(dev_num, d_keys, d_grads, len, uniq_len); + dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - // int grid_size = (uniq_len - 1) / block_size_ + 1; + int grid_size = (uniq_len - 1) / block_size_ + 1; split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, - d_shard_grads_ptr, d_grads, d_idx_ptr, - uniq_len, stream); + if (!multi_mf_dim_) { + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, stream); + } else { + heter_comm_kernel_->dy_mf_fill_shard_grads( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, grad_value_size, stream); + } sync_stream(stream); @@ -722,12 +897,22 @@ void HeterComm::push_sparse(int dev_num, if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); + if (!multi_mf_dim_) { + create_storage(dev_num, i, shard_len * sizeof(KeyType), + shard_len * sizeof(GradType)); + } else { + create_storage(dev_num, i, shard_len * sizeof(KeyType), + shard_len * grad_value_size); + } } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, - d_shard_grads_ptr); + if (!multi_mf_dim_) { + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + d_shard_grads_ptr); + } else { + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), grad_value_size); + } for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -737,17 +922,28 @@ void HeterComm::push_sparse(int dev_num, sync_stream(node.in_stream); AnyDeviceGuard guard(resource_->dev_id(i)); - tables_[i]->rwlock_->WRLock(); - tables_[i]->update(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, sgd, - resource_->remote_stream(i, dev_num)); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->WRLock(); + tables_[i]->update(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, dev_num)); + } else { + ptr_tables_[i]->rwlock_->WRLock(); + ptr_tables_[i]->update(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, dev_num)); + } } for (int i = 0; i < total_device; ++i) { sync_stream(resource_->remote_stream(i, dev_num)); if (h_left[i] != -1) { - tables_[i]->rwlock_->UNLock(); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->UNLock(); + } else { + ptr_tables_[i]->rwlock_->UNLock(); + } } } @@ -1079,11 +1275,13 @@ void HeterComm::end_pass() { tables_[index]->dump_to_cpu(dev_id, stream); }; - for (int i = 0; i < total_device; ++i) { - threads.push_back(std::thread(dump_to_cpu_func, i)); - } - for (auto& t : threads) { - t.join(); + if (!multi_mf_dim_) { + for (int i = 0; i < total_device; ++i) { + threads.push_back(std::thread(dump_to_cpu_func, i)); + } + 
for (auto& t : threads) { + t.join(); + } } } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index bdeb696a92bce..94d7929b2947d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -117,6 +117,53 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } +template +__global__ void dy_mf_fill_shard_grads_kernel( + KeyType* d_shard_keys, KeyType* d_keys, GradType* d_shard_grads, + GradType* d_grads, T* idx, size_t len, size_t grad_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_shard_keys[i] = d_keys[idx[i]]; + *(GradType*)((char*)d_shard_grads + i * grad_value_size) = + *(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + } +} + +__global__ void merge_gradients_kernel(const uint32_t* offset, + const uint32_t* fea_num, + const uint32_t* index, const char* input, + char* output, int n, + size_t grad_value_size, + DynamicGradMerger& merger_) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < n) { + uint32_t start = offset[i]; + uint32_t num = fea_num[i]; + int ori_index = index[start]; + FeaturePushValue& out = *(FeaturePushValue*)(output + i * grad_value_size); + FeaturePushValue& in = + *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.update_one(out, in); + for (int j = 1; j < num; ++j) { + ori_index = index[start + j]; + in = *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.merge_one(out, in); + } + } +} + +template +__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, + T* idx, size_t len, size_t val_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + uint64_t new_offset = uint64_t(idx[i]) * val_size; + *(ValType*)((char*)d_vals + new_offset) = + *(ValType*)((char*)d_shard_vals + i * val_size); + } +} + // cuda implemention of heter_comm_kernel.h template void HeterCommKernel::fill_idx(T* idx, long long len, @@ -207,8 +254,42 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } +template +void HeterCommKernel::dy_mf_fill_shard_grads( + KeyType* d_shard_keys, KeyType* d_keys, GradType* d_shard_grads, + GradType* d_grads, T* idx, long long len, size_t grad_value_size, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + dy_mf_fill_shard_grads_kernel<<>>( + d_shard_keys, d_keys, d_shard_grads, d_grads, idx, c_len, + grad_value_size); +} + +template +void HeterCommKernel::merge_gradient( + const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_value_size, + DynamicGradMerger& merger_, const StreamType& stream) { + int grid_size = (n - 1) / block_size_ + 1; + merge_gradients_kernel<<>>( + offset, fea_num, index, input, output, n, grad_value_size, merger_); +} + +template +void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, ValType* d_vals, + T* idx, long long len, size_t val_size, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + dy_mf_fill_dvals_kernel<<>>( + d_shard_vals, d_vals, idx, c_len, val_size); +} + template void HeterCommKernel::fill_idx( int* idx, long long len, const cudaStream_t& stream); +template void HeterCommKernel::fill_idx( + uint32_t* idx, long long len, const 
cudaStream_t& stream); template void HeterCommKernel::calc_shard_offset( int* idx, int* left, int* right, long long len, int total_devs, @@ -270,6 +351,23 @@ template void HeterCommKernel::reduce_by_key< paddle::framework::FeaturePushValue* d_aggregates_out, int* d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous); +template void HeterCommKernel::dy_mf_fill_shard_grads< + unsigned long, paddle::framework::FeaturePushValue, int, cudaStream_t>( + unsigned long* d_shard_keys, unsigned long* d_keys, + paddle::framework::FeaturePushValue* d_shard_grads, + paddle::framework::FeaturePushValue* d_grads, int* idx, long long len, + size_t grad_value_size, const cudaStream_t& stream); + +template void HeterCommKernel::merge_gradient( + const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, + const char* input, char* output, int n, size_t grad_value_size, + DynamicGradMerger& merger_, const cudaStream_t& stream); + +template void HeterCommKernel::dy_mf_fill_dvals( + paddle::framework::FeatureValue* d_shard_vals, + paddle::framework::FeatureValue* d_vals, int* idx, long long len, + size_t val_size, const cudaStream_t& stream); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 9d2ee5d272c72..4f866ccda8201 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -27,6 +27,42 @@ limitations under the License. */ namespace paddle { namespace framework { +struct DynamicGradMerger { + template + CUB_RUNTIME_FUNCTION __forceinline__ __device__ T + operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + + return out; + } + + template + __device__ __forceinline__ void update_one(T& output, const T& input) { + output.slot = input.slot; + output.show = input.show; + output.clk = input.clk; + output.mf_dim = input.mf_dim; + output.lr_g = input.lr_g; + for (int i = 0; i < output.mf_dim; ++i) { + output.mf_g[i] = input.mf_g[i]; + } + } + template + __device__ __forceinline__ void merge_one(T& output, const T& input) { + output.show += input.show; + output.clk += input.clk; + output.lr_g += input.lr_g; + for (int i = 0; i < input.mf_dim; ++i) { + output.mf_g[i] += input.mf_g[i]; + } + } +}; + class HeterCommKernel { public: HeterCommKernel() {} @@ -80,6 +116,24 @@ class HeterCommKernel { StreamType stream = NULL, bool debug_synchronous = false); + template + void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, + T* idx, long long len, size_t grad_value_size, + const StreamType& stream); + + template + void merge_gradient(const uint32_t* offset, const uint32_t* fea_num, + const uint32_t* index, const char* input, char* output, + int n, size_t grad_value_size, DynamicGradMerger& merger_, + const StreamType& stream); + + template + void dy_mf_fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + long long len, size_t val_size, + const StreamType& stream); + private: int block_size_{256}; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps index f73757902fef6..b44ea1807fd65 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps +++ 
b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps @@ -18,6 +18,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_XPU_KP) #include #include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/debug.h" // NOLINT #include "xpu/kernel/math.h" #include "xpu/kernel/simd.h" #endif @@ -91,7 +92,7 @@ __global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); GM2LM(idx + i, local_idx, read_len * sizeof(T)); - for (int k = 0; k < read_len; k++) { + for (int k = 0; k < read_len - 1; k++) { if (local_idx[k] != local_idx[k + 1]) { int real_idx = i + k; local_right[local_idx[k]] = real_idx; @@ -102,7 +103,7 @@ __global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, local_left[local_idx[i]] = i; } if (i + read_len == len) { - local_right[local_idx[len - 1]] = len - 1; + local_right[local_idx[read_len - 1]] = len - 1; } } // to be optimized: call LM2GM too frequently @@ -150,7 +151,7 @@ __global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, int thread_id = ncores * cluster_id() + cid; int nthreads = ncores * cluster_num(); const int buf_size = 400; - __local__ KeyType local_keys[buf_size]; + // __local__ KeyType local_keys[buf_size]; __local__ KeyType local_shard_keys[buf_size]; __local__ T local_idx[buf_size]; int len_per_loop = min(buf_size, roundup_div(len, nthreads)); @@ -158,10 +159,11 @@ __global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, i += nthreads * len_per_loop) { // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); - GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + // GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); GM2LM(idx + i, local_idx, read_len * sizeof(T)); for (int k = 0; k < read_len; k++) { - local_shard_keys[k] = local_keys[local_idx[k]]; + GM2LM(d_keys + local_idx[k], &local_shard_keys[k], 1 * sizeof(KeyType)); + // local_shard_keys[k] = local_keys[local_idx[k]]; } LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); } @@ -181,9 +183,9 @@ __global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, int thread_id = ncores * cluster_id() + cid; int nthreads = ncores * cluster_num(); - const int buf_size = 100; - __local__ KeyType local_keys[buf_size]; - __local__ GradType local_grads[buf_size]; + const int buf_size = 50; + // __local__ KeyType local_keys[buf_size]; + // __local__ GradType local_grads[buf_size]; __local__ KeyType local_shard_keys[buf_size]; __local__ GradType local_shard_grads[buf_size]; __local__ T local_idx[buf_size]; @@ -193,12 +195,15 @@ __global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, i += nthreads * len_per_loop) { // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); - GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); - GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType)); + // GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + // GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType)); GM2LM(idx + i, local_idx, read_len * sizeof(T)); for (int k = 0; k < read_len; k++) { - local_shard_keys[k] = local_keys[local_idx[k]]; - local_shard_grads[k] = local_grads[local_idx[k]]; + GM2LM(d_keys + local_idx[k], &local_shard_keys[k], 1 * sizeof(KeyType)); + GM2LM(d_grads + local_idx[k], &local_shard_grads[k], + 1 * sizeof(GradType)); + // local_shard_keys[k] = local_keys[local_idx[k]]; + // 
local_shard_grads[k] = local_grads[local_idx[k]]; } LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); LM2GM(local_shard_grads, d_shard_grads + i, read_len * sizeof(GradType)); @@ -227,9 +232,10 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, GM2LM(idx + i, local_idx, read_len * sizeof(T)); GM2LM(d_shard_vals + i, local_shard_vals, read_len * sizeof(ValType)); for (int k = 0; k < read_len; k++) { - local_vals[local_idx[k]] = local_shard_vals[k]; + LM2GM(&local_shard_vals[k], d_vals + local_idx[k], 1 * sizeof(ValType)); + // local_vals[local_idx[k]] = local_shard_vals[k]; } - LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType)); + // LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType)); } } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 581b0d511c23e..43b84ee5d26fb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -44,10 +44,25 @@ void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); } +void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) { + comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size, + stream_num); +} + int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } +void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { + comm_->set_sparse_sgd(optimizer_config); +} + +void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { + comm_->set_embedx_sgd(optimizer_config); +} + void HeterPs::end_pass() { comm_->end_pass(); } void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } @@ -64,6 +79,10 @@ void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); +} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 7060817be91eb..8449a4048b72f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -37,17 +37,18 @@ class HeterPs : public HeterPsBase { size_t len) override; void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) override; - + void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) override; #if defined(PADDLE_WITH_CUDA) void set_nccl_comm_and_size(const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) override; + void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; #endif -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; void set_embedx_sgd(const OptimizerConfig& optimizer_config) override; -#endif void end_pass() override; int get_index_by_devid(int devid) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 79061ab66af1c..2c312e9d4d60a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ 
b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -16,9 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#if defined(PADDLE_WITH_XPU_KP) #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" -#endif #ifdef PADDLE_WITH_HETERPS @@ -37,21 +35,23 @@ class HeterPsBase { size_t len) = 0; virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) = 0; + virtual void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, + size_t feature_value_size, size_t chunk_size, + int stream_num) = 0; virtual int get_index_by_devid(int devid) = 0; #if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; + virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; #endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) = 0; -#if defined(PADDLE_WITH_XPU_KP) - virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) {} - virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) {} -#endif + virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; + virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; static HeterPsBase* get_instance(size_t capacity, std::shared_ptr resource); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 17bc12a5af1a7..5717f44d400a5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -107,6 +107,8 @@ class HeterPsResource { int get_index_by_devid(int devid); int dev_id(int num); void set_multi_mf(int multi_mf_dim, int max_mf_dim); + int multi_mf() { return multi_mf_dim_; } + int max_mf_dim() { return max_mf_dim_; } ppStream local_stream(int dev_num, int stream_num); ppStream remote_stream(int dev_num, int stream_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index ebf7dd277c7d6..4684b4a0bc155 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -35,58 +35,64 @@ class Optimizer { void initialize() {} - __device__ void update_lr(float& w, float& g2sum, float g, // NOLINT + __device__ void update_lr(const OptimizerConfig& optimizer_config, + float& w, // NOLINT + float& g2sum, float g, // NOLINT float scale) { double add_g2sum = 0; - double ratio = optimizer_config::learning_rate * - sqrt(optimizer_config::initial_g2sum / - (optimizer_config::initial_g2sum + g2sum)); + double ratio = optimizer_config.learning_rate * + sqrt(optimizer_config.initial_g2sum / + (optimizer_config.initial_g2sum + g2sum)); double scaled_grad = g / scale; w += scaled_grad * ratio; - if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; - if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; + if (w < optimizer_config.min_bound) w = optimizer_config.min_bound; + if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; add_g2sum += scaled_grad * scaled_grad; g2sum += add_g2sum; } - __device__ void update_mf(int n, float* w, float& g2sum, // NOLINT + __device__ void 
update_mf(const OptimizerConfig& optimizer_config, int n, + float* w, + float& g2sum, // NOLINT const float* g, float scale) { double add_g2sum = 0; - double ratio = optimizer_config::mf_learning_rate * - sqrt(optimizer_config::mf_initial_g2sum / - (optimizer_config::mf_initial_g2sum + g2sum)); + double ratio = optimizer_config.mf_learning_rate * + sqrt(optimizer_config.mf_initial_g2sum / + (optimizer_config.mf_initial_g2sum + g2sum)); for (int i = 0; i < n; ++i) { double scaled_grad = g[i] / scale; w[i] += scaled_grad * ratio; - if (w[i] < optimizer_config::mf_min_bound) - w[i] = optimizer_config::mf_min_bound; - if (w[i] > optimizer_config::mf_max_bound) - w[i] = optimizer_config::mf_max_bound; + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; add_g2sum += scaled_grad * scaled_grad; } g2sum += add_g2sum / n; } - __device__ void update_value(ValType& val, const GradType& grad) { // NOLINT + __device__ void update_value(const OptimizerConfig& optimizer_config, + ValType& val, // NOLINT + const GradType& grad) { val.slot = grad.slot; val.show += grad.show; val.clk += grad.clk; - val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; + val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + + optimizer_config.clk_coeff * grad.clk; - update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); + update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show); if (val.mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (val.show - val.clk) + - optimizer_config::clk_coeff * val.clk) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * (val.show - val.clk) + + optimizer_config.clk_coeff * val.clk) { val.mf_size = MF_DIM + 1; val.mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; @@ -94,44 +100,46 @@ class Optimizer { curand_init(clock64(), tid_x, 0, &state); for (int i = 0; i < MF_DIM; ++i) { val.mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; + (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); + update_mf(optimizer_config, MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, + grad.show); } } - __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad) { + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + ValType* ptr, const GradType& grad) { ptr->slot = grad.slot; ptr->show += grad.show; ptr->clk += grad.clk; - ptr->delta_score += - optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; + ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + + optimizer_config.clk_coeff * grad.clk; - update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); // use MF_DIM temporarily // ptr->mf_dim = grad.mf_dim; if (ptr->mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config::clk_coeff * ptr->clk) { - // ptr->mf_size = ptr->mf_dim + 1; + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + + optimizer_config.clk_coeff * ptr->clk) { + ptr->mf_size = ptr->mf_dim + 1; - ptr->mf_size = MF_DIM + 1; + // 
ptr->mf_size = MF_DIM + 1; ptr->mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < MF_DIM; ++i) { + for (int i = 0; i < ptr->mf_dim; ++i) { ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; + (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + update_mf(optimizer_config, ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], + grad.mf_g, grad.show); // for local test } } diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 2a80aa4b52d91..03caeb984f7c9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -14,50 +14,69 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) +namespace paddle { +namespace framework { -namespace optimizer_config { +class OptimizerConfig { + public: + float nonclk_coeff = 0.1; + float clk_coeff = 1; -__constant__ float nonclk_coeff = 0.1; -__constant__ float clk_coeff = 1; + float min_bound = -10; + float max_bound = 10; + float learning_rate = 0.05; + float initial_g2sum = 3.0; + float initial_range = 0; -__constant__ float min_bound = -10; -__constant__ float max_bound = 10; -__constant__ float learning_rate = 0.05; -__constant__ float initial_g2sum = 3.0; -__constant__ float initial_range = 0; + float mf_create_thresholds = 10; + float mf_learning_rate = 0.05; + float mf_initial_g2sum = 3.0; + float mf_initial_range = 1e-4; + float mf_min_bound = -10; + float mf_max_bound = 10; -__constant__ float mf_create_thresholds = 10; -__constant__ float mf_learning_rate = 0.05; -__constant__ float mf_initial_g2sum = 3.0; -__constant__ float mf_initial_range = 1e-4; -__constant__ float mf_min_bound = -10; -__constant__ float mf_max_bound = 10; -} // namespace optimizer_config + void set_sparse_sgd(float nonclk_coeff, float clk_coeff, float min_bound, + float max_bound, float learning_rate, float initial_g2sum, + float initial_range) { + this->nonclk_coeff = nonclk_coeff; + this->clk_coeff = clk_coeff; + this->min_bound = min_bound; + this->max_bound = max_bound; + this->learning_rate = learning_rate; + this->initial_g2sum = initial_g2sum; + this->initial_range = initial_range; + } -#elif defined(PADDLE_WITH_XPU_KP) -namespace paddle { -namespace framework { + void set_sparse_sgd(const OptimizerConfig& optimizer_config) { + this->nonclk_coeff = optimizer_config.nonclk_coeff; + this->clk_coeff = optimizer_config.clk_coeff; + this->min_bound = optimizer_config.min_bound; + this->max_bound = optimizer_config.max_bound; + this->learning_rate = optimizer_config.learning_rate; + this->initial_g2sum = optimizer_config.initial_g2sum; + this->initial_range = optimizer_config.initial_range; + } -class OptimizerConfig { - public: - float nonclk_coeff; - float clk_coeff; - - float min_bound; - float max_bound; - float learning_rate; - float initial_g2sum; - float initial_range; - - float mf_create_thresholds; - float mf_learning_rate; - float mf_initial_g2sum; - float mf_initial_range; - float mf_min_bound; - float mf_max_bound; + void set_embedx_sgd(float mf_create_thresholds, float mf_learning_rate, + float mf_initial_g2sum, float mf_initial_range, + float mf_min_bound, float mf_max_bound) { + this->mf_create_thresholds = mf_create_thresholds; + this->mf_learning_rate = mf_learning_rate; + 
this->mf_initial_g2sum = mf_initial_g2sum; + this->mf_initial_range = mf_initial_range; + this->mf_min_bound = mf_min_bound; + this->mf_max_bound = mf_max_bound; + } + + void set_embedx_sgd(const OptimizerConfig& optimizer_config) { + this->mf_create_thresholds = optimizer_config.mf_create_thresholds; + this->mf_learning_rate = optimizer_config.mf_learning_rate; + this->mf_initial_g2sum = optimizer_config.mf_initial_g2sum; + this->mf_initial_range = optimizer_config.mf_initial_range; + this->mf_min_bound = optimizer_config.mf_min_bound; + this->mf_max_bound = optimizer_config.mf_max_bound; + } }; + } // namespace framework } // namespace paddle - -#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu index b3a38a6dfde49..ff3cd9d2d046d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" @@ -235,4 +236,9 @@ TEST(TEST_FLEET, test_cpu_cache) { } index++; } + auto iter = paddle::framework::GraphGpuWrapper::GetInstance(); + std::vector device; + device.push_back(0); + device.push_back(1); + iter->set_device(device); } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 64765c98fd04b..65892f8488475 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -28,12 +28,16 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" + #include #include -#include "paddle/fluid/framework/data_set.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" +#if defined(PADDLE_WITH_PSCORE) +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { @@ -107,29 +111,17 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - if (!multi_mf_dim_) { - gpu_task->init(thread_keys_shard_num_, device_num); - } else { - gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); - } - auto& local_keys = gpu_task->feature_keys_; - auto& local_ptr = gpu_task->value_ptr_; + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); std::vector threads; // data should be in input channel - if (!multi_mf_dim_) { - thread_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_keys_[i].resize(thread_keys_shard_num_); - } - } else { - thread_dim_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_dim_keys_[i].resize(thread_keys_shard_num_); - for (int j = 0; j < thread_keys_shard_num_; j++) { - thread_dim_keys_[i][j].resize(multi_mf_dim_); - } + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); } } @@ -141,28 +133,14 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { std::string data_set_name = std::string(typeid(*dataset_).name()); if (data_set_name.find("SlotRecordDataset") != std::string::npos) { - VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; SlotRecordDataset* dataset = dynamic_cast(dataset_); auto input_channel = dataset->GetInputChannel(); - VLOG(0) << "yxf::buildtask::inputslotchannle size: " - << input_channel->Size(); + VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size(); const std::deque& vec_data = input_channel->GetData(); total_len = vec_data.size(); len_per_thread = total_len / thread_keys_thread_num_; remain = total_len % thread_keys_thread_num_; VLOG(0) << "total len: " << total_len; - auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - for (const auto feasign : feasign_v) { - int shard_id = feasign % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(feasign); - } - } - }; auto gen_dynamic_mf_func = [this](const std::deque& total_data, int begin_index, int end_index, int i) { for (auto iter = total_data.begin() + begin_index; @@ -176,34 +154,18 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { int shard_id = feasign_v[j] % thread_keys_shard_num_; int dim_id = slot_index_vec_[slot_idx]; - this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); + if (feasign_v[j] != 0) { + this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); + } } } } - /* - for (auto iter = total_data.begin() + 
begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - for (const auto feasign : feasign_v) { - int shard_id = feasign % thread_keys_shard_num_; - this->thread_dim_keys_[i][shard_id][0].insert(feasign); - } - } - */ }; for (int i = 0; i < thread_keys_thread_num_; i++) { - if (!multi_mf_dim_) { - VLOG(0) << "yxf::psgpu wrapper genfunc"; - threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - } else { - VLOG(0) << "yxf::psgpu wrapper genfunc with dynamic mf"; - threads.push_back( - std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - } + threads.push_back( + std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); } for (std::thread& t : threads) { @@ -251,12 +213,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { threads.clear(); // merge thread_keys to shard_keys - auto merge_ins_func = [this, gpu_task](int shard_num) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); - thread_keys_[i][shard_num].clear(); - } - }; auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { for (int i = 0; i < thread_keys_thread_num_; ++i) { gpu_task->batch_add_keys(shard_num, dim_id, @@ -264,19 +220,9 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { thread_dim_keys_[i][shard_num][dim_id].clear(); } }; - // for (size_t i = 0; i < thread_keys_.size(); i++) { - // gpu_task->batch_add_keys(thread_keys_[i]); - // for (int j = 0; j < thread_keys_thread_num_; j++) { - // thread_keys_[i][j].clear(); - // } - //} for (int i = 0; i < thread_keys_shard_num_; ++i) { - if (!multi_mf_dim_) { - threads.push_back(std::thread(merge_ins_func, i)); - } else { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); - } + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); } } for (auto& t : threads) { @@ -291,20 +237,12 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { timeline.Pause(); VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; - - if (!multi_mf_dim_) { - for (int i = 0; i < thread_keys_shard_num_; i++) { - VLOG(0) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); - local_ptr[i].resize(local_keys[i].size()); - } - } else { - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] - << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); - gpu_task->value_dim_ptr_[i][j].resize( - gpu_task->feature_dim_keys_[i][j].size()); - } + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] + << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); } } } @@ -324,12 +262,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { auto& device_dim_keys = gpu_task->device_dim_keys_; auto& device_dim_ptr = gpu_task->device_dim_ptr_; auto& device_dim_mutex = gpu_task->dim_mutex_; - if (multi_mf_dim_) { 
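
The dy_mf_* fill kernels and the memory-pool build/dump paths in this patch address variable-length feature values purely by byte offset: each record occupies TYPEALIGN(8, sizeof(FeatureValue) + (mf_dim + 1) * sizeof(float)) bytes, and record i is located at byte i * value_size inside the packed buffer. A minimal host-side sketch of that layout, assuming a hypothetical ValueHeader in place of the real FeatureValue (helper names are illustrative, not from this patch):

#include <cstddef>
#include <vector>

// Hypothetical stand-in for FeatureValue's fixed fields; the real struct has more members.
struct ValueHeader { float show, clk, lr; int mf_size, mf_dim; };

// Same rounding as TYPEALIGN(8, ...): fixed header plus (mf_dim + 1) trailing floats.
inline size_t ValueBytes(int mf_dim) {
  size_t raw = sizeof(ValueHeader) + (mf_dim + 1) * sizeof(float);
  return (raw + 7) & ~static_cast<size_t>(7);
}

// Record i of a packed pool lives at byte offset i * value_size.
inline ValueHeader* ValueAt(char* pool, size_t i, size_t value_size) {
  return reinterpret_cast<ValueHeader*>(pool + i * value_size);
}

// The mf[0..mf_dim] tail follows the fixed header in the same record
// (mf[0] holds g2sum, mf[1..mf_dim] the embedding weights).
inline float* MfAt(ValueHeader* v) {
  return reinterpret_cast<float*>(reinterpret_cast<char*>(v) + sizeof(ValueHeader));
}

int main() {
  const int mf_dim = 8;
  const size_t value_size = ValueBytes(mf_dim);
  std::vector<char> pool(16 * value_size);        // room for 16 records
  ValueHeader* v = ValueAt(pool.data(), 3, value_size);
  v->mf_dim = mf_dim;
  MfAt(v)[0] = 0.f;
  return 0;
}
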
- for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { - device_dim_keys[dev].resize(multi_mf_dim_); - device_dim_ptr[dev].resize(multi_mf_dim_); - } + + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); } + // auto& device_mutex = gpu_task->mutex_; std::vector threads(thread_keys_shard_num_); @@ -353,18 +291,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif timeline.Start(); - auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { - size_t key_size = local_keys[i].size(); + + auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, + &fleet_ptr](int i, int j) { + size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; -#ifdef PADDLE_WITH_PSLIB - // auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - // reinterpret_cast(local_ptr[i].data()), this->table_id_, - // local_keys[i].data(), key_size); int32_t cnt = 0; +#ifdef PADDLE_WITH_PSLIB while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(local_ptr[i].data()), this->table_id_, - local_keys[i].data(), key_size); + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -392,11 +329,10 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } #endif #ifdef PADDLE_WITH_PSCORE - int32_t cnt = 0; while (true) { auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, - local_keys[i].data(), key_size); + reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, + local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -423,51 +359,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } #endif - if (status != 0) { - LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; - sleep(300); - exit(-1); - } else { - VLOG(3) << "FleetWrapper Pull sparse to local done with table size: " - << local_keys[i].size(); - } - }; - - auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, - &fleet_ptr](int i, int j) { -#ifdef PADDLE_WITH_PSLIB - size_t key_size = local_dim_keys[i][j].size(); - int32_t status = -1; - int32_t cnt = 0; - while (true) { - auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(local_dim_ptr[i][j].data()), - this->table_id_, local_dim_keys[i][j].data(), key_size); - bool flag = true; - - tt.wait(); - - try { - status = tt.get(); - } catch (const std::future_error& e) { - VLOG(0) << "Caught a future_error with code" << e.code() - << ", Message:" << e.what(); - } - if (status != 0) { - VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; - sleep(sleep_seconds_before_fail_exit_); - flag = false; - cnt++; - } - if (cnt > 3) { - VLOG(0) << "fleet pull sparse failed, retry 3 times"; - exit(-1); - } - - if (flag) { - break; - } - } if (status != 0) { LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; sleep(300); @@ -476,23 +367,19 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " << local_dim_keys[i][j].size(); } -#endif }; - if (!multi_mf_dim_) { - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(ptl_func, i); - } - } else { - threads.resize(thread_keys_shard_num_ * multi_mf_dim_); - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; 
j++) { - threads[i * multi_mf_dim_ + j] = std::thread(ptl_dynamic_mf_func, i, j); - } + + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + task_futures.emplace_back( + pull_thread_pool_[i]->enqueue(ptl_dynamic_mf_func, i, j)); } } - for (std::thread& t : threads) { - t.join(); + for (auto& f : task_futures) { + f.wait(); } + task_futures.clear(); timeline.Pause(); VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; @@ -509,45 +396,40 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { std::vector>> pass_values; bool record_status = false; -#ifdef PADDLE_WITH_PSLIB - uint16_t pass_id = 0; - if (multi_node_) { - record_status = fleet_ptr->pslib_ptr_->_worker_ptr->take_sparse_record( - table_id_, pass_id, pass_values); - } -#endif auto& device_task_keys = gpu_task->device_task_keys_; auto& device_task_ptrs = gpu_task->device_task_ptr_; - auto build_dynamic_mf_func = [this, device_num, &local_dim_keys, - &local_dim_ptr, &device_dim_keys, - &device_dim_ptr, - &device_dim_mutex](int i, int j) { -#ifdef PADDLE_WITH_PSLIB + auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, + &local_dim_ptr, &device_dim_keys, + &device_dim_ptr, + &device_dim_mutex](int i, int j) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs( + device_num); +#endif for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { int shard = local_dim_keys[i][j][k] % device_num; task_keys[shard].push_back(local_dim_keys[i][j][k]); task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); } + // allocate local keys to devices for (int dev = 0; dev < device_num; dev++) { - for (int dim = 0; dim < multi_mf_dim_; dim++) { - device_dim_mutex[dev][dim]->lock(); - - int len = task_keys[dev].size(); - int cur = device_dim_keys[dev][dim].size(); - device_dim_keys[dev][dim].resize(device_dim_keys[dev][dim].size() + - len); - device_dim_ptr[dev][dim].resize(device_dim_ptr[dev][dim].size() + len); - for (int k = 0; k < len; ++k) { - device_dim_keys[dev][dim][cur + k] = task_keys[dev][k]; - device_dim_ptr[dev][dim][cur + k] = task_ptrs[dev][k]; - } - device_dim_mutex[dev][dim]->unlock(); + device_dim_mutex[dev][j]->lock(); + int len = task_keys[dev].size(); + int cur = device_dim_keys[dev][j].size(); + device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); + device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); + for (int k = 0; k < len; ++k) { + device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; + device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; } + device_dim_mutex[dev][j]->unlock(); } -#endif }; auto build_func = [device_num, record_status, &pass_values, &local_keys, &local_ptr, &device_task_keys, &device_task_ptrs](int i) { @@ -697,7 +579,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { threads[i * multi_mf_dim_ + j] = - std::thread(build_dynamic_mf_func, i, j); + std::thread(build_pull_dynamic_mf_func, i, j); } } for (std::thread& t : threads) { @@ -727,22 +609,19 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { std::vector feature_keys_count(device_num); size_t size_max = 0; - if (!multi_mf_dim_) { - for (int i = 0; i < device_num; i++) { - feature_keys_count[i] = gpu_task->device_keys_[i].size(); - VLOG(0) << i 
<< " card contains feasign nums: " << feature_keys_count[i]; - size_max = std::max(size_max, feature_keys_count[i]); - } - } else { - for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); - } - VLOG(0) << i << " card with dynamic mf contains feasign nums: " - << feature_keys_count[i]; - size_max = std::max(size_max, feature_keys_count[i]); + + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); + VLOG(1) << i << " card with dynamic mf dim: " << index_dim_vec_[j] + << " dim index: " << j << " contains feasign nums: " + << gpu_task->device_dim_ptr_[i][j].size(); } + VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); } + if (HeterPs_) { delete HeterPs_; HeterPs_ = nullptr; @@ -756,18 +635,95 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); #endif - auto build_func = [this, &gpu_task, &feature_keys_count](int i) { - VLOG(3) << "building table: " << i; - this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(), - gpu_task->device_values_[i].data(), - feature_keys_count[i], 500000, 2); - // if (feature_keys_count[i] > 0) { - // HeterPs_->show_one_table(i); - // } + auto build_dynamic_mf_func = [this, &gpu_task](int i, int j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = + new MemoryPool(len, feature_value_size); + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + for (size_t k = 0; k < len; k++) { + FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); +#ifdef PADDLE_WITH_PSLIB + val->delta_score = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + mf_dim_index()] = float(mf_dim); + val->mf_dim = mf_dim; +#endif +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor accessor; + val->delta_score = + ptr_val[accessor.common_feature_value.DeltaScoreIndex()]; + val->show = ptr_val[accessor.common_feature_value.ShowIndex()]; + val->clk = 
ptr_val[accessor.common_feature_value.ClickIndex()]; + val->slot = int(ptr_val[accessor.common_feature_value.SlotIndex()]); + val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()]; + val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()]; + + val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim); + val->mf_dim = mf_dim; +#endif + if (dim > 8) { // CpuPS alreay expand as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; + } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; + } + } + } + + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); + auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + + this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, + feature_value_size, 500000, 2); + + if (device_dim_keys.size() > 0) { + VLOG(0) << "show ptr table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << mf_dim << " len: " << len; + this->HeterPs_->show_one_table(i); + } + delete mem_pool; }; - for (size_t i = 0; i < threads.size(); i++) { - threads[i] = std::thread(build_func, i); + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); + } } + for (std::thread& t : threads) { t.join(); } @@ -788,10 +744,12 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { if (is_shuffle) { dataset_->LocalShuffle(); } - + InitSlotInfo(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); gpu_task->Reset(); + data_ready_channel_->Put(gpu_task); + VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -873,18 +831,109 @@ void PSGPUWrapper::EndPass() { timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu + for (size_t i = 0; i < heter_devices_.size(); i++) { - keysize_max = std::max(keysize_max, current_task_->device_keys_[i].size()); + for (int j = 0; j < multi_mf_dim_; j++) { + keysize_max = + std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); + } + } + + auto dump_pool_to_cpu_func = [this](int i, int j) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); + auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + + char* test_build_values = (char*)malloc(feature_value_size * len); + cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, + cudaMemcpyDeviceToHost); + + CHECK(len == hbm_pool->capacity()); + uint64_t unuse_key = std::numeric_limits::max(); + for (size_t i = 0; i < len; ++i) { + if (device_keys[i] == unuse_key) { + continue; + } + size_t offset = i * feature_value_size; + FeatureValue* gpu_val = (FeatureValue*)(test_build_values + offset); +#ifdef PADDLE_WITH_PSLIB + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 
&& downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + delta_score_index()] = gpu_val->delta_score; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + show_index()] = gpu_val->show; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + click_index()] = gpu_val->clk; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_w_index()] = gpu_val->lr; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_g2sum_index()] = gpu_val->lr_g2sum; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + slot_index()] = gpu_val->slot; +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + + paddle::distributed::CtrDymfAccessor accessor; + cpu_val[accessor.common_feature_value.DeltaScoreIndex()] = + gpu_val->delta_score; + cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show; + cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk; + cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr; + cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] = + gpu_val->lr_g2sum; + cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot; +#endif + if (gpu_val->mf_size > 0) { + for (int x = 0; x < gpu_val->mf_dim + 1; x++) { + cpu_val[x + 8] = gpu_val->mf[x]; + } + } + } + free(test_build_values); + }; + if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } } if (keysize_max != 0) { HeterPs_->end_pass(); } + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } gpu_task_pool_.Push(current_task_); current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); - VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, @@ -898,28 +947,102 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + VLOG(3) << "Begine Gpu/Xpu Ps PullSparse"; + auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); + FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GpuPs now.")); + } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* 
total_keys = reinterpret_cast( + total_keys_tensor.mutable_data({total_length, 1}, place)); + + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + static_cast(total_length)); + pull_gpups_timer.Pause(); + + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length); + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GpuPs: PullSparse Only Support CUDAPlace Now.")); + } + all_timer.Pause(); + VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() + << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PullSparse"; +} + +void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, + const int table_id, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const std::vector& slot_dim, + const int hidden_size) { + VLOG(3) << "Begine Gpu Ps PullSparse"; + platform::Timer all_timer; + platform::Timer pull_gpups_timer; + all_timer.Start(); + size_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + size_t feature_value_size = 0; + + feature_value_size = TYPEALIGN( + 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); + #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begine Gpu Ps PullSparse"; - auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); + auto buf = memory::Alloc(place, total_length * feature_value_size); FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); #endif #ifdef PADDLE_WITH_XPU_KP VLOG(3) << "Begine Xpu Ps PullSparse"; FeatureValue* total_values_gpu = nullptr; xpu_malloc(reinterpret_cast(&total_values_gpu), - total_length * sizeof(FeatureValue)); + total_length * feature_value_size); #endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GpuPs now.")); } else if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; - uint64_t* total_keys = reinterpret_cast( - total_keys_tensor.mutable_data({total_length, 1}, place)); + uint64_t* total_keys = + reinterpret_cast(total_keys_tensor.mutable_data( + {int64_t(total_length), 1}, place)); // construct slot_level lod info auto slot_lengths_lod = slot_lengths; @@ -936,23 +1059,30 @@ 
void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, cudaMemcpy(gpu_len, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int)); + int* gpu_dim = reinterpret_cast(buf_dim->ptr()); + cudaMemcpy(gpu_dim, slot_dim.data(), slot_dim.size() * sizeof(int), + cudaMemcpyHostToDevice); + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, static_cast(slot_lengths.size()), static_cast(total_length)); VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; + pull_gpups_timer.Start(); HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, - static_cast(total_length)); - // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - // "PullSparseGPU failed in GPUPS.")); - pull_gpups_timer.Pause(); + total_length); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, static_cast(slot_lengths.size()), hidden_size, - total_length); + total_length, gpu_dim); + + pull_gpups_timer.Pause(); + #endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU_KP @@ -969,19 +1099,11 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, slot_lengths_lod[i] += slot_lengths_lod[i - 1]; } - uint64_t* buf_key = nullptr; - int64_t* buf_length = nullptr; - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_key), - keys.size() * sizeof(uint64_t*)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_length), - slot_lengths.size() * sizeof(int64_t)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); - - uint64_t** xpu_keys = reinterpret_cast(&buf_key); - int64_t* xpu_len = reinterpret_cast(buf_length); + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** xpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* xpu_len = reinterpret_cast(buf_length->ptr()); PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), XPU_HOST_TO_DEVICE)); @@ -997,8 +1119,6 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, pull_gpups_timer.Start(); HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, static_cast(total_length)); - // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - // "PullSparseGPU failed in GPUPS.")); pull_gpups_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length @@ -1029,30 +1149,32 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); -#ifdef PADDLE_WITH_CUDA + // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; - auto buf = memory::Alloc(place, total_length * sizeof(FeaturePushValue)); + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto buf = memory::Alloc(place, total_length * grad_value_size); + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; FeaturePushValue* total_grad_values_gpu = reinterpret_cast(buf->ptr()); -#endif -#ifdef PADDLE_WITH_XPU_KP - VLOG(3) << "Begine Xpu Ps PushSparseGrad"; - FeaturePushValue* total_grad_values_gpu = nullptr; - 
xpu_malloc(reinterpret_cast(&total_grad_values_gpu), - total_length * sizeof(FeaturePushValue)); -#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - hidden_size, total_length, batch_size); + if (!multi_mf_dim_) { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + hidden_size, total_length, batch_size); + } else { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + total_length, batch_size, grad_value_size); + } VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; @@ -1060,7 +1182,9 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); +#endif } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU_KP int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; @@ -1076,11 +1200,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); +#endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); } all_timer.Pause(); + time_3 += all_timer.ElapsedSec(); + time_4 += push_gpups_timer.ElapsedSec(); VLOG(3) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() << " s, of which GPUPS cost: " << push_gpups_timer.ElapsedSec() << " s"; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index cf7d98db27e84..488a9ef8ce78f 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -61,6 +61,45 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } +__global__ void PullCopy(float** dest, const FeatureValue* src, + const int64_t* len, int slot_num, int total_len, + uint64_t** keys, uint64_t max_val_size, int* gpu_dim) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? 
len[x - 1] : 0); + FeatureValue* feature_value_ptr = + (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + if (*(keys[x] + y) == 0) { + *(dest[x] + y * (mf_dim + 3)) = 0; + *(dest[x] + y * (mf_dim + 3) + 1) = 0; + *(dest[x] + y * (mf_dim + 3) + 2) = 0; + } else { + *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show; + *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk; + *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr; + } + if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j]; + } + } + } +} + __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, int total_len) { @@ -105,6 +144,35 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } +__global__ void PushCopyWithPool(FeaturePushValue* dest, float** src, + int64_t* len, int slot_num, uint64_t total_len, + int bs, int* slot_vector, int* mf_dim_vector, + size_t grad_value_size) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + FeaturePushValue* cur = + (FeaturePushValue*)((char*)dest + i * grad_value_size); + cur->slot = slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur->mf_dim = mf_dim; + cur->show = *(src[x] + y * (mf_dim + 3)); + cur->clk = *(src[x] + y * (mf_dim + 3) + 1); + cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < cur->mf_dim; j++) { + cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, @@ -128,6 +196,26 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length, int* gpu_dim) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, + val_type_size_, gpu_dim); + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -177,39 +265,64 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, size_t grad_value_size) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), + total_length, batch_size, d_slot_vector, d_mf_dim_vector, + grad_value_size); + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, float initial_range) { - cudaMemcpyToSymbol(optimizer_config::nonclk_coeff, &nonclk_coeff, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::clk_coeff, &clk_coeff, sizeof(float)); - 
cudaMemcpyToSymbol(optimizer_config::min_bound, &min_bound, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::max_bound, &max_bound, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::learning_rate, &learning_rate, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::initial_g2sum, &initial_g2sum, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::initial_range, &initial_range, - sizeof(float)); + OptimizerConfig optimizer_config; + optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, + learning_rate, initial_g2sum, initial_range); + HeterPs_->set_sparse_sgd(optimizer_config); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_learning_rate, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, float mf_max_bound) { - cudaMemcpyToSymbol(optimizer_config::mf_create_thresholds, - &mf_create_thresholds, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_learning_rate, &mf_learning_rate, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_initial_g2sum, &mf_initial_g2sum, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_initial_range, &mf_initial_range, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_min_bound, &mf_min_bound, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_max_bound, &mf_max_bound, - sizeof(float)); + OptimizerConfig optimizer_config; + optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, + mf_initial_g2sum, mf_initial_range, + mf_min_bound, mf_max_bound); + HeterPs_->set_embedx_sgd(optimizer_config); } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index c38b819822c28..0efec57e59db6 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -27,6 +27,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_GLOO #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif #include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" @@ -54,6 +55,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_PSLIB #include "afs_api.h" #endif +#ifdef PADDLE_WITH_PSLIB +#include "downpour_accessor.h" // NOLINT +#endif namespace paddle { namespace framework { @@ -95,12 +99,21 @@ class PSGPUWrapper { PSGPUWrapper() { HeterPs_ = NULL; sleep_seconds_before_fail_exit_ = 300; + pull_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < pull_thread_pool_.size(); i++) { + pull_thread_pool_[i].reset(new ::ThreadPool(1)); + } hbm_thread_pool_.resize(thread_keys_shard_num_); for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { hbm_thread_pool_[i].reset(new ::ThreadPool(1)); } } + void PullSparse(const paddle::platform::Place& place, const int table_id, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const std::vector& slot_dim, const int hidden_size); void PullSparse(const paddle::platform::Place& place, const int table_id, const std::vector& keys, const std::vector& values, @@ -119,13 +132,23 @@ class PSGPUWrapper { const FeatureValue* total_values_gpu, const int64_t* gpu_len, const int slot_num, const int hidden_size, const int64_t total_length); - + void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length, int* gpu_dim); void CopyForPush(const paddle::platform::Place& place, const std::vector& grad_values, FeaturePushValue* total_grad_values_gpu, const std::vector& slot_lengths, const int hidden_size, const int64_t total_length, const int batch_size); + void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -310,13 +333,40 @@ class PSGPUWrapper { void SetSlotOffsetVector(const std::vector& slot_offset_vector) { slot_offset_vector_ = slot_offset_vector; + std::cout << "yxf set: "; + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; } #ifdef PADDLE_WITH_CUDA void SetSlotDimVector(const std::vector& slot_mf_dim_vector) { slot_mf_dim_vector_ = slot_mf_dim_vector; assert(slot_mf_dim_vector_.size() == slot_vector_.size()); - for (size_t i = 0; i < slot_mf_dim_vector.size(); i++) { + } + + void InitSlotInfo() { + if (slot_info_initialized_) { + return; + } + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto slots_vec = dataset->GetSlots(); + slot_offset_vector_.clear(); + for (auto& slot : slot_vector_) { + for (size_t i = 0; i < slots_vec.size(); ++i) { + if (std::to_string(slot) == slots_vec[i]) { + slot_offset_vector_.push_back(i); + break; + } + } + } + std::cout << "psgpu wrapper use slots: "; + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; + for (size_t i = 0; i < slot_mf_dim_vector_.size(); i++) { slot_dim_map_[slot_vector_[i]] = slot_mf_dim_vector_[i]; } @@ -345,6 +395,7 @@ class PSGPUWrapper { TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); grad_type_size_ = TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + slot_info_initialized_ = true; } #endif @@ -385,6 +436,12 @@ class PSGPUWrapper { int max_mf_dim_{0}; size_t val_type_size_{0}; size_t grad_type_size_{0}; + + double time_1 = 0.0; + 
double time_2 = 0.0; + double time_3 = 0.0; + double time_4 = 0.0; + int multi_node_{0}; int node_size_; uint64_t table_id_; @@ -405,6 +462,7 @@ class PSGPUWrapper { int year_; int month_; int day_; + bool slot_info_initialized_ = false; int use_afs_api_ = 0; #ifdef PADDLE_WITH_CUDA @@ -428,6 +486,7 @@ class PSGPUWrapper { std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; bool running_ = false; + std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; protected: diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index 571a090b9b4a6..ef6c70e624d4c 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -28,9 +28,9 @@ limitations under the License. */ namespace paddle { namespace framework { -__global__ void PullCopy(float** dest, const FeatureValue* src, +__global__ void PullCopy(float* dest, const FeatureValue* src, const long long* len, int hidden, int slot_num, - int total_len, unsigned long long** keys) { + int total_len, unsigned long long* keys) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -41,11 +41,21 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, __local__ int64_t local_len[slot_num]; GM2LM(len, local_len, slot_num * sizeof(int64_t)); + __global_ptr__ unsigned long long* local_keys[slot_num]; + GM2LM(keys, local_keys, + slot_num * sizeof(__global_ptr__ unsigned long long*)); + + __global_ptr__ float* local_dest[slot_num]; + GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*)); + + int read_len = 30; + for (int i = thread_id; i < slot_num; i += nthreads) { // max core local memory = 8KB // slot's max memory size = slot_len * sizeof(FeatureValue) int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; - int read_len = min(roundup_div(1024 * 8, sizeof(FeatureValue)), slot_len); + // int read_len = min(roundup_div(1024 * 8, sizeof(FeatureValue)), + // slot_len); int dest_len = i ? 
local_len[i - 1] : 0; __local__ FeatureValue local_slot_vals[read_len]; __local__ float local_dest_vals[read_len * hidden]; @@ -56,7 +66,8 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, int real_read_len = min(read_len, slot_len - k); GM2LM(src + dest_len + k, local_slot_vals, real_read_len * sizeof(FeatureValue)); - GM2LM(keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + GM2LM(local_keys[i] + k, local_slot_keys, + real_read_len * sizeof(uint64_t)); for (int j = 0; j < real_read_len; j++) { if (local_slot_keys[j] == 0) { local_dest_vals[j * hidden] = 0; @@ -78,13 +89,13 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } } - LM2GM(local_dest_vals, dest[i] + k * hidden, + LM2GM(local_dest_vals, local_dest[i] + k * hidden, real_read_len * hidden * sizeof(float)); } } } -__global__ void CopyKeysKernel(unsigned long long** src_keys, +__global__ void CopyKeysKernel(unsigned long long* src_keys, unsigned long long* dest_total_keys, const long long* len, int slot_num, int total_len) { @@ -95,26 +106,32 @@ __global__ void CopyKeysKernel(unsigned long long** src_keys, } int thread_id = ncores * cluster_id() + cid; int nthreads = ncores * cluster_num(); - __local__ int64_t local_len[slot_num]; - GM2LM(len, local_len, slot_num * sizeof(int64_t)); + __local__ long long local_len[slot_num]; + GM2LM(len, local_len, slot_num * sizeof(long long)); + + __global_ptr__ unsigned long long* local_keys[slot_num]; + GM2LM(src_keys, local_keys, + slot_num * sizeof(__global_ptr__ unsigned long long*)); for (int i = thread_id; i < slot_num; i += nthreads) { // max core local memory = 8KB int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; - int read_len = min(slot_len, 1024); + // int read_len = min(slot_len, 1024); + int read_len = 100; int dest_len = i ? local_len[i - 1] : 0; - __local__ uint64_t local_slot_keys[read_len]; + __local__ unsigned long long local_slot_keys[read_len]; for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(src_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + GM2LM(local_keys[i] + k, local_slot_keys, + real_read_len * sizeof(unsigned long long)); LM2GM(local_slot_keys, dest_total_keys + dest_len + k, - real_read_len * sizeof(uint64_t)); + real_read_len * sizeof(unsigned long long)); } } } -__global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, +__global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, int hidden, int slot_num, int total_len, int bs, int* slot_vector) { int cid = core_id(); @@ -129,12 +146,16 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, GM2LM(len, local_len, slot_num * sizeof(int64_t)); GM2LM(slot_vector, local_slot, slot_num * sizeof(int)); + __global_ptr__ float* local_src[slot_num]; + GM2LM(src, local_src, slot_num * sizeof(__global_ptr__ float*)); + for (int i = thread_id; i < slot_num; i += nthreads) { int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; // max core local memory = 8KB // slot's max memory size = slot_len * hidden * 8 - int read_len = min(roundup_div(1024, hidden), slot_len); + // int read_len = min(roundup_div(1024, hidden), slot_len); + int read_len = 40; int dest_len = i ? 
local_len[i - 1] : 0; __local__ float local_slot_grads[read_len * hidden]; __local__ FeaturePushValue local_dest_grads[read_len]; @@ -142,7 +163,7 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, // copy read_len(length) of slots' grad to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(src[i] + k * hidden, local_slot_grads, + GM2LM(local_src[i] + k * hidden, local_slot_grads, real_read_len * hidden * sizeof(float)); // copy from slots' grad to total grad for (int j = 0; j < real_read_len; j++) { @@ -175,14 +196,18 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, stream = static_cast(dev_ctx) ->x_context() ->xpu_stream; - float* buf_value = nullptr; - xpu_malloc(reinterpret_cast(&buf_value), + // float* buf_value = nullptr; + // xpu_malloc(reinterpret_cast(&buf_value), + // values.size() * sizeof(float*)); + // float** gpu_values = reinterpret_cast(&buf_value); + float* gpu_values = nullptr; + xpu_malloc(reinterpret_cast(&gpu_values), values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(&buf_value); xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - unsigned long long** c_keys = (unsigned long long**)gpu_keys; + // unsigned long long** c_keys = (unsigned long long**)gpu_keys; + unsigned long long* c_keys = reinterpret_cast(gpu_keys); const long long* c_len = (const long long*)gpu_len; PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, slot_num, total_length, c_keys); @@ -199,7 +224,8 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, stream = static_cast(dev_ctx) ->x_context() ->xpu_stream; - unsigned long long** o_keys = (unsigned long long**)origin_keys; + unsigned long long* o_keys = + reinterpret_cast(origin_keys); unsigned long long* t_keys = (unsigned long long*)total_keys; const long long* c_len = (const long long*)gpu_len; CopyKeysKernel<<<2, 64, stream>>>(o_keys, t_keys, c_len, slot_num, total_len); @@ -223,20 +249,17 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, slot_lengths_lod[i] += slot_lengths_lod[i - 1]; } - float* buf_grad_value = nullptr; - int64_t* buf_length = nullptr; - int* buf_slot_vector = nullptr; + float* gpu_values = nullptr; + int64_t* gpu_len = nullptr; + int* d_slot_vector = nullptr; - xpu_malloc(reinterpret_cast(&buf_grad_value), + xpu_malloc(reinterpret_cast(&gpu_values), grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&buf_length), + xpu_malloc(reinterpret_cast(&gpu_len), slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&buf_slot_vector), + xpu_malloc(reinterpret_cast(&d_slot_vector), slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(&buf_grad_value); - int64_t* gpu_len = reinterpret_cast(buf_length); - int* d_slot_vector = reinterpret_cast(buf_slot_vector); xpu_memcpy(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); xpu_memcpy(gpu_len, slot_lengths_lod.data(), @@ -256,13 +279,8 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float learning_rate, float initial_g2sum, float initial_range) { OptimizerConfig optimizer_config; - optimizer_config.nonclk_coeff = nonclk_coeff; - optimizer_config.clk_coeff = clk_coeff; - optimizer_config.min_bound = min_bound; - optimizer_config.max_bound = max_bound; - optimizer_config.learning_rate = learning_rate; - optimizer_config.initial_g2sum = 
initial_g2sum; - optimizer_config.initial_range = initial_range; + optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, + learning_rate, initial_g2sum, initial_range); HeterPs_->set_sparse_sgd(optimizer_config); } @@ -271,12 +289,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_initial_range, float mf_min_bound, float mf_max_bound) { OptimizerConfig optimizer_config; - optimizer_config.mf_create_thresholds = mf_create_thresholds; - optimizer_config.mf_learning_rate = mf_learning_rate; - optimizer_config.mf_initial_g2sum = mf_initial_g2sum; - optimizer_config.mf_initial_range = mf_initial_range; - optimizer_config.mf_min_bound = mf_min_bound; - optimizer_config.mf_max_bound = mf_max_bound; + optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, + mf_initial_g2sum, mf_initial_range, + mf_min_bound, mf_max_bound); HeterPs_->set_embedx_sgd(optimizer_config); } diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 2bd8ed900f102..b621eca35b893 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { -const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { +const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static int64_t num_cuda_devices = -1; @@ -58,8 +58,6 @@ const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { const std::shared_ptr& DefaultCPUGenerator() { static auto default_cpu_generator = std::make_shared(GetRandomSeed()); - VLOG(4) << "initial seed: " << default_cpu_generator->GetCurrentSeed() - << ", cpu engine: " << default_cpu_generator->GetCPUEngine().get(); return default_cpu_generator; } @@ -100,19 +98,13 @@ const std::shared_ptr& GetRandomSeedGenerator( return iter->second; } -std::shared_ptr OpDefaultCPUEngine() { - static auto op_default_cpu_engine = std::make_shared(); - return op_default_cpu_engine; -} - -// NOTE(zhiqiu): there are 3 conditions: -// (1) op seed is not set and DefaultCPUGenerator is inited, use -// DefaultCPUGenerator -// (2) op seed is not set and DefaultCPUGenerator is not inited, use se -// OpDefaultCPUEngine() and set a radnom seed -// (3) op seed is set, use OpDefaultCPUEngine() and set the seed +// There are 3 conditions: +// (1) op seed is set, use op seed. +// (2) op seed is not set, global seed is set, use global seed. +// (3) op seed is not set, global seed is not set too, use random seed from +// RandomGenerator. std::shared_ptr GetCPURandomEngine(uint64_t seed) { - if (DefaultCPUGenerator()->GetIsInitPy() && seed == 0) { + if (seed == 0) { VLOG(4) << "Use random engine from generator"; return DefaultCPUGenerator()->GetCPUEngine(); } else { @@ -123,12 +115,6 @@ std::shared_ptr GetCPURandomEngine(uint64_t seed) { // // And we need to measure the determinacy of Generator in PE. 
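A minimal standalone sketch of the three seed conditions described in the comment above (a hypothetical helper using std types only, not Paddle's actual Generator API):

#include <memory>
#include <random>

// Hedged illustration of the seed-selection rules, not the real implementation.
std::shared_ptr<std::mt19937_64> SelectEngine(
    uint64_t op_seed, bool global_seed_set,
    const std::shared_ptr<std::mt19937_64>& global_engine) {
  if (op_seed != 0) {
    // (1) op seed is set: build a fresh engine seeded with it.
    return std::make_shared<std::mt19937_64>(op_seed);
  }
  if (global_seed_set) {
    // (2) global seed is set: reuse the globally seeded engine.
    return global_engine;
  }
  // (3) neither seed is set: fall back to a random seed.
  std::random_device rd;
  return std::make_shared<std::mt19937_64>(rd());
}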
auto engine = std::make_shared(); - if (seed == 0) { - seed = GetRandomSeed(); - VLOG(4) << "Use default random engine with random seed = " << seed; - } else { - VLOG(4) << "Use default random engine with fixed random seed = " << seed; - } static std::mutex mu_; { std::lock_guard lock(mu_); @@ -204,11 +190,5 @@ std::pair Generator::IncrementOffset( #endif } -void Generator::SetIsInitPy(bool is_init_py) { - this->is_init_py_ = is_init_py; - VLOG(4) << "SetIsInitPy:" << this->is_init_py_; -} -bool Generator::GetIsInitPy() const { return this->is_init_py_; } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 1c19234bf7d80..35efc1bee33d5 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -59,7 +59,6 @@ struct Generator : public phi::Generator { this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; - this->is_init_py_ = true; // TODO(zhiqiu): remove it in future } Generator(uint64_t seed, uint64_t device_id) { std::seed_seq seq({seed}); @@ -71,7 +70,6 @@ struct Generator : public phi::Generator { this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; - this->is_init_py_ = false; // TODO(zhiqiu): remove it in future } Generator(const Generator& other) = delete; @@ -95,32 +93,21 @@ struct Generator : public phi::Generator { std::pair IncrementOffset(uint64_t increament_offset); - void SetIsInitPy(bool); - bool GetIsInitPy() const; uint64_t get_device_id() { return this->state_.device; } private: phi::Generator::GeneratorState state_; std::shared_ptr engine_; mutable std::mutex mu_; - - // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with - // old seed, and it should be removed after all random-related operators - // and unittests upgrades to use generator. - bool is_init_py_ = false; }; // The DefaultCPUGenerator is used in manual_seed() const std::shared_ptr& DefaultCPUGenerator(); -// If op seed is set or global is not set, the OpDefaultCPUEngine is used. 
-std::shared_ptr OpDefaultCPUEngine(); +const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id = -1); std::shared_ptr GetCPURandomEngine(uint64_t); -const std::shared_ptr& GetDefaultCUDAGenerator( - int64_t device_id = -1); - const std::shared_ptr& SetRandomSeedGenerator( const std::string& name, uint64_t seed); diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc index 13eb78874c395..d0d3c2fea3b56 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer.cc @@ -282,7 +282,7 @@ void HeterPipelineTrainer::Run() { if (threads_.size() > 0) { threads_.clear(); } - VLOG(3) << "Epoch Trainging done"; + VLOG(3) << "Epoch Training done"; } void HeterPipelineTrainer::Finalize() { diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index cb33e87f490c2..a7138fd2642a8 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -219,6 +219,10 @@ void HogwildWorker::TrainFiles() { device_reader_->Start(); int cur_batch; int batch_cnt = 0; + +#if defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_CUDA) + platform::SetDeviceId(thread_id_); +#endif while ((cur_batch = device_reader_->Next()) > 0) { for (auto &op : ops_) { bool need_skip = false; @@ -244,9 +248,12 @@ void HogwildWorker::TrainFiles() { ++batch_cnt; PrintFetchVars(); thread_scope_->DropKids(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } timeline.Pause(); - VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + VLOG(1) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() << " seconds, ins_num: " << total_ins_num; if (need_dump_field_ || need_dump_param_) { diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc index f01894f2cf448..361153de7d73a 100644 --- a/paddle/fluid/framework/io/shell.cc +++ b/paddle/fluid/framework/io/shell.cc @@ -69,7 +69,7 @@ static int close_open_fds_internal() { for (;;) { int bytes = 0; - if ((bytes = syscall(SYS_getdents, dir_fd, + if ((bytes = syscall(SYS_getdents64, dir_fd, reinterpret_cast(buffer), sizeof(buffer))) < 0) { PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a3b49476d820f..8166c43e65db1 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -89,12 +89,14 @@ pass_library(delete_quant_dequant_filter_op_pass inference) pass_library(delete_weight_dequant_linear_op_pass inference) pass_library(delete_quant_dequant_linear_op_pass inference) pass_library(delete_dropout_op_pass inference) +pass_library(delete_fill_constant_op_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) pass_library(multihead_matmul_fuse_pass inference) pass_library(adaptive_pool2d_convert_global_pass inference) pass_library(unsqueeze2_eltwise_fuse_pass inference) +pass_library(yolo_box_fuse_pass inference) pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) @@ -107,6 +109,9 @@ if(WITH_TENSORRT) pass_library(trt_map_matmul_to_mul_pass inference) pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) pass_library(preln_skip_layernorm_fuse_pass inference) + 
pass_library(set_transformer_input_convert_pass inference) + pass_library(remove_padding_recover_padding_pass inference) + pass_library(delete_remove_padding_recover_padding_pass inference) endif() if(WITH_GPU OR WITH_ROCM) @@ -118,10 +123,12 @@ if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn) pass_library(mkldnn_inplace_pass inference DEPS mkldnn_placement_pass op_registry elementwise_add_op gelu_op activation_op softmax_op softmax DIR mkldnn) pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn) + pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn) pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(scale_matmul_fuse_pass inference DIR mkldnn) pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) @@ -208,6 +215,7 @@ if (WITH_MKLDNN) cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass) cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) + cc_test(test_int8_scale_calculation_mkldnn_pass SRCS mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc DEPS int8_scale_calculation_mkldnn_pass pass_test_util) cc_test(test_fc_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util) cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc new file mode 100644 index 0000000000000..e86bb2926b640 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/delete_fill_constant_op_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +void FillConstData(LoDTensor* out_t, T value) { + auto output_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < out_t->numel(); i++) { + output_data[i] = value; + } +} + +void DeleteFillConstantOpPass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("delete_fill_constant_op_pass", graph); + GraphPatternDetector detector; + auto fill_constant_op = detector.mutable_pattern() + ->NewNode("fill_constant") + ->assert_is_op("fill_constant") + ->assert_is_not_op_input("ValueTensor") + ->assert_is_not_op_input("str_value") + ->assert_is_not_op_input("ShapeTensor") + ->assert_is_not_op_input("ShapeTensorList"); + auto fill_constant_out = + detector.mutable_pattern() + ->NewNode("fill_constant_out") + ->assert_is_op_output("fill_constant") + ->assert_more([](Node* x) { return x->outputs.size() == 1UL; }); + auto next_op = detector.mutable_pattern() + ->NewNode("next_op") + ->assert_is_not_op_type("conditional_block") + ->assert_is_not_op_type("while"); + // Create the topological connections for the above pattern nodes. + fill_constant_op->LinksTo({fill_constant_out}); + next_op->LinksFrom({fill_constant_out}); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + Node* fill_constant_op_node = subgraph.at(fill_constant_op); + Node* fill_constant_out_node = subgraph.at(fill_constant_out); + // Get fill_constant's attr + auto fill_constant = fill_constant_op_node->Op(); + auto value = BOOST_GET_CONST(float, fill_constant->GetAttr("value")); + auto shape = + BOOST_GET_CONST(std::vector, fill_constant->GetAttr("shape")); + auto* scope = param_scope(); + auto fill_constant_out_desc = fill_constant_out_node->Var(); + fill_constant_out_desc->SetShape(shape); + fill_constant_out_desc->SetPersistable(true); + auto* fill_constant_out_tensor = + scope->Var(fill_constant_out_desc->Name())->GetMutable(); + auto dtype = + framework::TransToPhiDataType(fill_constant_out_desc->GetDataType()); + fill_constant_out_tensor->Resize(phi::make_ddim(shape)); + switch (dtype) { + case paddle::experimental::DataType::BOOL: + FillConstData(fill_constant_out_tensor, static_cast(value)); + break; + case paddle::experimental::DataType::INT32: + FillConstData(fill_constant_out_tensor, + static_cast(value)); + break; + case paddle::experimental::DataType::INT64: + FillConstData(fill_constant_out_tensor, + static_cast(value)); + break; + case paddle::experimental::DataType::FLOAT32: + FillConstData(fill_constant_out_tensor, + static_cast(value)); + break; + default: + LOG(WARNING) << "Unsupported dtype for fill_constant op: " << dtype; + return; + } + // Remove links in graph + GraphSafeRemoveNodes(graph, {fill_constant_op_node}); + }; + + detector(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_fill_constant_op_pass, + paddle::framework::ir::DeleteFillConstantOpPass); diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.h b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.h new file mode 100644 index 0000000000000..33d10f4502f2a --- /dev/null +++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class DeleteFillConstantOpPass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + virtual ~DeleteFillConstantOpPass() = default; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc new file mode 100644 index 0000000000000..63233e0b584b2 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +void RecoverPadding::operator()() { + // Create nodes for recover_padding. + auto *recover_padding_input = + pattern->NewNode(recover_padding_input_repr()) + ->assert_is_op_input("recover_padding", "Input"); + auto *recover_padding_op = pattern->NewNode(recover_padding_op_repr()) + ->assert_is_op("recover_padding"); + auto *recover_padding_out = + pattern->NewNode(recover_padding_out_repr()) + ->assert_is_op_output("recover_padding", "Out"); + + // Add links for recover_padding op. 
+ recover_padding_op->LinksFrom({recover_padding_input}) + .LinksTo({recover_padding_out}); +} +} // namespace patterns + +void DeleteRemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init(name_scope_, graph); + int found_subgraph_count = 0; + + // + GraphPatternDetector gpd; + patterns::RecoverPadding recover_padding( + gpd.mutable_pattern(), "delete_remove_padding_recover_padding_pass"); + recover_padding(); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + VLOG(3) << "delete_remove_padding_recover_padding_pass"; + + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_input, recover_padding_input, + recover_padding); + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_op, recover_padding_op, + recover_padding); + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_out, recover_padding_out, + recover_padding); + + std::unordered_set del_node_set; + + bool delete_recover_padding = true; + for (size_t i = 0; i < recover_padding_out->outputs.size(); ++i) { + if (recover_padding_out->outputs[i]->Name() == + "remove_padding") { // op_node + auto *remove_padding_out_node = + recover_padding_out->outputs[i]->outputs[0]; // var_node + auto *out_op_node = remove_padding_out_node->outputs[0]; // op_node + IR_NODE_LINK_TO(recover_padding_input, out_op_node); + del_node_set.insert(recover_padding_out->outputs[i]); + del_node_set.insert(remove_padding_out_node); + out_op_node->Op()->RenameInput(remove_padding_out_node->Name(), + recover_padding_input->Name()); + found_subgraph_count++; + } else { + delete_recover_padding = false; + } + } + if (delete_recover_padding) { + del_node_set.insert(recover_padding_op); + del_node_set.insert(recover_padding_out); + } + GraphSafeRemoveNodes(graph, del_node_set); + }; + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_remove_padding_recover_padding_pass, + paddle::framework::ir::DeleteRemovePaddingRecoverPaddingPass); diff --git a/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h new file mode 100644 index 0000000000000..3504b124c91d1 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +struct RecoverPadding : public PatternBase { + RecoverPadding(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "recover_padding") {} + + void operator()(); + + PATTERN_DECL_NODE(recover_padding_input); + PATTERN_DECL_NODE(recover_padding_op); + PATTERN_DECL_NODE(recover_padding_out); +}; +} // namespace patterns + +class DeleteRemovePaddingRecoverPaddingPass : public FusePassBase { + public: + DeleteRemovePaddingRecoverPaddingPass() {} + virtual ~DeleteRemovePaddingRecoverPaddingPass() {} + + protected: + void ApplyImpl(Graph *graph) const; + const std::string name_scope_{"delete_remove_padding_recover_padding_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index 48df5869a7a1f..40e1de8a523aa 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -172,7 +172,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { VLOG(6) << "The number of new gradients is " << new_grad_idx.size(); if (new_grad_idx.size() == 1) return; // NOTE(zcd): If the gradients of backward stage and optimization stage - // have diff, Only take care of the the gradient of optimization stage. + // have diff, only take care of the gradient of optimization stage. GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_map); } } diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc index 921cf0904f632..2b7a3e1899c76 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.cc +++ b/paddle/fluid/framework/ir/fusion_group/operation.cc @@ -127,7 +127,7 @@ void OperationMap::InsertUnaryElementwiseOperations() { // scale // out = (bias_after_scale) ? scale * X + bias : scale(X + bias) - // here we use '=' operator to seperate th default value + // here we use '=' operator to separate the default value // TODO(wangchaochaohu): Later we need to support Tensor input for scale and // bias.
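For reference, a tiny sketch of the two scale forms named in the comment above (plain C++ for illustration, not the generated fusion code):

// out = bias_after_scale ? scale * x + bias : scale * (x + bias)
float ScaleOp(float x, float scale, float bias, bool bias_after_scale) {
  return bias_after_scale ? scale * x + bias : scale * (x + bias);
}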
insert_handler( diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index fbd8fda131b6d..f7c1a68c826f0 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -408,6 +408,13 @@ PDNode *PDNode::assert_is_op(const std::string &op_type) { return this; } +PDNode *PDNode::assert_is_not_op_type(const std::string &op_type) { + asserts_.emplace_back([op_type](Node *x) { + return x && x->IsOp() && x->Op()->Type() != op_type; + }); + return this; +} + PDNode *PDNode::assert_is_var() { asserts_.emplace_back([](Node *x) { return x && x->IsVar(); }); return this; @@ -720,7 +727,7 @@ bool HasOutput(Node *op, const std::string &argument) { PADDLE_ENFORCE_EQ( op->IsOp(), true, platform::errors::InvalidArgument( - "First parameter of function HasOuput must be Node::Op")); + "First parameter of function HasOutput must be Node::Op")); auto const &names = op->Op()->OutputNames(); if (std::find(names.begin(), names.end(), argument) == names.end()) return false; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d7e265fe28bf9..3c6b6ce94e23f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -81,6 +81,7 @@ struct PDNode { bool IsVar() const { return type_ == Type::kVar; } const std::string& name() const { return name_; } + const PDPattern* pdpattern() const { return pattern_; } PDNode& operator=(const PDNode&) = delete; PDNode(const PDNode&) = delete; @@ -109,6 +110,7 @@ struct PDNode { // Assertions, helper functions to simplify the pattern definition. PDNode* assert_is_op(); PDNode* assert_is_op(const std::string& op_type); + PDNode* assert_is_not_op_type(const std::string& op_type); PDNode* assert_is_var(); PDNode* assert_var_dtype(proto::VarType::Type dtype); PDNode* assert_is_not_ctrl_var(); @@ -277,7 +279,44 @@ class PDPattern { */ class GraphPatternDetector { public: - using subgraph_t = std::map; + struct NodeIdCompare { + bool operator()(Node* node1, Node* node2) const { + return node1->id() < node2->id(); + } + }; + + struct PDNodeCompare { + bool operator()(const PDNode* node1, const PDNode* node2) const { + auto& nodes1 = node1->pdpattern()->nodes(); + auto& nodes2 = node2->pdpattern()->nodes(); + if (nodes1.size() != nodes2.size()) { + return nodes1.size() < nodes2.size(); + } else { + std::string pdnode_hash_key1 = ""; + std::string pdnode_hash_key2 = ""; + for (auto& node : nodes1) { + pdnode_hash_key1 += node.get()->name(); + pdnode_hash_key1 += "#"; + } + pdnode_hash_key1 += node1->name(); + for (auto& node : nodes2) { + pdnode_hash_key2 += node.get()->name(); + pdnode_hash_key2 += "#"; + } + pdnode_hash_key2 += node2->name(); + + auto pdnode_key1 = + std::to_string(std::hash()(pdnode_hash_key1)); + auto pdnode_key2 = + std::to_string(std::hash()(pdnode_hash_key2)); + + return pdnode_key1 < pdnode_key2; + } + return false; + } + }; + + using subgraph_t = std::map; // Operate on the detected pattern. using handle_t = @@ -321,7 +360,8 @@ class GraphPatternDetector { using hit_rcd_t = std::pair; PDPattern pattern_; - std::map> pdnodes2nodes_; + std::map, PDNodeCompare> + pdnodes2nodes_; }; // some helper methods. 
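The NodeIdCompare/PDNodeCompare comparators introduced above appear intended to key the subgraph maps on stable values rather than raw pointer order, so pattern matching iterates reproducibly across runs; a minimal sketch of that idea with a hypothetical FakeNode type (not the real ir::Node):

#include <map>
#include <set>

struct FakeNode { int id; };  // stand-in for ir::Node, which exposes id()

// Order nodes by id instead of by pointer value, so iteration order is
// stable across runs.
struct FakeNodeIdCompare {
  bool operator()(const FakeNode* a, const FakeNode* b) const {
    return a->id < b->id;
  }
};

using StableNodeSet = std::set<const FakeNode*, FakeNodeIdCompare>;
using StableNodeMap = std::map<const FakeNode*, int, FakeNodeIdCompare>;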
@@ -1017,8 +1057,8 @@ struct Pool : public PatternBase { }; // Elementwise ops -// Forward pass for element-wise operators (add, mul) -// elementwise_mul_out is the result of the operator +// Forward pass for element-wise operators +// elementwise_out is the result of the operator struct Elementwise : public PatternBase { Elementwise(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "elementwise") {} @@ -1393,7 +1433,7 @@ struct PriorBox : public PatternBase { }; // Conv + ElementwiseAdd + an activation -// This pattern can futher fuse the conv related ops after the conv+bn fusion. +// This pattern can further fuse the conv related ops after the conv+bn fusion. struct ConvElementwiseaddAct : public PatternBase { ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {} diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index 290fbe3ea1373..6b91ea4e360df 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h" - #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -29,55 +29,62 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { // -> // pre_op -> scale_out GraphPatternDetector detector; - auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op(); - auto scale_in = detector.mutable_pattern() - ->NewNode("scale_in") - ->assert_is_op_input("scale") - ->AsIntermediate(); + auto scale_in = + detector.mutable_pattern() + ->NewNode("scale_in") + ->assert_is_op_input("scale") + ->assert_more([](Node* x) { return x->outputs.size() == 1UL; }); auto scale_op = detector.mutable_pattern() ->NewNode("scale_fuse") ->assert_is_op("scale") ->assert_op_attr("scale", 1.) ->assert_op_attr("bias", 0.); - auto scale_out = - detector.mutable_pattern() - ->NewNode("scale_out") - ->assert_is_op_output("scale") - // scale's output var should has only one consumer, or it can't be - // removed. 
- ->assert_more([](Node* x) { return x->outputs.size() == 1UL; }); + auto scale_out = detector.mutable_pattern() + ->NewNode("scale_out") + ->assert_is_op_output("scale"); - pre_op->LinksTo({scale_in}); scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); + int found_subgraph_count = 0; GraphPatternDetector::handle_t handler = [&]( const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { Node* scale_op_var = subgraph.at(scale_op); Node* scale_in_var = subgraph.at(scale_in); Node* scale_out_var = subgraph.at(scale_out); - Node* pre_op_var = subgraph.at(pre_op); - // Link pre_op directly to scale_out const std::string scale_in_name = scale_in_var->Name(); const std::string scale_out_name = scale_out_var->Name(); // Remove links in graph GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); - // Modify proto message - auto* pre_op_desc = pre_op_var->Op(); - for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) { - auto* arguments = parameter.mutable_arguments(); - auto it = std::find(arguments->begin(), arguments->end(), scale_in_name); - PADDLE_ENFORCE_NE( - it, arguments->end(), - platform::errors::NotFound( - "Can not find input variable(%s) from scale op(%s).", - scale_in_name, pre_op_desc->Type())); - *it = scale_out_name; + // Modify pre_op_desc + // Link pre_op directly to scale_out + for (auto& node : graph->Nodes()) { + if (node->IsOp()) { + auto* op_desc = node->Op(); + auto out_vars_map = op_desc->Outputs(); + for (auto out_var_map : out_vars_map) { + auto names = out_var_map.second; + bool reset = false; + for (size_t i = 0; i < names.size(); i++) { + if (names[i] == scale_in_name) { + reset = true; + names[i] = scale_out_name; + break; + } + } + if (reset) { + op_desc->SetOutput(out_var_map.first, names); + op_desc->Flush(); + IR_NODE_LINK_TO(node, scale_out_var); + break; + } + } + } } - - IR_NODE_LINK_TO(pre_op_var, scale_out_var); + found_subgraph_count++; }; detector(graph, handler); + AddStatis(found_subgraph_count); } } // namespace ir @@ -86,3 +93,7 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(identity_scale_op_clean_pass, paddle::framework::ir::IdentityScaleOpCleanPass); +REGISTER_PASS_CAPABILITY(identity_scale_op_clean_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "scale", 0)); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 7c517a50e9af4..84a14200cb7a5 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { @@ -68,7 +69,7 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { std::vector weight_decay_values{}; // use map store ? 
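The hunk below switches the IPU optimizer-extract pass from iterating graph->Nodes() to TopologySortOperations, i.e. operators are visited in dependency order. As a rough illustration of what such an ordering computes, here is a Kahn's-algorithm sketch over a hypothetical adjacency list (not the Paddle graph API):

#include <cstddef>
#include <queue>
#include <vector>

// Kahn's algorithm: returns node indices in topological order.
// graph[u] lists the successors of u; assumes the graph is a DAG.
std::vector<int> TopologySort(const std::vector<std::vector<int>>& graph) {
  std::vector<int> in_degree(graph.size(), 0);
  for (const auto& succs : graph)
    for (int v : succs) ++in_degree[v];

  std::queue<int> ready;
  for (std::size_t u = 0; u < graph.size(); ++u)
    if (in_degree[u] == 0) ready.push(static_cast<int>(u));

  std::vector<int> order;
  while (!ready.empty()) {
    int u = ready.front();
    ready.pop();
    order.push_back(u);
    for (int v : graph[u])
      if (--in_degree[v] == 0) ready.push(v);
  }
  return order;
}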
- for (auto* node : graph->Nodes()) { + for (auto* node : TopologySortOperations(*graph)) { if (!node->IsOp()) { continue; } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 9fc6de3c8c172..313b2cc33459e 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -66,7 +66,7 @@ static void ShareVarInfoToCinnLaunch( << paddle::string::join_strings(vars_to_delete, ','); const Graph& subgraph = paddle2cinn::CinnCompiler::GetInstance()->FindGraph( - cinn_launch_op->GetOp()->Attr(operators::kCompilationKey)); + cinn_launch_op->GetOp()->Attr(operators::kCompilationKey)); auto& dst_varinfo_map = subgraph.Get(paddle2cinn::kMemOptVarInfoFromMainGraph); const Name2VarInfoMap& src_varinfo_map = diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc index 60f4e4b309c5d..88bf9e3876399 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc @@ -51,8 +51,7 @@ static ProgramDesc BuildProgramInsideCinnLaunchOp() { return program; } -static ProgramDesc BuildProgramWithCinnLaunchOp( - const std::string& compilation_key) { +static ProgramDesc BuildProgramWithCinnLaunchOp(int64_t compilation_key) { // create a cinn_launch op ProgramDesc program; auto* block = program.MutableBlock(0); @@ -89,7 +88,7 @@ TEST(ShareMemInfoToSubGraphPassTest, test_main_graph_share_varinfo) { auto subgraph = std::make_unique(BuildProgramInsideCinnLaunchOp()); subgraph->GetOrInit( paddle2cinn::kMemOptVarInfoFromMainGraph); - std::string compilation_key = + auto compilation_key = paddle2cinn::CinnCompiler::GetInstance()->AddGraph(std::move(subgraph)); // build test data and apply pass diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..50e751e02dfa0 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc @@ -0,0 +1,272 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ( + eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), + platform::errors::InvalidArgument( + "Tensor elementwise y(%d) and activation bias(%d) must have same " + "dimension.", + eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); + auto* weights_data = weights->mutable_data(platform::CPUPlace()); + + EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; + + // Check for subnormal values that slows down convolution execution + for (int i = 0; i < weights->numel(); ++i) { + if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; + } +} + +ConvAffineChannelFusePass::ConvAffineChannelFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", 
"AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("affine_channel")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("data_layout") + .IsStringIn({"NCHW", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + +void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(name_scope_, graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; + return; + } + + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + auto data_format = conv->Op()->GetAttrIfExists("data_format"); + if (data_format == "AnyLayout") { + LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " + "it's wrong if data_format of conv is not " + "NCHW."; + } + + // Get affine_channel bias for resizing eltwise_y! + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + // Set shape && datatype manually + eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); + eltwise_y_in_desc.SetDataType( + framework::TransToProtoVarType(ac_bias_tensor->dtype())); + eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); + eltwise_y_in_desc.SetPersistable(true); + + // Initialize eltwise_y + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); + + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
+ + GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph, handler); + + AddStatis(found_conv_ac_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_mkldnn_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); + +REGISTER_PASS_CAPABILITY(conv_affine_channel_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("conv2d", 1) + .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000..075b6d7220316 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class Graph; + +class ConvAffineChannelFusePass : public FusePassBase { + public: + ConvAffineChannelFusePass(); + virtual ~ConvAffineChannelFusePass() {} + + protected: + void ApplyImpl(ir::Graph*) const override; + const std::string name_scope_{"conv_affine_channel_mkldnn_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index 62b2be712beef..eebc87f5d9988 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -186,9 +186,22 @@ class DeQuantizer final : public Quanter { // Checking whether a reorder from BF16 to FP32 // should be added after the output to the operator bool IsNotPermittedName(const std::string& output_name) const override { - // XShape is output in transpose2 and reshape2 operators used to store the - // shape and lod of X. So this output do not need dequantize before. 
- return (output_name == "XShape"); + std::unordered_map> block_list{ + {"layer_norm", + {"Mean", "Variance"}}}; // not used in inference in MKLDNN + + std::vector blocked_outputs{"XShape"}; // blocklist for any op + auto op_name = op->Name(); + if (block_list.count(op_name)) { + const auto& op_blocklist = block_list[op_name]; + blocked_outputs.insert(blocked_outputs.begin(), op_blocklist.begin(), + op_blocklist.end()); + } + + return std::any_of(blocked_outputs.begin(), blocked_outputs.end(), + [&output_name](const std::string& name) { + return name == output_name; + }); } std::string get_op_type() const override { return "dequantize"; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc index 877ee71fc2d85..3f5e9a1484841 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc @@ -65,22 +65,20 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, static const std::initializer_list variable_names{ "z", "a", "b", "c", "d", "e", "f", "g", "h", "i"}; -void PreparePass(std::unique_ptr* graph, const ProgramDesc& prog, - const std::initializer_list variable_names, - int* original_nodes_num, int* current_nodes_num) { +void PreparePass(std::unique_ptr& graph, int* original_nodes_num, + int* current_nodes_num) { auto pass = PassRegistry::Instance().Get("cpu_bfloat16_pass"); - *original_nodes_num = (*graph)->Nodes().size(); - (*graph).reset(pass->Apply((*graph).release())); - *current_nodes_num = (*graph)->Nodes().size(); + *original_nodes_num = graph->Nodes().size(); + graph.reset(pass->Apply(graph.release())); + *current_nodes_num = graph->Nodes().size(); } void MainTest(const ProgramDesc& prog, const int& quant_count, const int& dequant_count, const int& added_nodes_count) { - std::unique_ptr graph(new ir::Graph(prog)); + auto graph = std::make_unique(prog); int original_nodes_num, current_nodes_num; - PreparePass(&graph, prog, variable_names, &original_nodes_num, - ¤t_nodes_num); + PreparePass(graph, &original_nodes_num, ¤t_nodes_num); int quantize_nodes_count = 0; int dequantize_nodes_count = 0; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 4aae60b853d4f..a61c043b58065 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -1188,6 +1188,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeMatmul(graph); QuantizeElementwise(graph, "elementwise_add"); QuantizeElementwise(graph, "elementwise_mul"); + QuantizeElementwise(graph, "elementwise_sub"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 22000865948d6..912c16288c2b9 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add" || type == "elementwise_mul") { + } else if (type == "elementwise_add" || type == "elementwise_mul" || + type == "elementwise_sub") { op->SetInput("X", {inputs[0]}); if 
(inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -168,7 +169,7 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); } else if (type == "matmul" || type == "elementwise_add" || - type == "elementwise_mul") { + type == "elementwise_mul" || type == "elementwise_sub") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -565,60 +566,59 @@ ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, return prog; } -void TestElementwise(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwise(std::vector elementwise) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + {elementwise[0], 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, SCALE * S8_MAX); } -void TestElementwiseOutputScaleMissing(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwiseOutputScaleMissing(std::vector elementwise) { int added_nodes = 0; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + {elementwise[0], 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, 1.f, 1.f, "e"); } -void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwiseUnsignedAndSignedInput( + std::vector elementwise) { int added_nodes = 0; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + {elementwise[0], 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, 1.f, 1.f, "", "b"); } -TEST(CpuQuantizePass, elementwise_add) { - TestElementwise("elementwise_add", "ElementwiseAdd"); -} +const std::vector> elementwises = { + {"elementwise_add", "ElementwiseAdd"}, + {"elementwise_mul", "ElementwiseMul"}, + {"elementwise_sub", "ElementwiseSub"}}; -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { - TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); -} +class TestElementwises + : public testing::TestWithParam> {}; -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { - TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); -} +TEST_P(TestElementwises, elementwise_basic) { TestElementwise(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul) { - TestElementwise("elementwise_mul", "ElementwiseMul"); +TEST_P(TestElementwises, elementwise_output_scale_missing) { + TestElementwiseOutputScaleMissing(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { - TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +TEST_P(TestElementwises, elementwise_unsigned_and_signed_input) { + 
TestElementwiseUnsignedAndSignedInput(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { - TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); -} +INSTANTIATE_TEST_CASE_P( + Elementwises, TestElementwises, testing::ValuesIn(elementwises), + [](const ::testing::TestParamInfo& info) { + std::string name = info.param[0]; + return name; + }); const std::vector churn_out_vars(ProgramDesc* prog, const std::string& prefix, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 3b883dac9782a..5b606a89ac90a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -27,9 +27,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { std::unordered_set supported_op_types = std::unordered_set( {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", - "elementwise_mul", "fc", "matmul", "nearest_interp", - "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", - "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); + "elementwise_mul", "elementwise_sub", "fc", "matmul", + "nearest_interp", "nearest_interp_v2", "pool2d", "prior_box", + "reshape2", "transpose2", "fusion_gru", "fusion_lstm", "multi_gru", + "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc new file mode 100644 index 0000000000000..678a8fb4a6955 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
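Stepping back to the cpu_bfloat16_pass hunk earlier in this patch: once the GraphPatternDetector scaffolding is stripped away, the new per-operator output blocklist is a small lookup. A hedged standalone sketch follows; the function name and map layout are illustrative, not Paddle APIs.

#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

// An output needs no bf16 -> fp32 dequantize if it is XShape (blocked for any
// op) or one of the per-op blocked outputs, e.g. layer_norm's Mean/Variance,
// which inference with oneDNN never reads.
bool OutputSkipsDequantize(const std::string& op_type,
                           const std::string& output_name) {
  static const std::unordered_map<std::string, std::vector<std::string>>
      block_list{{"layer_norm", {"Mean", "Variance"}}};
  std::vector<std::string> blocked{"XShape"};
  auto it = block_list.find(op_type);
  if (it != block_list.end()) {
    blocked.insert(blocked.end(), it->second.begin(), it->second.end());
  }
  return std::any_of(
      blocked.begin(), blocked.end(),
      [&output_name](const std::string& name) { return name == output_name; });
}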
+ +#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +Int8ScaleCalculationMkldnnPass::Int8ScaleCalculationMkldnnPass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "AnyLayout"}) + .End(); +} + +void Int8ScaleCalculationMkldnnPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Pointer to graph argument should not be NULL.")); + FusePassBase::Init("int8_scale_calculation_mkldnn_pass", graph); + GraphPatternDetector gpd; + patterns::Conv conv_pattern(gpd.mutable_pattern(), + "int8_scale_calculation_mkldnn_pass"); + conv_pattern(); + + int found_int8_scales_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + + if (!platform::HasOpINT8DataType(conv_op->Op()) || + conv_op->Op()->HasAttr("Sum_scale")) { + return; + } + + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + auto input_names = conv_op->Op()->InputNames(); + bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") != + input_names.end(); + std::vector weights_tz = conv_filter->Var()->GetShape(); + const int groups = + std::max(conv_op->Op()->GetAttrIfExists("groups"), 1); + + const auto& scale_weights_data = + conv_op->Op()->GetAttrIfExists>("Scale_weights"); + const auto& scale_in_data = + conv_op->Op()->GetAttrIfExists("Scale_in"); + + bool is_multi_channel = scale_weights_data.size() > 1; + + int count = 1; + if (is_multi_channel) { + count *= weights_tz[0]; + if (groups > 1) { + count *= weights_tz[1]; + } + } + + if (has_bias && conv_op->Op()->Input("Bias").size() > 0) { + auto bias_scales = std::vector(count); + for (int i = 0; i < count; i++) { + bias_scales[i] = scale_in_data * scale_weights_data[i]; + } + conv_op->Op()->SetAttr("Bias_scales", bias_scales); + } + + const bool& force_fp32_output = + conv_op->Op()->GetAttrIfExists("force_fp32_output"); + const bool& fuse_residual_conn = + conv_op->Op()->GetAttrIfExists("fuse_residual_connection"); + const auto& scale_in_eltwise_data = + conv_op->Op()->GetAttrIfExists("Scale_in_eltwise"); + bool has_activation = + !conv_op->Op()->GetAttrIfExists("fuse_activation").empty(); + float activation_scale = + force_fp32_output + ? 1.0f + : has_activation + ? conv_op->Op()->GetAttrIfExists("Scale_out") + : 1.0f; + auto scale_out_data = + force_fp32_output + ? 1.0f + : has_activation + ? 
1.0f + : conv_op->Op()->GetAttrIfExists("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + conv_op->Op()->SetAttr("Sum_scale", sum_scale); + conv_op->Op()->SetAttr("Output_shift_scale", output_shift_scale); + conv_op->Op()->SetAttr("Activation_scale", activation_scale); + found_int8_scales_count++; + }; + gpd(graph, handler); + AddStatis(found_int8_scales_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(int8_scale_calculation_mkldnn_pass, + paddle::framework::ir::Int8ScaleCalculationMkldnnPass); +REGISTER_PASS_CAPABILITY(int8_scale_calculation_mkldnn_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().LE( + "conv2d", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h new file mode 100644 index 0000000000000..383c4f40fc03d --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; +/* + * compute quantization scales for biases and weights + */ +class Int8ScaleCalculationMkldnnPass : public FusePassBase { + public: + Int8ScaleCalculationMkldnnPass(); + virtual ~Int8ScaleCalculationMkldnnPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc new file mode 100644 index 0000000000000..804d04e35f690 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, + std::vector scale_weights = {1.5f}) { + auto* op = prog->MutableBlock(0)->AppendOp(); + + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetAttr("name", name); + op->SetAttr("strides", std::vector({1, 1})); + op->SetAttr("groups", 1); + op->SetAttr("paddings", std::vector({0, 0})); + op->SetAttr("padding_algorithm", std::string("EXPLICIT")); + op->SetAttr("dilations", std::vector({1, 1})); + op->SetAttr("data_format", std::string("NCHW")); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + + op->SetOutput("Output", outputs); + op->SetAttr("Scale_in", 1.0f); + op->SetAttr("Scale_out", 1.0f); + op->SetAttr("Scale_weights", scale_weights); + op->SetAttr("use_mkldnn", true); + op->SetAttr("mkldnn_data_type", std::string("int8")); + } else { + FAIL() << "Unexpected operator type."; + } +} + +ProgramDesc BuildProgramDesc(bool convWithExistingBias, + std::vector scale_weights = {1.5}) { + ProgramDesc prog; + std::vector nodes{"c", "weights", "f"}; + if (convWithExistingBias) nodes.push_back("conv_bias"); + for (auto& v : nodes) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + if (v == "weights") { + var->SetPersistable(true); + var->SetShape({1, static_cast(scale_weights.size()), 1, 1}); + } + } + + if (convWithExistingBias) { + SetOp(&prog, "conv2d", "conv", + std::vector({"c", "weights", "conv_bias"}), + std::vector({"f"}), scale_weights); + } else if (scale_weights.size() > 1) { + SetOp(&prog, "conv2d", "conv", + std::vector({"c", "weights", "conv_bias"}), + std::vector({"f"}), scale_weights); + } else { + SetOp(&prog, "conv2d", "conv", std::vector({"c", "weights"}), + std::vector({"f"})); + } + + return prog; +} + +void MainTest(bool convWithExistingBias, int removed_nodes_count, float scale, + std::vector scale_weights = {1.5f}) { + auto prog = BuildProgramDesc(convWithExistingBias, scale_weights); + std::unique_ptr graph(new ir::Graph(prog)); + auto pass = + PassRegistry::Instance().Get("int8_scale_calculation_mkldnn_pass"); + int original_nodes_num = graph->Nodes().size(); + graph.reset(pass->Apply(graph.release())); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_EQ(original_nodes_num, current_nodes_num); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + + EXPECT_EQ(op->GetAttrIfExists>("Scale_weights"), + scale_weights); + EXPECT_EQ(op->GetAttrIfExists("Scale_in"), scale); + EXPECT_EQ(op->GetAttrIfExists("Scale_out"), scale); + + EXPECT_EQ(op->GetAttrIfExists("Sum_scale"), scale); + EXPECT_EQ( + op->GetAttrIfExists>("Output_shift_scale")[0], + scale / scale_weights[0]); + EXPECT_EQ(op->GetAttrIfExists("Activation_scale"), scale); + + if (convWithExistingBias) { + EXPECT_EQ(op->GetAttrIfExists>("Bias_scales")[0], + scale * scale_weights[0]); + } + } + } + EXPECT_EQ(original_nodes_num - removed_nodes_count, current_nodes_num); +} + 
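The EXPECT_EQ checks in MainTest above are the scale algebra of int8_scale_calculation_mkldnn_pass evaluated at Scale_in = Scale_out = 1. A standalone restatement of that algebra, as an editorial sketch with illustrative names rather than the pass code itself:

#include <vector>

// Per output channel i:
//   Bias_scales[i]        = Scale_in  * Scale_weights[i]
//   Output_shift_scale[i] = Scale_out / (Scale_in * Scale_weights[i])
//   Sum_scale             = Scale_out / Scale_in_eltwise   (residual fusion only)
struct Int8ConvScales {
  std::vector<float> bias_scales;
  std::vector<float> output_shift_scale;
};

Int8ConvScales ComputeInt8ConvScales(float scale_in, float scale_out,
                                     const std::vector<float>& scale_weights) {
  Int8ConvScales r;
  for (float sw : scale_weights) {
    r.bias_scales.push_back(scale_in * sw);
    // A zero weight scale means the filter channel was all zeros; fall back to
    // scale_out so the requantization stays well defined.
    r.output_shift_scale.push_back(sw == 0.0f ? scale_out
                                              : scale_out / (scale_in * sw));
  }
  return r;
}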
+TEST(Int8ScaleCalculationMkldnnPass, int8_scale_calculation_with_no_bias) { + auto scale = 1.0f; + int removed_nodes_count = 0; + auto scale_weights = {1.5f}; + MainTest(false, removed_nodes_count, scale, scale_weights); +} + +TEST(Int8ScaleCalculationMkldnnPass, int8_scale_calculation_with_bias) { + auto scale = 1.0f; + int removed_nodes_count = 0; + auto scale_weights = {1.5f}; + MainTest(true, removed_nodes_count, scale, scale_weights); +} + +TEST(Int8ScaleCalculationMkldnnPass, + int8_scale_calculation_with_bias_scale_weights) { + auto scale = 1.0f; + int removed_nodes_count = 0; + std::vector scale_weights = {1.5f, 2.3f}; + MainTest(true, removed_nodes_count, scale, scale_weights); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(int8_scale_calculation_mkldnn_pass); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index a8595d55b31b0..4a5947778056a 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -864,7 +864,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, auto* mul0_op_desc = mul0->Op(); // all mul op has same input. - if (multihead_op_desc.HasAttr("Input_scale")) { + if (mul0_op_desc->HasAttr("Input_scale")) { multihead_op_desc.SetAttr("Input_scale", mul0_op_desc->GetAttr("Input_scale")); } diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 281e0b9910619..e436bee035cea 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -488,7 +488,7 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, // Convert weight to fp32 range auto* weight_tensor = scope->Var(quantized_op_weight_node->Name())->GetMutable(); - auto w_dims = weight_tensor->dims(); + const auto& w_dims = weight_tensor->dims(); float* quantized_weight_data = weight_tensor->mutable_data(platform::CPUPlace()); // If quantized op is fc, weight scale size = 1; diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc new file mode 100644 index 0000000000000..67dfe074dc075 --- /dev/null +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc @@ -0,0 +1,298 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +void SkipLayernorm::operator()() { + // Create nodes for skip_layernorm. 
+ auto* skip_layernorm_x = pattern->NewNode(skip_layernorm_x_repr()) + ->assert_is_op_input("skip_layernorm", "X"); + auto* skip_layernorm_y = pattern->NewNode(skip_layernorm_y_repr()) + ->assert_is_op_input("skip_layernorm", "Y"); + auto* skip_layernorm_op = pattern->NewNode(skip_layernorm_op_repr()) + ->assert_is_op("skip_layernorm"); + auto* skip_layernorm_out = pattern->NewNode(skip_layernorm_out_repr()) + ->assert_is_op_output("skip_layernorm", "Out"); + + // Add links for skip_layernorm op. + skip_layernorm_op->LinksFrom({skip_layernorm_x, skip_layernorm_y}) + .LinksTo({skip_layernorm_out}); +} + +void MultiheadMatmul::operator()() { + // Create nodes for multihead_matmul. + auto* multihead_matmul_input = + pattern->NewNode(multihead_matmul_input_repr()) + ->assert_is_op_input("multihead_matmul", "Input"); + auto* multihead_matmul_op = pattern->NewNode(multihead_matmul_op_repr()) + ->assert_is_op("multihead_matmul"); + auto* multihead_matmul_out = + pattern->NewNode(multihead_matmul_out_repr()) + ->assert_is_op_output("multihead_matmul", "Out"); + + // Add links for multihead_matmul op. + multihead_matmul_op->LinksFrom({multihead_matmul_input}) + .LinksTo({multihead_matmul_out}); +} + +void Fc::operator()() { + // Create nodes for fc. + auto* fc_input = + pattern->NewNode(fc_input_repr())->assert_is_op_input("fc", "Input"); + auto* fc_op = pattern->NewNode(fc_op_repr())->assert_is_op("fc"); + auto* fc_out = + pattern->NewNode(fc_out_repr())->assert_is_op_output("fc", "Out"); + + // Add links for fc op. + fc_op->LinksFrom({fc_input}).LinksTo({fc_out}); +} + +void Activation::operator()() { + // Create nodes for activation. + std::unordered_set activation_ops{"relu", "sigmoid", "tanh"}; + auto* activation_input = pattern->NewNode(activation_input_repr()) + ->assert_is_ops_input(activation_ops); + auto* activation_op = + pattern->NewNode(activation_op_repr())->assert_is_ops(activation_ops); + auto* activation_out = pattern->NewNode(activation_out_repr()) + ->assert_is_ops_output(activation_ops); + + // Add links for activation op. 
+ activation_op->LinksFrom({activation_input}).LinksTo({activation_out}); +} +} // namespace patterns + +void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + int found_subgraph_count = 0; + + // Create an remove_padding op node + auto insert_remove_padding_op = [&](Node* input_node, Node* op_node) { + // create op, var in graph + OpDesc remove_padding; + std::string remove_padding_out_name = + input_node->Name() + ".remove_padding"; + + VarDesc remove_padding_out(remove_padding_out_name); + remove_padding_out.SetDataType(input_node->Var()->GetDataType()); + remove_padding_out.SetShape(input_node->Var()->GetShape()); + remove_padding_out.SetPersistable(false); + + // remove_padding_op + remove_padding.SetType("remove_padding"); + + // input + remove_padding.SetInput("Input", {input_node->Name()}); + + // output + remove_padding.SetOutput("Out", {remove_padding_out_name}); + + auto remove_padding_op_node = graph->CreateOpNode(&remove_padding); + auto remove_padding_out_node = graph->CreateVarNode(&remove_padding_out); + + // replace link + for (size_t i = 0; i < input_node->outputs.size(); ++i) { + if (input_node->outputs[i] == op_node) { + input_node->outputs[i] = remove_padding_op_node; + remove_padding_op_node->inputs.push_back(input_node); + } + } + + // link node + IR_NODE_LINK_TO(remove_padding_op_node, remove_padding_out_node); + + // replace link + for (size_t i = 0; i < op_node->inputs.size(); ++i) { + if (op_node->inputs[i] == input_node) { + op_node->inputs[i] = remove_padding_out_node; + remove_padding_out_node->outputs.push_back(op_node); + } + } + + // create variable in scope + scope->Var(remove_padding_out_name); + auto* remove_padding_out_tensor = + scope->FindVar(remove_padding_out_name)->GetMutable(); + remove_padding_out_tensor->mutable_data(platform::CUDAPlace()); + + // rename + op_node->Op()->RenameInput(input_node->Name(), + remove_padding_out_node->Name()); + }; + + // create an remove_padding op node + auto insert_recover_padding_op = [&](Node* op_node, Node* out_node) { + // create op, var in graph + OpDesc recover_padding; + std::string recover_padding_input_name = + out_node->Name() + ".recover_padding"; + VarDesc recover_padding_input(recover_padding_input_name); + recover_padding_input.SetDataType(out_node->Var()->GetDataType()); + recover_padding_input.SetShape(out_node->Var()->GetShape()); + recover_padding_input.SetPersistable(false); + + // recover_padding_op + recover_padding.SetType("recover_padding"); + + // input + recover_padding.SetInput("Input", {recover_padding_input_name}); + + // output + recover_padding.SetOutput("Out", {out_node->Name()}); + + auto recover_padding_op_node = graph->CreateOpNode(&recover_padding); + auto recover_padding_input_node = + graph->CreateVarNode(&recover_padding_input); + + // replace link + for (size_t i = 0; i < op_node->outputs.size(); ++i) { + if (op_node->outputs[i] == out_node) { + op_node->outputs[i] = recover_padding_input_node; + recover_padding_input_node->inputs.push_back(op_node); + } + } + + // link node + IR_NODE_LINK_TO(recover_padding_input_node, recover_padding_op_node); + + // replace link + for (size_t i = 0; i < out_node->inputs.size(); ++i) { + if (out_node->inputs[i] == op_node) { + out_node->inputs[i] = recover_padding_op_node; + recover_padding_op_node->outputs.push_back(out_node); + } + } + + // 
create variable in scope + scope->Var(recover_padding_input_name); + auto* recover_padding_input_tensor = + scope->FindVar(recover_padding_input_name)->GetMutable(); + recover_padding_input_tensor->mutable_data(platform::CUDAPlace()); + + // rename + op_node->Op()->RenameOutput(out_node->Name(), recover_padding_input_name); + }; + + GraphPatternDetector gpd1; + patterns::SkipLayernorm skip_layernorm(gpd1.mutable_pattern(), + "remove_padding_recover_padding_pass"); + skip_layernorm(); + + auto handler1 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: " + "skip_layernorm"; + + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_x, skip_layernorm_x, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_y, skip_layernorm_y, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_op, skip_layernorm_op, + skip_layernorm); + GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_out, skip_layernorm_out, + skip_layernorm); + + insert_remove_padding_op(skip_layernorm_x, skip_layernorm_op); + insert_remove_padding_op(skip_layernorm_y, skip_layernorm_op); + insert_recover_padding_op(skip_layernorm_op, skip_layernorm_out); + + found_subgraph_count++; + }; + gpd1(graph, handler1); + + GraphPatternDetector gpd2; + patterns::MultiheadMatmul multihead_matmul( + gpd2.mutable_pattern(), "remove_padding_recover_padding_pass"); + multihead_matmul(); + + auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: " + "multihead_matmul"; + + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_input, multihead_matmul_input, + multihead_matmul); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_op, multihead_matmul_op, + multihead_matmul); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_out, multihead_matmul_out, + multihead_matmul); + + insert_remove_padding_op(multihead_matmul_input, multihead_matmul_op); + insert_recover_padding_op(multihead_matmul_op, multihead_matmul_out); + + found_subgraph_count++; + }; + gpd2(graph, handler2); + + GraphPatternDetector gpd3; + patterns::Fc fc(gpd3.mutable_pattern(), + "remove_padding_recover_padding_pass"); + fc(); + + auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: fc"; + + GET_IR_NODE_FROM_SUBGRAPH(fc_input, fc_input, fc); + GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fc); + + insert_remove_padding_op(fc_input, fc_op); + insert_recover_padding_op(fc_op, fc_out); + + found_subgraph_count++; + }; + gpd3(graph, handler3); + + GraphPatternDetector gpd4; + patterns::Activation activation(gpd4.mutable_pattern(), + "remove_padding_recover_padding_pass"); + activation(); + + auto handler4 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) + << "remove_padding_recover_padding_pass for transformer: activation"; + + GET_IR_NODE_FROM_SUBGRAPH(activation_input, activation_input, activation); + GET_IR_NODE_FROM_SUBGRAPH(activation_op, activation_op, activation); + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, activation); + + insert_remove_padding_op(activation_input, activation_op); + insert_recover_padding_op(activation_op, activation_out); + + found_subgraph_count++; + }; + gpd4(graph, handler4); + + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + 
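Both helper lambdas above perform the same graph surgery: create a remove_padding or recover_padding op plus a fresh variable, then splice that pair between an existing variable node and the operator that touches it. A condensed editorial sketch of the rewiring, using a toy Node type rather than ir::Node:

#include <algorithm>
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> inputs;   // for an op node: input vars; for a var node: producing ops
  std::vector<Node*> outputs;  // for an op node: output vars; for a var node: consuming ops
};

// Rewire var -> op into var -> new_op -> new_var -> op.
void SpliceBetween(Node* var, Node* op, Node* new_op, Node* new_var) {
  std::replace(var->outputs.begin(), var->outputs.end(), op, new_op);
  new_op->inputs.push_back(var);

  new_op->outputs.push_back(new_var);   // stands in for IR_NODE_LINK_TO
  new_var->inputs.push_back(new_op);

  std::replace(op->inputs.begin(), op->inputs.end(), var, new_var);
  new_var->outputs.push_back(op);
}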
+REGISTER_PASS(remove_padding_recover_padding_pass, + paddle::framework::ir::RemovePaddingRecoverPaddingPass); diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h new file mode 100644 index 0000000000000..d7ccfc75c2000 --- /dev/null +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h @@ -0,0 +1,94 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct SkipLayernorm : public PatternBase { + SkipLayernorm(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "skip_layernorm") {} + + void operator()(); + + PATTERN_DECL_NODE(skip_layernorm_x); + PATTERN_DECL_NODE(skip_layernorm_y); + PATTERN_DECL_NODE(skip_layernorm_op); + PATTERN_DECL_NODE(skip_layernorm_out); +}; + +struct MultiheadMatmul : public PatternBase { + MultiheadMatmul(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul") {} + + void operator()(); + + PATTERN_DECL_NODE(multihead_matmul_input); + PATTERN_DECL_NODE(multihead_matmul_op); + PATTERN_DECL_NODE(multihead_matmul_out); +}; + +struct Fc : public PatternBase { + Fc(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "fc") {} + + void operator()(); + + PATTERN_DECL_NODE(fc_input); + PATTERN_DECL_NODE(fc_op); + PATTERN_DECL_NODE(fc_out); +}; + +struct Activation : public PatternBase { + Activation(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "activation") {} + + void operator()(); + + PATTERN_DECL_NODE(activation_input); + PATTERN_DECL_NODE(activation_op); + PATTERN_DECL_NODE(activation_out); +}; +} // namespace patterns + +class RemovePaddingRecoverPaddingPass : public FusePassBase { + public: + RemovePaddingRecoverPaddingPass() {} + virtual ~RemovePaddingRecoverPaddingPass() {} + + protected: + void ApplyImpl(Graph *graph) const; + const std::string name_scope_{"remove_padding_recover_padding_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc new file mode 100644 index 0000000000000..37e77bc134d3c --- /dev/null +++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc @@ -0,0 +1,161 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/set_transformer_input_convert_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +SetTransformerInputConvertPass::SetTransformerInputConvertPass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); +} +namespace patterns { + +void SetTransformerInputConvert::operator()() { + std::unordered_set lookup_table_ops{"lookup_table", + "lookup_table_v2"}; + // Create nodes for lookup_table1 op. + auto *lookup_table1_x = pattern->NewNode(lookup_table1_x_repr()) + ->assert_is_ops_input(lookup_table_ops, "Ids"); + auto *lookup_table1_w = pattern->NewNode(lookup_table1_w_repr()) + ->assert_is_ops_input(lookup_table_ops, "W"); + auto *lookup_table1_op = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(lookup_table_ops); + auto *lookup_table1_out = pattern->NewNode(lookup_table1_out_repr()) + ->assert_is_ops_output(lookup_table_ops) + ->AsIntermediate() + ->assert_is_op_input("elementwise_add", "X"); + + // Create nodes for lookup_table2 op. + auto *lookup_table2_x = pattern->NewNode(lookup_table2_x_repr()) + ->assert_is_ops_input(lookup_table_ops, "Ids"); + auto *lookup_table2_w = pattern->NewNode(lookup_table2_w_repr()) + ->assert_is_ops_input(lookup_table_ops, "W"); + auto *lookup_table2_op = + pattern->NewNode(lookup_table2_repr())->assert_is_ops(lookup_table_ops); + auto *lookup_table2_out = pattern->NewNode(lookup_table2_out_repr()) + ->assert_is_ops_output(lookup_table_ops) + ->AsIntermediate() + ->assert_is_op_input("elementwise_add", "Y"); + + // Create nodes for elementwise_add op. + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); + auto *elementwise_out = pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_only_output_of_op("elementwise_add"); + + // links nodes. 
+ lookup_table1_op->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + lookup_table2_op->LinksFrom({lookup_table2_x, lookup_table2_w}) + .LinksTo({lookup_table2_out}); + elementwise_op->LinksFrom({lookup_table1_out, lookup_table2_out}) + .LinksTo({elementwise_out}); +} + +} // namespace patterns + +void SetTransformerInputConvertPass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init(name_scope_, graph); + int found_subgraph_count = 0; + + GraphPatternDetector gpd; + patterns::SetTransformerInputConvert fused_pattern( + gpd.mutable_pattern(), "transformer_input_convert_pass"); + fused_pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "transformer_input_convert_pass in op compat failed."; + return; + } + + VLOG(3) << "transformer_input_convert_pass for pos_id, max_seqlen"; + + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, fused_pattern); + + // create op, var in graph + OpDesc new_desc; + new_desc.SetType("transformer_input_convert"); + + // inputs + new_desc.SetInput("X", {lookup_table2_x->Name()}); + + // outputs + std::vector output_0 = {"pos_id_tensor"}; + std::vector output_1 = {"max_seqlen_tensor"}; + new_desc.SetOutput("PosId", output_0); + new_desc.SetOutput("MaxSeqlen", output_1); + + std::string transformer_input_convert_out0_name = "pos_id_tensor"; + std::string transformer_input_convert_out1_name = "max_seqlen_tensor"; + VarDesc transformer_input_convert_out0(transformer_input_convert_out0_name); + VarDesc transformer_input_convert_out1(transformer_input_convert_out1_name); + transformer_input_convert_out0.SetDataType(proto::VarType::INT32); + transformer_input_convert_out1.SetDataType(proto::VarType::INT32); + transformer_input_convert_out0.SetShape({-1}); + transformer_input_convert_out1.SetShape({-1}); + transformer_input_convert_out0.SetPersistable(false); + transformer_input_convert_out1.SetPersistable(false); + + auto new_op_node = graph->CreateOpNode(&new_desc); + auto transformer_input_convert_out0_node = + graph->CreateVarNode(&transformer_input_convert_out0); + auto transformer_input_convert_out1_node = + graph->CreateVarNode(&transformer_input_convert_out1); + + // needn't create variable in scope + + IR_NODE_LINK_TO(lookup_table2_x, new_op_node); + IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out0_node); + IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out1_node); + + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(set_transformer_input_convert_pass, + paddle::framework::ir::SetTransformerInputConvertPass); +REGISTER_PASS_CAPABILITY(set_transformer_input_convert_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("lookup_table", 1) + .LE("lookup_table_v2", 1) + .LE("elementweise_add", 1)); diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h new file mode 100644 index 0000000000000..5a5843e810f9a --- /dev/null +++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// in_var emb in_var emb +// | | | | +// lookup_table lookup_table +// | | +// lkt_var lkt_var +// \ / +// elementwise_add +// | +// elt_out_var +// +struct SetTransformerInputConvert : public PatternBase { + SetTransformerInputConvert(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "transformer_input_convert") {} + + void operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table2); + PATTERN_DECL_NODE(elementwise); + + // declare variable node's name + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(lookup_table2_x); + PATTERN_DECL_NODE(lookup_table2_w); + PATTERN_DECL_NODE(lookup_table2_out); + PATTERN_DECL_NODE(elementwise_out); +}; +} // namespace patterns + +class SetTransformerInputConvertPass : public FusePassBase { + public: + SetTransformerInputConvertPass(); + virtual ~SetTransformerInputConvertPass() {} + + protected: + void ApplyImpl(Graph *graph) const; + const std::string name_scope_{"transformer_input_convert_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index f3d96c3850656..bda6b90386475 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -93,7 +93,7 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( std::vector nodes; std::vector trans_axis0; - int flatten_axis0; + int flatten_axis0 = 0; for (int i = 0; i < times; i++) { PADDLE_ENFORCE_NOT_NULL( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))), diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc new file mode 100644 index 0000000000000..20075a49749f7 --- /dev/null +++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc @@ -0,0 +1,304 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/yolo_box_fuse_pass.h" +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +namespace patterns { +struct YoloBoxPattern : public PatternBase { + YoloBoxPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + // elementwise_div pattern + auto* elt_div_in_x = pattern->NewNode(elt_div_in_x_repr()) + ->assert_is_op_input("elementwise_div", "X"); + auto* elt_div_in_y = pattern->NewNode(elt_div_in_y_repr()) + ->assert_is_op_input("elementwise_div", "Y"); + auto* elt_div = + pattern->NewNode(elt_div_repr())->assert_is_op("elementwise_div"); + auto* elt_div_out = pattern->NewNode(elt_div_out_repr()) + ->assert_is_op_output("elementwise_div", "Out") + ->assert_is_op_input("cast", "X"); + elt_div->LinksFrom({elt_div_in_x, elt_div_in_y}).LinksTo({elt_div_out}); + // cast pattern + auto* cast = pattern->NewNode(cast_repr())->assert_is_op("cast"); + auto* cast_out = pattern->NewNode(cast_out_repr()) + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("yolo_box", "ImgSize"); + cast->LinksFrom({elt_div_out}).LinksTo({cast_out}); +// 3 * (yolo_box + transpose) pattern +#define YOLO_BOX_TRANSPOSE_PATTERN(idx_) \ + auto* yolo_box##idx_##_in_x = pattern->NewNode(yolo_box##idx_##_in_x_repr()) \ + ->assert_is_op_input("yolo_box", "X"); \ + auto* yolo_box##idx_ = \ + pattern->NewNode(yolo_box##idx_##_repr())->assert_is_op("yolo_box"); \ + auto* yolo_box##idx_##_out_boxes = \ + pattern->NewNode(yolo_box##idx_##_out_boxes_repr()) \ + ->assert_is_op_output("yolo_box", "Boxes") \ + ->assert_is_op_nth_input("concat", "X", idx_); \ + auto* yolo_box##idx_##_out_scores = \ + pattern->NewNode(yolo_box##idx_##_out_scores_repr()) \ + ->assert_is_op_output("yolo_box", "Scores") \ + ->assert_is_op_input("transpose2", "X"); \ + yolo_box##idx_->LinksFrom({yolo_box##idx_##_in_x, cast_out}) \ + .LinksTo({yolo_box##idx_##_out_boxes, yolo_box##idx_##_out_scores}); \ + auto* transpose##idx_ = \ + pattern->NewNode(transpose##idx_##_repr())->assert_is_op("transpose2"); \ + auto* transpose##idx_##_out = \ + pattern->NewNode(transpose##idx_##_out_repr()) \ + ->assert_is_op_output("transpose2", "Out") \ + ->assert_is_op_nth_input("concat", "X", idx_); \ + auto* transpose##idx_##_out_xshape = \ + pattern->NewNode(transpose##idx_##_out_xshape_repr()) \ + ->assert_is_op_output("transpose2", "XShape"); \ + transpose##idx_->LinksFrom({yolo_box##idx_##_out_scores}) \ + .LinksTo({transpose##idx_##_out, transpose##idx_##_out_xshape}); + YOLO_BOX_TRANSPOSE_PATTERN(0); + YOLO_BOX_TRANSPOSE_PATTERN(1); + YOLO_BOX_TRANSPOSE_PATTERN(2); +#undef YOLO_BOX_TRANSPOSE_PATTERN + // concat0 pattern + auto* concat0 = pattern->NewNode(concat0_repr())->assert_is_op("concat"); + auto* concat0_out = pattern->NewNode(concat0_out_repr()) + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("multiclass_nms3", "BBoxes"); + concat0 + ->LinksFrom( + {yolo_box0_out_boxes, yolo_box1_out_boxes, yolo_box2_out_boxes}) + .LinksTo({concat0_out}); + // concat1 pattern + auto* concat1 = pattern->NewNode(concat1_repr())->assert_is_op("concat"); + auto* concat1_out = 
pattern->NewNode(concat1_out_repr()) + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("multiclass_nms3", "Scores"); + concat1->LinksFrom({transpose0_out, transpose1_out, transpose2_out}) + .LinksTo({concat1_out}); + // nms pattern + auto* nms = pattern->NewNode(nms_repr())->assert_is_op("multiclass_nms3"); + auto* nms_out = pattern->NewNode(nms_out_repr()) + ->assert_is_op_output("multiclass_nms3", "Out"); + auto* nms_out_index = pattern->NewNode(nms_out_index_repr()) + ->assert_is_op_output("multiclass_nms3", "Index"); + auto* nms_out_rois_num = + pattern->NewNode(nms_out_rois_num_repr()) + ->assert_is_op_output("multiclass_nms3", "NmsRoisNum"); + nms->LinksFrom({concat0_out, concat1_out}) + .LinksTo({nms_out, nms_out_index, nms_out_rois_num}); + } + + // declare operator node's name + PATTERN_DECL_NODE(elt_div); + PATTERN_DECL_NODE(cast); + PATTERN_DECL_NODE(yolo_box0); + PATTERN_DECL_NODE(yolo_box1); + PATTERN_DECL_NODE(yolo_box2); + PATTERN_DECL_NODE(concat0); + PATTERN_DECL_NODE(transpose0); + PATTERN_DECL_NODE(transpose1); + PATTERN_DECL_NODE(transpose2); + PATTERN_DECL_NODE(concat1); + PATTERN_DECL_NODE(nms); + // declare variable node's name + PATTERN_DECL_NODE(elt_div_in_x); + PATTERN_DECL_NODE(elt_div_in_y); + PATTERN_DECL_NODE(elt_div_out); + PATTERN_DECL_NODE(cast_out); + PATTERN_DECL_NODE(yolo_box0_in_x); + PATTERN_DECL_NODE(yolo_box1_in_x); + PATTERN_DECL_NODE(yolo_box2_in_x); + PATTERN_DECL_NODE(yolo_box0_out_boxes); + PATTERN_DECL_NODE(yolo_box1_out_boxes); + PATTERN_DECL_NODE(yolo_box2_out_boxes); + PATTERN_DECL_NODE(yolo_box0_out_scores); + PATTERN_DECL_NODE(yolo_box1_out_scores); + PATTERN_DECL_NODE(yolo_box2_out_scores); + PATTERN_DECL_NODE(concat0_out); + PATTERN_DECL_NODE(transpose0_out); + PATTERN_DECL_NODE(transpose1_out); + PATTERN_DECL_NODE(transpose2_out); + PATTERN_DECL_NODE(transpose0_out_xshape); + PATTERN_DECL_NODE(transpose1_out_xshape); + PATTERN_DECL_NODE(transpose2_out_xshape); + PATTERN_DECL_NODE(concat1_out); + PATTERN_DECL_NODE(nms_out); + PATTERN_DECL_NODE(nms_out_index); + PATTERN_DECL_NODE(nms_out_rois_num); +}; +} // namespace patterns + +YoloBoxFusePass::YoloBoxFusePass() {} + +void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + GraphPatternDetector gpd; + patterns::YoloBoxPattern yolo_box_pattern(gpd.mutable_pattern(), name_scope_); + int found_subgraph_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle YoloBoxFusePass fuse"; +#define GET_IR_NODE(node_) \ + GET_IR_NODE_FROM_SUBGRAPH(node_, node_, yolo_box_pattern) + GET_IR_NODE(elt_div); + GET_IR_NODE(cast); + GET_IR_NODE(yolo_box0); + GET_IR_NODE(yolo_box1); + GET_IR_NODE(yolo_box2); + GET_IR_NODE(concat0); + GET_IR_NODE(transpose0); + GET_IR_NODE(transpose1); + GET_IR_NODE(transpose2); + GET_IR_NODE(concat1); + GET_IR_NODE(nms); + GET_IR_NODE(elt_div_in_x); + GET_IR_NODE(elt_div_in_y); + GET_IR_NODE(elt_div_out); + GET_IR_NODE(cast_out); + GET_IR_NODE(yolo_box0_in_x); + GET_IR_NODE(yolo_box1_in_x); + GET_IR_NODE(yolo_box2_in_x); + GET_IR_NODE(yolo_box0_out_boxes); + GET_IR_NODE(yolo_box1_out_boxes); + GET_IR_NODE(yolo_box2_out_boxes); + GET_IR_NODE(yolo_box0_out_scores); + GET_IR_NODE(yolo_box1_out_scores); + GET_IR_NODE(yolo_box2_out_scores); + GET_IR_NODE(concat0_out); + GET_IR_NODE(transpose0_out); + GET_IR_NODE(transpose1_out); + GET_IR_NODE(transpose2_out); + 
GET_IR_NODE(transpose0_out_xshape); + GET_IR_NODE(transpose1_out_xshape); + GET_IR_NODE(transpose2_out_xshape); + GET_IR_NODE(concat1_out); + GET_IR_NODE(nms_out); + GET_IR_NODE(nms_out_index); + GET_IR_NODE(nms_out_rois_num); +#undef GET_IR_NODE + + auto* block = yolo_box0->Op()->Block(); + +// create yolo_box_head +#define CREATE_YOLO_BOX_HEAD(idx_) \ + framework::OpDesc yolo_box_head##idx_##_op_desc(block); \ + yolo_box_head##idx_##_op_desc.SetType("yolo_box_head"); \ + yolo_box_head##idx_##_op_desc.SetInput("X", \ + {yolo_box##idx_##_in_x->Name()}); \ + yolo_box_head##idx_##_op_desc.SetAttr( \ + "anchors", yolo_box##idx_->Op()->GetAttr("anchors")); \ + yolo_box_head##idx_##_op_desc.SetAttr( \ + "class_num", yolo_box##idx_->Op()->GetAttr("class_num")); \ + yolo_box_head##idx_##_op_desc.SetOutput( \ + "Out", {yolo_box##idx_##_out_boxes->Name()}); \ + yolo_box_head##idx_##_op_desc.Flush(); \ + auto* yolo_box_head##idx_ = \ + graph->CreateOpNode(&yolo_box_head##idx_##_op_desc); \ + IR_NODE_LINK_TO(yolo_box##idx_##_in_x, yolo_box_head##idx_); \ + IR_NODE_LINK_TO(yolo_box_head##idx_, yolo_box##idx_##_out_boxes); + CREATE_YOLO_BOX_HEAD(0); + CREATE_YOLO_BOX_HEAD(1); + CREATE_YOLO_BOX_HEAD(2); +#undef CREATE_YOLO_BOX_HEAD + + // create yolo_box_post + framework::OpDesc yolo_box_post_op_desc(block); + yolo_box_post_op_desc.SetType("yolo_box_post"); + yolo_box_post_op_desc.SetInput("Boxes0", {yolo_box0_out_boxes->Name()}); + yolo_box_post_op_desc.SetInput("Boxes1", {yolo_box1_out_boxes->Name()}); + yolo_box_post_op_desc.SetInput("Boxes2", {yolo_box2_out_boxes->Name()}); + yolo_box_post_op_desc.SetInput("ImageShape", {elt_div_in_x->Name()}); + yolo_box_post_op_desc.SetInput("ImageScale", {elt_div_in_y->Name()}); + yolo_box_post_op_desc.SetAttr("anchors0", + yolo_box0->Op()->GetAttr("anchors")); + yolo_box_post_op_desc.SetAttr("anchors1", + yolo_box1->Op()->GetAttr("anchors")); + yolo_box_post_op_desc.SetAttr("anchors2", + yolo_box2->Op()->GetAttr("anchors")); + yolo_box_post_op_desc.SetAttr("class_num", + yolo_box0->Op()->GetAttr("class_num")); + yolo_box_post_op_desc.SetAttr("conf_thresh", + yolo_box0->Op()->GetAttr("conf_thresh")); + yolo_box_post_op_desc.SetAttr("downsample_ratio0", + yolo_box0->Op()->GetAttr("downsample_ratio")); + yolo_box_post_op_desc.SetAttr("downsample_ratio1", + yolo_box1->Op()->GetAttr("downsample_ratio")); + yolo_box_post_op_desc.SetAttr("downsample_ratio2", + yolo_box2->Op()->GetAttr("downsample_ratio")); + yolo_box_post_op_desc.SetAttr("clip_bbox", + yolo_box0->Op()->GetAttr("clip_bbox")); + yolo_box_post_op_desc.SetAttr("scale_x_y", + yolo_box0->Op()->GetAttr("scale_x_y")); + yolo_box_post_op_desc.SetAttr("nms_threshold", + nms->Op()->GetAttr("nms_threshold")); + yolo_box_post_op_desc.SetOutput("Out", {nms_out->Name()}); + yolo_box_post_op_desc.SetOutput("NmsRoisNum", {nms_out_rois_num->Name()}); + auto* yolo_box_post = graph->CreateOpNode(&yolo_box_post_op_desc); + IR_NODE_LINK_TO(yolo_box0_out_boxes, yolo_box_post); + IR_NODE_LINK_TO(yolo_box1_out_boxes, yolo_box_post); + IR_NODE_LINK_TO(yolo_box2_out_boxes, yolo_box_post); + IR_NODE_LINK_TO(elt_div_in_x, yolo_box_post); + IR_NODE_LINK_TO(elt_div_in_y, yolo_box_post); + IR_NODE_LINK_TO(yolo_box_post, nms_out); + IR_NODE_LINK_TO(yolo_box_post, nms_out_rois_num); + + // delete useless node + GraphSafeRemoveNodes(graph, {elt_div, + cast, + yolo_box0, + yolo_box1, + yolo_box2, + concat0, + transpose0, + transpose1, + transpose2, + concat1, + nms, + elt_div_out, + cast_out, + yolo_box0_out_scores, + 
yolo_box1_out_scores, + yolo_box2_out_scores, + concat0_out, + transpose0_out, + transpose1_out, + transpose2_out, + transpose0_out_xshape, + transpose1_out_xshape, + transpose2_out_xshape, + concat1_out, + nms_out_index}); + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(yolo_box_fuse_pass, paddle::framework::ir::YoloBoxFusePass); diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.h b/paddle/fluid/framework/ir/yolo_box_fuse_pass.h new file mode 100644 index 0000000000000..51dea2431f252 --- /dev/null +++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +/* +1. before fuse: + div + | + cast-----------|-------------| + | | | +yolo_box yolo_box yolo_box + | | | +transpose-| transpose-| transpose-| + |------|-----|-------|------| | + | concat | | + |-----|-------|-------------| + | concat + |-------| + nms3 + +2. after fuse: +yolo_box_head yolo_box_head yolo_box_head + |------------------|------------------| + yolo_box_post +*/ +class YoloBoxFusePass : public FusePassBase { + public: + YoloBoxFusePass(); + virtual ~YoloBoxFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + std::string name_scope_{"yolo_box_fuse_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 7a83fdccc218c..6479f7ae72654 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -148,6 +148,17 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, } } #endif + for (auto& var : main_program.Block(0).AllVars()) { + if (var->Persistable()) { + auto it = std::find(need_merge_var_names_.begin(), + need_merge_var_names_.end(), var->Name()); + if (it == need_merge_var_names_.end() && + var->GetType() != proto::VarType::SELECTED_ROWS) { + VLOG(2) << "train param: " << var->Name(); + trainable_param_.push_back(var->Name()); + } + } + } } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -192,18 +203,30 @@ void MultiTrainer::Run() { #ifdef PADDLE_WITH_HETERPS void MultiTrainer::MergeDenseParam() { -#ifdef PADDLE_WTIH_PSCORE +#ifdef PADDLE_WITH_PSCORE auto communicator = paddle::distributed::Communicator::GetInstance(); - auto& recv_ctx = communicator->GetRecvCtxMap(); - Scope* thread_scope = workers_[0]->GetThreadScope(); - for (auto& iter : recv_ctx) { - auto& varnames = iter.second; - for (auto& name : varnames) { + auto thread_scope = workers_[0]->GetThreadScope(); + if (communicator == nullptr) { + for (auto& name : trainable_param_) { + VLOG(2) << "merge var " << name << " to root scope";
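// A minimal, hypothetical sketch of how the yolo_box_fuse_pass registered above
// can be exercised: build an ir::Graph from a ProgramDesc containing the
// div/cast/yolo_box/concat/nms3 pattern, fetch the pass from PassRegistry and
// apply it. The helper name and the omitted ProgramDesc construction are
// assumptions, not part of this patch; a test target usually also needs
// USE_PASS(yolo_box_fuse_pass); so the registration is linked in.
#include <memory>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/program_desc.h"

std::unique_ptr<paddle::framework::ir::Graph> ApplyYoloBoxFuse(
    const paddle::framework::ProgramDesc& program) {
  namespace ir = paddle::framework::ir;
  auto graph = std::unique_ptr<ir::Graph>(new ir::Graph(program));
  auto pass = ir::PassRegistry::Instance().Get("yolo_box_fuse_pass");
  // Apply() returns the transformed graph; re-own it in the unique_ptr.
  graph.reset(pass->Apply(graph.release()));
  return graph;
}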
Variable* root_var = root_scope_->FindVar(name); LoDTensor* root_tensor = root_var->GetMutable(); Variable* var = thread_scope->FindVar(name); LoDTensor* tensor = var->GetMutable(); - TensorCopy((*tensor), root_tensor->place(), root_tensor); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } + } else { + auto& recv_ctx = communicator->GetRecvCtxMap(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + VLOG(2) << "merge var " << name << " to root scope"; + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } } } #endif @@ -236,11 +259,7 @@ void MultiTrainer::Finalize() { } LoDTensor* root_tensor = root_var->GetMutable(); -#ifdef PADDLE_WITH_HETERPS - for (size_t j = 0; j < places_.size(); j++) { -#else for (int j = 1; j < thread_num_; j++) { -#endif Scope* cur_thread_scope = workers_[j]->GetThreadScope(); Variable* thread_var = cur_thread_scope->FindVar(need_merge_var_names_[i]); diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index 392d6c78f9c70..fb79712d47d9e 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -408,6 +408,7 @@ int StatisticsEngine::Stat(const platform::NodeTrees& trees) { // See InterpreterCore::RunInstruction for details. if (child->Type() == platform::TracerEventType::Operator && cur_node->Name() == "compute") { + removed.insert(cur_node); removed.insert(child); } q.push(child); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 6735406aacde7..da2fd0c8c6114 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -277,7 +277,7 @@ void InterpreterCore::Convert( } for (size_t i = 0; i < vec_instruction_.size(); ++i) { - // checkout ouput + // checkout output for (auto& item : vec_instruction_[i].Outputs()) { for (auto var_id : item.second) { if (input_var2op_info_.at(var_id).size() == 0) { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index d6de37a72c772..f601a4ad28bd7 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -666,7 +666,7 @@ std::map> get_downstream_map( VLOG(6) << "downstream count: " << downstream_map_count(); VLOG(6) << "downstream_map: " << std::endl << downstream_map_to_str(); - // step2: remove unneccessary downstream ops + // step2: remove unnecessary downstream ops // for example, a->b->c // a: b, c // b: c diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.h b/paddle/fluid/framework/new_executor/workqueue/workqueue.h index e9c658e3b9dc6..2c2576528fe0e 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.h @@ -89,7 +89,7 @@ struct WorkQueueOptions { // If you need to blocking the calling thread to wait "queue empty", set // track_task = true and set events_waiter. EventsWaiter::WaitEvent will // block the calling thread until any of events (including "queue empty") - // occured. + // occurred. 
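// The MergeDenseParam() change above boils down to a synchronous per-parameter
// copy from worker thread scope 0 into the root scope. This stand-alone sketch
// restates that single step; the function and variable names are illustrative
// assumptions, not from the patch. TensorCopySync replaces the asynchronous
// TensorCopy so the merge has finished before the root scope is read.
#include <string>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor_util.h"

void MergeOneDenseParam(const std::string& name,
                        paddle::framework::Scope* thread_scope,
                        paddle::framework::Scope* root_scope) {
  using paddle::framework::LoDTensor;
  auto* src = thread_scope->FindVar(name)->GetMutable<LoDTensor>();
  auto* dst = root_scope->FindVar(name)->GetMutable<LoDTensor>();
  // Blocking copy: returns only after *src has been copied into *dst.
  paddle::framework::TensorCopySync(*src, dst->place(), dst);
}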
bool track_task; // If you need to be noticed when a WorkQueue Destruct() , set detached = // false and set events_waiter. diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index a2e9d972c48bc..8b1f0942f820a 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -82,9 +82,9 @@ class OpKernelType { inline std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) { - os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" - << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ - << "]:library_type[" << kernel_key.library_type_ << "]"; + os << "{data_type[" << kernel_key.data_type_ << "]; data_layout[" + << kernel_key.data_layout_ << "]; place[" << kernel_key.place_ + << "]; library_type[" << kernel_key.library_type_ << "]}"; return os; } diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 3879a7957600d..20f695d40568e 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -27,16 +27,15 @@ TEST(OpKernelType, ToString) { LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), - "data_type[float]:data_layout[NCHW]:place[Place(cpu)]:library_type[" - "CUDNN]"); + "{data_type[float]; data_layout[NCHW]; place[Place(cpu)]; " + "library_type[CUDNN]}"); using CUDAPlace = paddle::platform::CUDAPlace; OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), - "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" - "Place(gpu:0)]:library_" - "type[CUDNN]"); + "{data_type[::paddle::platform::float16]; data_layout[NCHW]; " + "place[Place(gpu:0)]; library_type[CUDNN]}"); } TEST(OpKernelType, Hash) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 18287f0c7a4ee..d8eab0e9a7297 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1908,7 +1908,8 @@ Scope* OperatorWithKernel::PrepareData( (var->IsType() == true) && (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) && (paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout() == DataLayout::kNHWC)) { + .get_cur_paddle_data_layout() == DataLayout::kNHWC) && + (tensor_in->dims().size() >= 3)) { // Mixed execution : MKL-DNN and GPU is not supported! 
if (!new_scope) { new_scope = &scope.NewScope(); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index e259d6d417a5c..295510cdb1cf2 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -487,7 +487,7 @@ void AddLinkToCinnOp(const GraphNodeSet& cluster_inputs, void AddCinnOpToGraph(const GraphNodeSet& cluster, const GraphNodeSet& cluster_inputs, const GraphNodeSet& cluster_outputs, - const std::string& compilation_key, + int64_t compilation_key, const std::unordered_set& deny_var_set, Graph* graph) { // Add the cinn launch op @@ -511,7 +511,7 @@ void AddCinnOpToGraph(const GraphNodeSet& cluster, ExtractOpRole(cluster)); cinn_op_desc.Flush(); auto* cinn_op_node = graph->CreateOpNode(&cinn_op_desc); - // Add new links from or to the the cinn launch op node + // Add new links from or to the cinn launch op node AddLinkToCinnOp(cluster_inputs, cluster_outputs, cinn_op_node); VLOG(4) << "Add op [" << kCinnLaunchOp << "] into graph."; @@ -536,7 +536,7 @@ void RemoveSubGraphFromGraph(const GraphNodeSet& cluster, void ReplaceSubGraphWithCinnOpNode( const GraphNodeSet& cluster, const GraphNodeSet& cluster_inputs, const GraphNodeSet& cluster_outputs, const GraphNodeSet& cluster_internals, - const std::string& compilation_key, + int64_t compilation_key, const std::unordered_set& deny_var_set, Graph* graph) { // Add the cinn op node whose name is "kCinnLaunchOp" into graph AddCinnOpToGraph(cluster, cluster_inputs, cluster_outputs, compilation_key, @@ -545,6 +545,15 @@ void ReplaceSubGraphWithCinnOpNode( RemoveSubGraphFromGraph(cluster, cluster_internals, graph); } +static bool IsInplaceOp(const OpDesc& op_desc) { + auto inputs = op_desc.InputArgumentNames(); + std::unordered_set input_set(inputs.begin(), inputs.end()); + for (auto& name : op_desc.OutputArgumentNames()) { + if (input_set.count(name) > 0) return true; + } + return false; +} + // Search all subgraphs which all op node supported by CINN, // Here we using SubgraphDetector to detecte the subgraph that // all of op node supported by CINN. 
We using OpMapperRegistry @@ -565,9 +574,10 @@ void SearchAllSubgraphs(Graph* graph) { if (deny_ops.size()) { return registered && !deny_ops.count(node->Name()); } + // if the user doesn't set FLAGS_allow_cinn_ops and FLAGS_deny_cinn_ops, // return true only when it is registered in CINN - return registered; + return registered && (node->IsOp() && !IsInplaceOp(*node->Op())); }; VLOG(4) << "The allowed Cinn Ops: " << FLAGS_allow_cinn_ops; VLOG(4) << "The denied Cinn Ops: " << FLAGS_deny_cinn_ops; @@ -603,7 +613,7 @@ void SearchAllSubgraphs(Graph* graph) { // Create a new subgraph according to the found cluster and // save it in CinnCompiler - std::string compilation_key = cinn_compiler->AddGraph(CreateNewSubGraph( + auto compilation_key = cinn_compiler->AddGraph(CreateNewSubGraph( cluster_set, cluster_internals, cluster_inputs, cluster_outputs)); VLOG(4) << "Compilation Key:\n" << cinn_compiler->ReadableKey(compilation_key); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index c11c7124b6277..d593aadc02c73 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -90,12 +90,12 @@ inline bool CheckGraphIndependence(const std::unordered_set& nodes) { } // Get compilation_key values -std::vector GetCompilationKeys(const Graph& graph) { - std::vector compilation_keys; +std::vector GetCompilationKeys(const Graph& graph) { + std::vector compilation_keys; for (auto& node : graph.Nodes()) { if (node->IsOp() && node->Name() == kCinnLaunchOp) { compilation_keys.emplace_back(BOOST_GET_CONST( - std::string, node->Op()->GetAttr(operators::kCompilationKey))); + int64_t, node->Op()->GetAttr(operators::kCompilationKey))); } } return compilation_keys; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc index 499d243b25f8f..9b5ce876c256f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -77,22 +78,17 @@ bool CinnCacheKey::operator==(const CinnCacheKey& other) const { input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_; } -size_t CinnCacheKey::Hash::hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); -} - size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { - std::size_t ret = 0; + std::ostringstream has_str; - std::hash string_hasher; for (const auto& name_shape : key.input_shapes_) { - ret = hash_combine(ret, string_hasher(name_shape.first)); - ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); + has_str << name_shape.first; + has_str << name_shape.second.to_str(); } - ret = hash_combine(ret, key.graph_hash_val_); - ret = hash_combine(ret, string_hasher(key.arch_str_)); - return ret; + has_str << key.graph_hash_val_; + has_str << key.arch_str_; + return std::hash()(has_str.str()); } size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h index 239e9e561c9fc..d87ea843b9e7d 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -58,7 +58,6 @@ class CinnCacheKey { bool operator!=(const 
CinnCacheKey& other) const; struct Hash { - static size_t hash_combine(size_t seed, size_t value); size_t operator()(const CinnCacheKey& key) const; }; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 51dca93c7c7f0..12f603542066f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -61,8 +61,8 @@ using ::cinn::hlir::framework::BuildScope; using ::cinn::hlir::framework::GraphCompiler; CinnCompiler* CinnCompiler::GetInstance() { - static CinnCompiler instance; - return &instance; + static CinnCompiler* instance = new CinnCompiler(); + return instance; } const CinnCompiledObject& CinnCompiler::Compile( @@ -110,7 +110,7 @@ const CinnCompiledObject& CinnCompiler::Compile( } const CinnCompiledObject& CinnCompiler::Compile( - const std::string& compilation_key, + int64_t compilation_key, const std::map& input_tensors, const Target& target, void* stream) { const auto& graph = FindGraph(compilation_key); @@ -126,12 +126,8 @@ const CinnCompiledObject& CinnCompiler::GetCompiledObject( return *res->second; } -std::string CinnCompiler::AddGraph(std::unique_ptr graph) { - std::string graph_key; - ProgramDesc program; - GraphToProgram(*graph, &program); - program.Proto()->SerializeToString(&graph_key); - +int64_t CinnCompiler::AddGraph(std::unique_ptr graph) { + int64_t graph_key = std::hash()((&(*graph))); PADDLE_ENFORCE_EQ( graphs_.count(graph_key), 0, platform::errors::PreconditionNotMet( @@ -143,16 +139,17 @@ std::string CinnCompiler::AddGraph(std::unique_ptr graph) { return graph_key; } -const Graph& CinnCompiler::FindGraph(const std::string& graph_key) const { +const Graph& CinnCompiler::FindGraph(int64_t graph_key) const { + auto it = graphs_.find(graph_key); PADDLE_ENFORCE_NE( - graphs_.count(graph_key), 0, + it, graphs_.end(), platform::errors::PreconditionNotMet( - "Can not find the target graph, of which the key is:\n%s", - ReadableKey(graph_key).c_str())); - return *graphs_.at(graph_key); + "Can not find the target graph, of which the key is: %lld", + graph_key)); + return *it->second; } -std::string CinnCompiler::VizGraph(const std::string& graph_key) const { +std::string CinnCompiler::VizGraph(int64_t graph_key) const { const Graph& graph = FindGraph(graph_key); return VizGraph(graph); } @@ -200,11 +197,24 @@ std::string CinnCompiler::VizGraph(const Graph& graph) const { return dot.Build(); } -std::string CinnCompiler::ReadableKey( - const std::string& compilation_key) const { - proto::ProgramDesc desc; - desc.ParseFromString(compilation_key); - return desc.DebugString(); +std::string CinnCompiler::SerializeKey(int64_t compilation_key) const { + const auto& graph = FindGraph(compilation_key); + + ProgramDesc program; + GraphToProgram(graph, &program); + + std::string serial_graph; + program.Proto()->SerializeToString(&serial_graph); + return serial_graph; +} + +std::string CinnCompiler::ReadableKey(int64_t compilation_key) const { + const auto& graph = FindGraph(compilation_key); + + ProgramDesc program; + GraphToProgram(graph, &program); + + return program.Proto()->DebugString(); } void CinnCompiler::Clear() { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 7e5df6faf0819..a38e8b4c5f674 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -78,21 +78,23 @@ class CinnCompiler { const 
::cinn::common::Target& target, void* stream = nullptr); const CinnCompiledObject& Compile( - const std::string& compilation_key, + int64_t compilation_key, const std::map& input_tensors, const ::cinn::common::Target& target, void* stream = nullptr); const CinnCompiledObject& GetCompiledObject(int64_t cached_index) const; - std::string AddGraph(std::unique_ptr graph); + int64_t AddGraph(std::unique_ptr graph); - const ir::Graph& FindGraph(const std::string& graph_key) const; + const ir::Graph& FindGraph(int64_t graph_key) const; - std::string VizGraph(const std::string& graph_key) const; + std::string VizGraph(int64_t graph_key) const; std::string VizGraph(const ir::Graph& graph) const; - std::string ReadableKey(const std::string& compilation_key) const; + std::string SerializeKey(int64_t compilation_key) const; + + std::string ReadableKey(int64_t compilation_key) const; void Clear(); @@ -115,7 +117,7 @@ class CinnCompiler { const std::map& input_tensors, const CinnCompiledObject& compiled_obj) const; - std::unordered_map> graphs_; + std::unordered_map> graphs_; std::unordered_map cache_by_address_; std::unordered_map diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 44f4424d70d4c..255e318c9fa69 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -59,12 +59,12 @@ std::ostream& operator<<(std::ostream& os, const std::vector& vec) { } // Get compilation_key values -std::vector GetCompilationKeys(const Graph& graph) { - std::vector compilation_keys; +std::vector GetCompilationKeys(const Graph& graph) { + std::vector compilation_keys; for (auto& node : graph.Nodes()) { if (node->IsOp() && node->Name() == kCinnLaunchOp) { compilation_keys.emplace_back(BOOST_GET_CONST( - std::string, node->Op()->GetAttr(operators::kCompilationKey))); + int64_t, node->Op()->GetAttr(operators::kCompilationKey))); } } return compilation_keys; @@ -83,13 +83,12 @@ std::unordered_set ExtractOpTypes(const Graph& graph) { // Get inputs info std::unordered_map> GetInputsInfo( - const std::string& key, const Graph& graph) { + int64_t key, const Graph& graph) { std::unordered_set inputs; for (auto& node : graph.Nodes()) { if (node->IsOp() && node->Name() == kCinnLaunchOp) { - if (BOOST_GET_CONST(std::string, - node->Op()->GetAttr(operators::kCompilationKey)) != - key) { + if (BOOST_GET_CONST(int64_t, node->Op()->GetAttr( + operators::kCompilationKey)) != key) { continue; } for (auto in_var_name : node->Op()->InputArgumentNames()) { @@ -251,8 +250,7 @@ TEST(CinnCompilerTest, Compile) { const auto& compiling_graph = cinn_compiler->FindGraph(compilation_key); viz_graph("compiling_graph.dot", const_cast(&compiling_graph)); - EXPECT_THROW(cinn_compiler->FindGraph("no_existed"), - paddle::platform::EnforceNotMet); + EXPECT_THROW(cinn_compiler->FindGraph(0), paddle::platform::EnforceNotMet); auto inputs_info = GetInputsInfo(compilation_key, *graph); std::unordered_map create_inputs; diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index efbab83f7d0e8..4c95f01ae569f 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -421,7 +421,7 @@ void PruneBackwardImpl(proto::BlockDesc* origin, proto::BlockDesc* pruned) { for (const auto& name : var_names) { if (var_map.count(name)) { // NOTE(zhiqiu): For operator in a conditional block, the related vars - // may not exist in current block, but in its futher block. 
+ // may not exist in current block, but in its further block. *pruned_vars->Add() = var_map[name]; } } diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 9b12870a2bb9b..aec40a5a7ebdd 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -95,8 +95,46 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } +void add_sparse_optimizer( + std::unordered_map& config, // NOLINT + const ::paddle::SparseCommonSGDRuleParameter& sgd_param, + const std::string& prefix = "") { + auto optimizer_name = sgd_param.name(); + if (optimizer_name == "naive") { + config[prefix + "learning_rate"] = sgd_param.naive().learning_rate(); + config[prefix + "initial_range"] = sgd_param.naive().initial_range(); + if (sgd_param.naive().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + if (sgd_param.adagrad().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } + } else if (optimizer_name == "std_adagrad") { + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + if (sgd_param.adagrad().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + if (sgd_param.adam().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } + } +} + void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { - // add for hbmps optimizer config + // optimizer config for hbmps auto fleet_desc_str = trainer_desc.fleet_desc(); google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); auto sparse_table = @@ -105,7 +143,7 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { auto sparse_table_accessor_parameter = sparse_table_accessor.downpour_accessor_param(); auto accessor_class = sparse_table_accessor.accessor_class(); - // gpups' sparse table optimizer config + // NOTE(zhangminxu): gpups' sparse table optimizer config, // now only support single sparse table // auto sparse_table = param_.sparse_table(0); std::unordered_map config; @@ -126,7 +164,14 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { config["max_bound"] = sparse_table_accessor.sparse_sgd_param().weight_bounds()[1]; } + // NOTE(zhangminxu): for DownpourCtrAccessor & DownpourCtrDoubleAccessor, + // optimizer config for embed_w & embedx_w is the same config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + config["mf_learning_rate"] = config["learning_rate"]; + 
config["mf_initial_g2sum"] = config["initial_g2sum"]; + config["mf_initial_range"] = config["initial_range"]; + config["mf_min_bound"] = config["min_bound"]; + config["mf_max_bound"] = config["max_bound"]; } else if (accessor_class == "DownpourSparseValueAccessor") { auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name(); if (optimizer_name == "naive") { @@ -186,71 +231,12 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { accessor_class == "DownpourDoubleUnitAccessor") { config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); - auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name(); - if (optimizer_name == "naive") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().naive().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().naive().initial_range(); - if (sparse_table_accessor.embedx_sgd_param() - .naive() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = - sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0]; - config["mf_max_bound"] = - sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1]; - } - } else if (optimizer_name == "adagrad") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); - config["mf_initial_g2sum"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); - if (sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[0]; - config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[1]; - } - } else if (optimizer_name == "std_adagrad") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); - config["mf_initial_g2sum"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); - if (sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[0]; - config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[1]; - } - } else if (optimizer_name == "adam") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adam().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adam().initial_range(); - if (sparse_table_accessor.embedx_sgd_param() - .adam() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = - sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0]; - config["mf_max_bound"] = - sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1]; - } - } config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + // optimizer config for embed_w and embedx + add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); + add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(), + "mf_"); } - auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); ps_gpu_wrapper->InitializeGPUServer(config); } diff --git a/paddle/fluid/framework/tensor_util.h 
b/paddle/fluid/framework/tensor_util.h index 5186f8fcc1c51..8ce18d89c9b43 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -180,6 +180,11 @@ void TensorFromArray(const T* src, const size_t& array_size, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(dst_place)) { // NOLINT + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } +#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(dst_place)) { // NOLINT memory::Copy( @@ -247,9 +252,7 @@ void TensorFromVector(const std::vector& src, #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, dst_ptr, src_place, src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -448,9 +451,7 @@ inline void TensorToVector(const Tensor& src, #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, dst_ptr, src.place(), src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 2496d4d040e2e..c78f7611b63be 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -37,7 +37,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/port.h" #ifdef PADDLE_WITH_PSLIB -#include +#include "proto/ps.pb.h" #endif namespace paddle { @@ -129,6 +129,7 @@ class MultiTrainer : public TrainerBase { std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; + std::vector trainable_param_; #ifdef PADDLE_WITH_HETERPS std::vector places_; #endif diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 61859264441ab..0937d96ad4c20 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -21,8 +21,8 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/variant.h" -#include "paddle/phi/core/compat/type_defs.h" #include "paddle/utils/small_vector.h" namespace paddle { @@ -39,6 +39,12 @@ class InferNoNeedBufferVarsFN; using VariableNameMap = std::map>; using VariableValueMap = std::map>; +using Attribute = boost::variant< + boost::blank, int, float, std::string, std::vector, std::vector, + std::vector, bool, std::vector, BlockDesc*, int64_t, + std::vector, std::vector, std::vector>; +using AttributeMap = std::unordered_map; + #ifdef PADDLE_WITH_ASCEND_CL using NPUAttribute = boost::variant, diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index e928cbb654839..76f64ab73a64b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -186,7 +186,7 @@ template static void SetForwardDataTypeOfGradVars(const NameVarMap& outs) { for (auto& var_pair : outs) { for (auto& var : var_pair.second) { - // NOTE(zhiqu): The ouput may be NULL because of pruning. + // NOTE(zhiqu): The output may be NULL because of pruning. 
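// With the Attribute variant and the AttributeMap alias now defined directly in
// paddle/fluid/framework/type_defs.h (see the type_defs.h hunk above, which drops
// the phi/core/compat/type_defs.h include), attribute maps keep holding the usual
// heterogeneous values. A tiny illustrative sketch; the attribute names here are
// made up, not taken from the patch.
#include <string>
#include <vector>

#include "paddle/fluid/framework/type_defs.h"

void FillExampleAttrs(paddle::framework::AttributeMap* attrs) {
  (*attrs)["axis"] = 1;                        // int alternative
  (*attrs)["scale"] = 0.5f;                    // float alternative
  (*attrs)["shape"] = std::vector<int>{2, 3};  // std::vector<int> alternative
}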
if (var) { SetForwardDataTypeOfGradVar(var); } diff --git a/paddle/fluid/imperative/layout_autotune.h b/paddle/fluid/imperative/layout_autotune.h index 679612fdf1ae3..df3772b826da1 100644 --- a/paddle/fluid/imperative/layout_autotune.h +++ b/paddle/fluid/imperative/layout_autotune.h @@ -16,8 +16,8 @@ #include #include #include +#include "paddle/fluid/framework/type_defs.h" #include "paddle/phi/common/layout.h" -#include "paddle/phi/core/compat/type_defs.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 38180ba963c38..cfd3813d60d44 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -317,9 +317,11 @@ PreparedOp PrepareImpl( << " | kernel key: " << pt_cpu_kernel_key << " | kernel: " << pt_cpu_kernel; auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); - return PreparedOp(op, empty_ctx, expected_kernel_key, arg_map_fn, - default_kernel_signature, std::move(kernel_signature), - pt_cpu_kernel, cpu_ctx); + return PreparedOp( + op, empty_ctx, + framework::TransPhiKernelKeyToOpKernelType(pt_cpu_kernel_key), + arg_map_fn, default_kernel_signature, std::move(kernel_signature), + pt_cpu_kernel, cpu_ctx); } } } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 03fa46eab5367..c7fd2215eb42a 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -879,7 +879,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { } // TODO(liuyuhui): If BKCL support non-blocking communication, it should be -// fixed as same as multi gpus card trainging. +// fixed as same as multi gpus card training. void Reducer::MarkGroupReady(size_t group_index) { PADDLE_ENFORCE_GE( group_index, next_group_, @@ -957,7 +957,7 @@ void Reducer::FusedAllReduceSchedule(const int run_order, Group &group, // default stream for communicating, so there exist some problems in // synchronization. And need to add a WaitComm there. // TODO(liuyuhui): If BKCL support non-blocking communication, it should be -// fixed as multi gpus card trainging. +// fixed as multi gpus card training. 
#ifdef PADDLE_WITH_XPU_BKCL if (platform::is_xpu_place(group.dense_tensors_[0].place())) { parallel_ctx_->WaitComm(run_order); diff --git a/paddle/fluid/imperative/tests/test_eager.cc b/paddle/fluid/imperative/tests/test_eager.cc index 4a0b99518a63f..3def103ae9aa5 100644 --- a/paddle/fluid/imperative/tests/test_eager.cc +++ b/paddle/fluid/imperative/tests/test_eager.cc @@ -23,10 +23,10 @@ #include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/var_helper.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/compat/type_defs.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7b274339e3cbe..350263bc5457d 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -192,7 +192,7 @@ void Tracer::TraceOpImpl(const std::string& type, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - "trace_op", platform::TracerEventType::Operator, 1); + type, platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index 79b6e057d21fc..08f3c8d4a0fc2 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -13,4 +13,58 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/phi/core/compat/type_defs.h" + +#include +#include +#include +#include + +namespace egr { +class EagerVariable; +} +namespace paddle { +namespace imperative { + +class VariableWrapper; +class SavedVariableWrapperList; +class VarBase; +class OpBase; +class GradOpNode; +class Tracer; + +using WeakNameVarBaseMap = + std::map>>; + +namespace details { +template +struct NameVarMapTrait {}; + +template <> +struct NameVarMapTrait { + using Type = std::map>>; +}; + +template <> +struct NameVarMapTrait { + using Type = std::map; +}; + +template <> +struct NameVarMapTrait { + using Type = + std::map>>; +}; + +} // namespace details + +template +using NameVarMap = typename details::NameVarMapTrait::Type; + +using NameVarBaseMap = NameVarMap; +using NameVariableWrapperMap = NameVarMap; +using NameTensorMap = NameVarMap; + +using VariableWrapperList = std::vector>; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 7fae481f58289..633f481df808b 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -109,7 +109,11 @@ endif() set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME paddle_inference) if(NOT APPLE AND NOT WIN32) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. 
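// The NameVarMapTrait specializations added to paddle/fluid/imperative/type_defs.h
// above select the container type per variable class. This compile-time check is a
// sketch of the intended expansion, assuming the template arguments elided in this
// copy of the diff are the usual shared_ptr<VarBase> ones.
#include <map>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>

#include "paddle/fluid/imperative/type_defs.h"

static_assert(
    std::is_same<
        paddle::imperative::NameVarMap<paddle::imperative::VarBase>,
        std::map<std::string,
                 std::vector<std::shared_ptr<paddle::imperative::VarBase>>>>::value,
    "NameVarMap<VarBase> is resolved through details::NameVarMapTrait");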
- set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") + if (WITH_CUSTOM_DEVICE) + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_custom_device.map") + else() + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") + endif() set_target_properties(paddle_inference_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") # check symbol hidden FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index dab1b9f7b1135..3d1a467565c84 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,5 +1,5 @@ unset(analysis_deps CACHE) -set(analysis_deps # analysis_deps can be extended accross the project +set(analysis_deps # analysis_deps can be extended across the project framework_proto proto_desc graph pass paddle_inference_io executor pretty_log ir_pass_manager CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index a8c29579e12e7..083fc8991192e 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -268,7 +268,7 @@ void LiteSubgraphPass::SetUpEngine( auto nnadapter_model_cache_token = Get>("nnadapter_model_cache_token"); - lite_api::TargetType target_type; + lite_api::TargetType target_type = TARGET(kX86); if (use_gpu) { target_type = TARGET(kCUDA); } else if (use_xpu) { diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index 09494a360270b..0c9f8d7e16558 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ /* - * This file defines the the class to partition a graph. + * This file defines the class to partition a graph. */ #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h index 621c631b8539b..21bfe7582061a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ /* - * This file defines the the class to partition a graph. + * This file defines the class to partition a graph. 
*/ #pragma once diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index e4fc52b6fa744..b73eb624db85b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -139,6 +139,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp( block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + for (auto node : subgraph) { + if (node->NodeType() == Node::Type::kOperation) { + VLOG(5) << "trt subgraph has op: " << (node->Op()->Type()); + } + } for (auto *node : subgraph) { auto *new_block_op = new_block->AppendOp(); @@ -286,7 +291,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // There are models with the same structure but the different parameters, // when running in the 'use_serialize' mode, there is a bug. // serialization is affected by max_batch_size, but calibration is not. - // So we use seperate engine keys in serialization and calibration. + // So we use separate engine keys in serialization and calibration. auto engine_key = GenerateEngineKey( input_names_with_id, output_names_with_id, std::to_string(0), std::to_string(max_batch_size), @@ -377,12 +382,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( trt_engine->SetUseDLA(Get("trt_use_dla")); trt_engine->SetDLACore(Get("trt_dla_core")); trt_engine->SetUseInspector(Get("use_inspector")); - - trt_engine->SetWithErnie( - (graph->Has(framework::ir::kEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass)) || - (graph->Has(framework::ir::kPrelnEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass))); + trt_engine->SetWithErnie(graph->Has(framework::ir::kMultiheadMatmulPass)); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index edec1b1c7d0e4..56cc4aa755bda 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -50,10 +50,10 @@ if(WITH_GPU AND TENSORRT_FOUND) endif() if (WITH_ONNXRUNTIME) - cc_library(analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + cc_library(analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) else (WITH_ONNXRUNTIME) - cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + cc_library(analysis_predictor SRCS analysis_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) endif (WITH_ONNXRUNTIME) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 4827fe6c1ac97..adc3fc46f72ac 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -158,6 +158,19 @@ void AnalysisConfig::EnableNpu(int device_id) { Update(); } +void AnalysisConfig::EnableCustomDevice(const std::string &device_type, + int device_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + use_custom_device_ = true; + custom_device_id_ = 
device_id; + custom_device_type_ = device_type; +#else + LOG(ERROR) << "Please compile with CustomDevice to EnableCustomDevice()"; + use_custom_device_ = false; +#endif + Update(); +} + void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, bool ipu_enable_pipelining, int ipu_batches_per_step) { @@ -324,6 +337,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // fleet exe related CP_MEMBER(dist_config_); + // custom device related. + CP_MEMBER(use_custom_device_); + CP_MEMBER(custom_device_type_); + CP_MEMBER(custom_device_id_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( @@ -539,7 +557,8 @@ void AnalysisConfig::Update() { if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) || ((use_xpu() ^ pass_builder_->use_xpu())) || ((use_npu() ^ pass_builder_->use_npu())) || - ((use_ipu() ^ pass_builder_->use_ipu()))) { + ((use_ipu() ^ pass_builder_->use_ipu())) || + ((use_custom_device() ^ pass_builder_->use_custom_device()))) { if (use_gpu()) { pass_builder_.reset(new GpuPassStrategy); @@ -562,6 +581,12 @@ void AnalysisConfig::Update() { platform::errors::InvalidArgument( "Only one choice can be made between GPU and NPU.")); pass_builder_.reset(new NpuPassStrategy); + } else if (use_custom_device()) { + PADDLE_ENFORCE_EQ( + use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between GPU and CustomDevice.")); + pass_builder_.reset(new CustomDevicePassStrategy); } else { pass_builder_.reset(new CpuPassStrategy); } @@ -588,6 +613,13 @@ void AnalysisConfig::Update() { "Only one choice can be made between GPU and NPU.")); pass_builder_.reset(new NpuPassStrategy( *static_cast(pass_builder_.get()))); + } else if (use_custom_device()) { + PADDLE_ENFORCE_EQ( + use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between GPU and CustomDevice.")); + pass_builder_.reset(new CustomDevicePassStrategy( + *static_cast(pass_builder_.get()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast(pass_builder_.get()))); @@ -601,6 +633,11 @@ void AnalysisConfig::Update() { (pass == "conv_bn_fuse_pass")) { continue; } + // delete_fill_constant_op_pass is not used under trt dynamic shape + if ((!min_input_shape_.empty() || trt_tuned_dynamic_shape_) && + pass == "delete_fill_constant_op_pass") { + continue; + } pass_builder()->AppendPass(pass); } } @@ -733,7 +770,13 @@ void AnalysisConfig::Update() { "but did not have the option -DWITH_IPU compiled.")); #endif } - + if (use_custom_device_) { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW(platform::errors::Unavailable( + "You tried to enable the custom device " + "but did not have the option -DWITH_CUSTOM_DEVICE compiled.")); +#endif + } if (ir_debug_) { pass_builder()->TurnOnDebug(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4f0d4a908380f..09a5bbddba87c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -332,6 +332,15 @@ bool AnalysisPredictor::CreateExecutor() { PADDLE_THROW(platform::errors::Unavailable( "You tried to use IPU forward propagation, but Paddle was not compiled " "with WITH_IPU.")); +#endif + } else if (config_.use_custom_device()) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + place_ = paddle::platform::CustomPlace(config_.custom_device_type()); +#else + PADDLE_THROW(platform::errors::Unavailable( + "You tried to use CustomDevice forward 
propagation, but Paddle was not " + "compiled " + "with WITH_CUSTOM_DEVICE.")); #endif } else { place_ = paddle::platform::CPUPlace(); @@ -1241,6 +1250,12 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( } else if (platform::is_npu_place(place_)) { auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); + } else if (platform::is_custom_place(place_)) { + auto custom_place = place_; + auto paddleplace = static_cast( + static_cast(PaddlePlace::kCUSTOM) + + phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); @@ -1290,6 +1305,12 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } else if (platform::is_npu_place(place_)) { auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); + } else if (platform::is_custom_place(place_)) { + auto custom_place = place_; + auto paddleplace = static_cast( + static_cast(PaddlePlace::kCUSTOM) + + phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); @@ -1710,6 +1731,10 @@ std::unique_ptr CreatePaddlePredictor( #if PADDLE_WITH_TENSORRT USE_TRT_CONVERTER(elementwise_add_weight); +USE_TRT_CONVERTER(elementwise_sub_weight); +USE_TRT_CONVERTER(elementwise_mul_weight); +USE_TRT_CONVERTER(elementwise_div_weight); +USE_TRT_CONVERTER(elementwise_pow_weight); USE_TRT_CONVERTER(elementwise_add_tensor); USE_TRT_CONVERTER(elementwise_sub_tensor); USE_TRT_CONVERTER(elementwise_div_tensor); @@ -1723,6 +1748,8 @@ USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(exp); +USE_TRT_CONVERTER(log); USE_TRT_CONVERTER(sigmoid); USE_TRT_CONVERTER(tanh); USE_TRT_CONVERTER(fc); @@ -1754,6 +1781,8 @@ USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); USE_TRT_CONVERTER(anchor_generator); USE_TRT_CONVERTER(yolo_box); +USE_TRT_CONVERTER(yolo_box_head); +USE_TRT_CONVERTER(arg_max); USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index ecb5eaf982548..e8a1384166aff 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -522,7 +522,7 @@ TEST(Tensor, GpuShareExternalData) { auto out = predictor->GetOutputHandle("fc_1.tmp_2"); auto out_shape = out->shape(); - float* out_data; + float* out_data = nullptr; auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, std::multiplies()) * sizeof(float); diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 290c547c98691..c8a78a168a81c 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -106,6 +106,9 @@ mkdir -p build cd build rm -rf * +# run all test cases before exit +EXIT_CODE=0 + for WITH_STATIC_LIB in ON OFF; do if [ $(echo `uname` | grep "Win") != "" ]; then # TODO(wilber, T8T9): Do we still need to support windows gpu static library @@ -128,8 +131,8 @@ for WITH_STATIC_LIB in ON OFF; do --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ --use_gpu=$use_gpu if [ $? 
-ne 0 ]; then - echo "simple_on_word2vec demo runs fail." - exit 1 + echo "simple_on_word2vec use_gpu:${use_gpu} runs failed " > ${current_dir}/test_summary.txt + EXIT_CODE=1 fi done @@ -153,8 +156,8 @@ for WITH_STATIC_LIB in ON OFF; do --refer=$DATA_DIR/$vis_demo_name/result.txt \ --use_gpu=$use_gpu if [ $? -ne 0 ]; then - echo "vis demo $vis_demo_name runs fail." - exit 1 + echo "vis demo $vis_demo_name use_gpu:${use_gpu} runs failed " >> ${current_dir}/test_summary.txt + EXIT_CODE=1 fi done done @@ -179,8 +182,8 @@ for WITH_STATIC_LIB in ON OFF; do --data=$DATA_DIR/mobilenet/data.txt \ --refer=$DATA_DIR/mobilenet/result.txt if [ $? -ne 0 ]; then - echo "trt demo trt_mobilenet_demo runs fail." - exit 1 + echo "trt_mobilenet_demo runs failed." >> ${current_dir}/test_summary.txt + EXIT_CODE=1 fi fi else @@ -200,8 +203,8 @@ for WITH_STATIC_LIB in ON OFF; do --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ --use_gpu=$use_gpu if [ $? -ne 0 ]; then - echo "simple_on_word2vec demo runs fail." - exit 1 + echo "simple_on_word2vec use_gpu:${use_gpu} runs failed " >> ${current_dir}/test_summary.txt + EXIT_CODE=1 fi done fi @@ -222,8 +225,8 @@ for WITH_STATIC_LIB in ON OFF; do --refer=$DATA_DIR/$vis_demo_name/result.txt \ --use_gpu=$use_gpu if [ $? -ne 0 ]; then - echo "vis demo $vis_demo_name runs fail." - exit 1 + echo "vis demo $vis_demo_name use_gpu:${use_gpu} runs failed " >> ${current_dir}/test_summary.txt + EXIT_CODE=1 fi done done @@ -244,8 +247,8 @@ for WITH_STATIC_LIB in ON OFF; do --data=$DATA_DIR/mobilenet/data.txt \ --refer=$DATA_DIR/mobilenet/result.txt if [ $? -ne 0 ]; then - echo "trt demo trt_mobilenet_demo runs fail." - exit 1 + echo "trt_mobilenet_demo runs failed " >> ${current_dir}/test_summary.txt + EXIT_CODE=1 fi fi @@ -264,10 +267,25 @@ for WITH_STATIC_LIB in ON OFF; do ./onnxruntime_mobilenet_demo \ --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 if [ $? -ne 0 ]; then - echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail." - exit 1 + echo "onnxruntime_mobilenet_demo runs failed " >> ${current_dir}/test_summary.txt + EXIT_CODE=1 fi fi fi done + set +x + +if [[ -f ${current_dir}/test_summary.txt ]];then + echo " " + echo "Summary demo_ci Failed Tests ..." 
+ echo "=====================test summary======================" + echo "The following tests Failed: " + cat ${current_dir}/test_summary.txt + echo "========================================================" + echo " " +fi + +set -x + +exit ${EXIT_CODE} diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 0c68acfe98047..bb966dc5c6c1b 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -224,8 +224,23 @@ void Tensor::CopyFromCpu(const T *data) { "with NPU.")); #endif } else { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_type_id = + static_cast(place_) - static_cast(PlaceType::kCUSTOM); + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + paddle::platform::CustomPlace custom_place( + phi::GetGlobalDeviceType(device_type_id), device_); + auto *t_data = tensor->mutable_data(custom_place); + auto *dev_ctx = static_cast( + pool.Get(custom_place)); + paddle::memory::Copy(custom_place, static_cast(t_data), + paddle::platform::CPUPlace(), data, ele_size, + dev_ctx->stream()); +#else PADDLE_THROW(paddle::platform::errors::InvalidArgument( "The analysis predictor supports CPU, GPU, NPU and XPU now.")); +#endif } } @@ -398,8 +413,20 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, "with NPU.")); #endif } else { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + auto custom_place = t_place; + auto *dev_ctx = static_cast( + pool.Get(custom_place)); + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), custom_place, t_data, + ele_num * sizeof(T), dev_ctx->stream()); +// TODO(wangran16): sync_stream +#else PADDLE_THROW(paddle::platform::errors::InvalidArgument( "The analysis predictor supports CPU, GPU, NPU and XPU now.")); +#endif } } diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc new file mode 100644 index 0000000000000..7706f2d0824e3 --- /dev/null +++ b/paddle/fluid/inference/api/infer_context.cc @@ -0,0 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/infer_context.h" + +namespace paddle {} // namespace paddle diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h new file mode 100644 index 0000000000000..b7a8bf637d872 --- /dev/null +++ b/paddle/fluid/inference/api/infer_context.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/backends/all_context.h" + +namespace paddle { + +class InferCPUContext : public phi::CPUContext { + public: + using phi::CPUContext::SetEigenDevice; +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +class InferGPUContext : public phi::GPUContext { + public: + using phi::GPUContext::SetStream; + using phi::GPUContext::SetEigenDevice; + using phi::GPUContext::SetBlasHandle; + using phi::GPUContext::SetBlasTensorCoreHandle; + using phi::GPUContext::SetBlasTF32Handle; + using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSparseHandle; + // using phi::GPUContext::SetDnnWorkspaceHandle; + using phi::GPUContext::SetComputeCapability; + using phi::GPUContext::SetMaxThreadsPerMultiProcessor; + using phi::GPUContext::SetMultiProcessors; + using phi::GPUContext::SetMaxThreadsPerBlock; + using phi::GPUContext::SetMaxGridDimSize; + using phi::GPUContext::SetDriverVersion; + using phi::GPUContext::SetRuntimeVersion; +}; +#endif +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 3a3e6a0908ea1..4dc80a1d75390 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -571,6 +571,7 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { auto* builder = predictor_.config_.pass_builder(); builder->SetPasses({ "cpu_quantize_pass", "cpu_quantize_squash_pass", + "int8_scale_calculation_mkldnn_pass", }); if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); auto passes = builder->AllPasses(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 9c48d822b4d0d..af6cf88a3224f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -332,6 +332,14 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableNpu(int device_id = 0); /// + /// \brief Turn on CustomDevice. + /// + /// \param device_type device_type the custom device to use. + /// + /// \param device_id device_id the custom device to use (default is 0). + /// + void EnableCustomDevice(const std::string& device_type, int device_id); + /// /// \brief Turn on ONNXRuntime. /// void EnableONNXRuntime(); @@ -366,6 +374,11 @@ struct PD_INFER_DECL AnalysisConfig { /// \return bool Whether the IPU is turned on. /// bool use_ipu() const { return use_ipu_; } + /// \brief A boolean state telling whether the CustomDevice is turned on. + /// + /// \return bool Whether the CustomDevice is turned on. + /// + bool use_custom_device() const { return use_custom_device_; } /// /// \brief A boolean state telling whether the ONNXRuntime is turned on. /// @@ -397,12 +410,23 @@ struct PD_INFER_DECL AnalysisConfig { /// \return int The NPU device id. /// int npu_device_id() const { return npu_device_id_; } - /// \brief Get the the number of IPU device . + /// \brief Get the number of IPU device . /// /// \return int The number of IPU device. 
/// int ipu_device_num() const { return ipu_device_num_; } /// + /// \brief Get the custom device id. + /// + /// \return int The custom device id. + /// + int custom_device_id() const { return custom_device_id_; } + /// \brief Get the custom device type. + /// + /// \return string The custom device type. + /// + std::string custom_device_type() const { return custom_device_type_; } + /// /// \brief Get the initial size in MB of the GPU memory pool. /// /// \return int The initial size in MB of the GPU memory pool. @@ -900,6 +924,11 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // CustomDevice related + bool use_custom_device_{false}; + int custom_device_id_{0}; + std::string custom_device_type_; + // ONNXRuntime related bool use_onnxruntime_{false}; bool enable_ort_optimization_{false}; diff --git a/paddle/fluid/inference/api/paddle_infer_declare.h b/paddle/fluid/inference/api/paddle_infer_declare.h index e8525f440fe7f..44eacc6a70554 100644 --- a/paddle/fluid/inference/api/paddle_infer_declare.h +++ b/paddle/fluid/inference/api/paddle_infer_declare.h @@ -23,5 +23,7 @@ #endif // PADDLE_DLL_INFERENCE #endif // PD_INFER_DECL #else +#ifndef PD_INFER_DECL #define PD_INFER_DECL __attribute__((visibility("default"))) +#endif // PD_INFER_DECL #endif // _WIN32 diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 01988d5f539dc..f9ec41f6c8358 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -82,9 +82,11 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "adaptive_pool2d_convert_global_pass", + "identity_scale_op_clean_pass", // + "adaptive_pool2d_convert_global_pass", // "shuffle_channel_detect_pass", // "quant_conv2d_dequant_fuse_pass", // + "delete_fill_constant_op_pass", // "delete_quant_dequant_op_pass", // "delete_quant_dequant_filter_op_pass", // "delete_weight_dequant_linear_op_pass", // @@ -98,18 +100,22 @@ const std::vector kTRTSubgraphPasses({ "multihead_matmul_fuse_pass_v3", // "skip_layernorm_fuse_pass", // "preln_skip_layernorm_fuse_pass", // - "conv_bn_fuse_pass", // - "unsqueeze2_eltwise_fuse_pass", // - "trt_squeeze2_matmul_fuse_pass", // - "trt_reshape2_matmul_fuse_pass", // - "trt_flatten2_matmul_fuse_pass", // - "trt_map_matmul_v2_to_mul_pass", // - "trt_map_matmul_v2_to_matmul_pass", // - "trt_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "conv_elementwise_add_fuse_pass", // - "tensorrt_subgraph_pass", // - "conv_bn_fuse_pass", // + // "set_transformer_input_convert_pass", // + "conv_bn_fuse_pass", // + "unsqueeze2_eltwise_fuse_pass", // + "trt_squeeze2_matmul_fuse_pass", // + "trt_reshape2_matmul_fuse_pass", // + "trt_flatten2_matmul_fuse_pass", // + "trt_map_matmul_v2_to_mul_pass", // + "trt_map_matmul_v2_to_matmul_pass", // + "trt_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + // "remove_padding_recover_padding_pass", // + // "delete_remove_padding_recover_padding_pass", // + // "yolo_box_fuse_pass", // + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -282,7 +288,8 @@ void CpuPassStrategy::EnableMKLDNN() { "depthwise_conv_mkldnn_pass", // 
"conv_bn_fuse_pass", // Execute BN passes again to "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_transpose_bn_fuse_pass", // + "conv_affine_channel_mkldnn_fuse_pass", // + "conv_transpose_bn_fuse_pass", // "conv_transpose_eltwiseadd_bn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index db6bde62ddc7c..f01799c646077 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -166,6 +166,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in ipu mode. bool use_ipu() const { return use_ipu_; } + /// \brief Check if we are using CustomDevice. + /// \return A bool variable implying whether we are in CustomDevice mode. + bool use_custom_device() const { return use_custom_device_; } + /// \brief Default destructor. virtual ~PassStrategy() = default; @@ -177,6 +181,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_ipu_{false}; bool use_mkldnn_{false}; bool use_gpu_fp16_{false}; + bool use_custom_device_{false}; /// \endcond }; @@ -291,6 +296,22 @@ class PD_INFER_DECL NpuPassStrategy final : public PassStrategy { } }; +/// \class CustomDevicePassStrategy +/// \brief The CustomDevice passes controller, it is used in AnalysisPredictor +/// with CustomDevice +/// mode. +class PD_INFER_DECL CustomDevicePassStrategy final : public PassStrategy { + public: + CustomDevicePassStrategy() : PassStrategy({}) { use_custom_device_ = true; } + + /// \brief Construct by copying another CustomDevicePassStrategy object. + /// \param[in] other The CustomDevicePassStrategy object we want to copy. + explicit CustomDevicePassStrategy(const CustomDevicePassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_custom_device_ = true; + } +}; + /// \class IpuPassStrategy /// \brief The IPU passes controller, it is used in AnalysisPredictor with IPU /// mode. diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 3cd2df3aef639..11086b369fc15 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -54,7 +54,7 @@ enum DataType { // TODO(Superjomn) support more data types if needed. }; -enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; +enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU, kCUSTOM }; enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW }; diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc new file mode 100644 index 0000000000000..d88f282ce7a62 --- /dev/null +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/api/resource_manager.h" + +#include + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/backends/gpu/forwards.h" +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/generator.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace internal { + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +class EigenGpuStreamDevice : public Eigen::StreamInterface { + public: + EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenGpuStreamDevice() override {} + + void Reinitialize(gpuStream_t cuda_stream, phi::Allocator* allocator, + GPUPlace place) { + stream_ = cuda_stream; + allocator_ = allocator; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const gpuStream_t& stream() const override { return stream_; } + + const gpuDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + if (UNLIKELY(num_bytes == 0)) { + return nullptr; + } + auto buf = allocator_->Allocate(num_bytes); + VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested " + << num_bytes; + void* retv = buf->ptr(); + { + std::lock_guard lock(mtx_); + allocations_.emplace(retv, std::move(buf)); + } + return retv; + } + + void deallocate(void* buffer) const override { + if (LIKELY(buffer)) { + std::lock_guard lock(mtx_); + allocations_.erase(buffer); + } + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#endif + } + return semaphore_; + } + + private: + gpuStream_t stream_; // not owned; + phi::Allocator* allocator_; // not owned; + const gpuDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; + mutable std::mutex mtx_; // to protect allocations_ + mutable std::unordered_map allocations_; +}; +#endif +} // namespace internal + +ResourceManager::ResourceManager(const phi::Place& place, void* stream) + : place_(place) { + InitCPUResource(); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + InitGPUResource(stream); +#endif +} + +ResourceManager::~ResourceManager() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + DestroyGPUResource(); +#endif +} + +void ResourceManager::InitCPUResource() { + cpu_eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* ResourceManager::GetCpuEigenDevice() { + return cpu_eigen_device_.get(); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void ResourceManager::InitGPUResource(void* stream) { + if (stream == nullptr) { + owned_stream_ = true; + phi::InitStream(&stream_); + } else { + owned_stream_ = false; + stream_ = reinterpret_cast(stream); + } + + InitGpuProperties(); + InitGpuEigenDevice(); + InitDnnHanlde(); + InitBlasHandle(); + 
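// Note on the calls that follow: the remaining Init*Handle() calls create the
// cuBLASLt, cuSOLVER and cuSPARSE handles for this ResourceManager. When an
// external stream pointer was supplied to the constructor, owned_stream_ is
// set to false and DestroyGPUResource() skips cudaStreamDestroy /
// hipStreamDestroy, so externally owned streams are never torn down here.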
InitBlasLtHandle(); + InitSolverHandle(); + InitSparseHandle(); +} + +void ResourceManager::DestroyGPUResource() { + if (owned_stream_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); +#endif + stream_ = nullptr; + } + + DestroyDnnHandle(); + DestroyBlasHandle(); + DestroyBlasLtHandle(); + DestroySolverHandle(); + DestroySparseHandle(); +} + +void ResourceManager::InitGpuProperties() { + phi::backends::gpu::GPUDeviceGuard guard(place_.device); + phi::InitGpuProperties(place_, &compute_capability_, &runtime_version_, + &driver_version_, &multi_process_, + &max_threads_per_mp_, &max_threads_per_block_, + &max_grid_dim_size_); +} + +void ResourceManager::InitGpuEigenDevice() { + auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_) + .get(); + eigen_stream_.reset(new internal::EigenGpuStreamDevice()); + eigen_stream_->Reinitialize(stream_, allocator, place_); + gpu_eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); +} + +void ResourceManager::InitDnnHanlde() { + phi::InitDnnHandle(&dnn_handle_, stream_, place_); +} + +void ResourceManager::DestroyDnnHandle() { phi::DestroyDnnHandle(dnn_handle_); } + +void ResourceManager::InitBlasHandle() { + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#endif +#if CUDA_VERSION >= 11000 + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif +#endif +} + +void ResourceManager::DestroyBlasHandle() { + phi::DestroyBlasHandle(blas_handle_); + phi::DestroyBlasHandle(blas_tensor_core_handle_); + phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); +} + +void ResourceManager::InitBlasLtHandle() { + phi::InitBlasLtHandle(&blaslt_handle_); +} + +void ResourceManager::DestroyBlasLtHandle() { + phi::DestroyBlasLtHandle(blaslt_handle_); +} + +void ResourceManager::InitSolverHandle() { + phi::InitSolverHandle(&solver_handle_, stream_); +} + +void ResourceManager::DestroySolverHandle() { + phi::DestroySolverHandle(solver_handle_); +} + +void ResourceManager::InitSparseHandle() { + phi::InitSparseHandle(&sparse_handle_, stream_); +} + +void ResourceManager::DestroySparseHandle() { + phi::DestroySparseHandle(sparse_handle_); +} + +gpuStream_t ResourceManager::GetStream() const { return stream_; } + +dnnHandle_t ResourceManager::GetDnnHandle() const { return dnn_handle_; } + +blasHandle_t ResourceManager::GetBlasHandle() const { return blas_handle_; } + +blasHandle_t ResourceManager::GetBlasTensorCoreHandle() const { + return blas_tensor_core_handle_; +} + +blasHandle_t ResourceManager::GetBlasTF32Handle() const { + return blas_tf32_tensor_core_handle_; +} + +blasLtHandle_t ResourceManager::GetBlasLtHandle() const { + return blaslt_handle_; +} + +phi::solverHandle_t ResourceManager::GetSolverDnHandle() const { + return solver_handle_; +} + +phi::sparseHandle_t ResourceManager::GetSparseHandle() const { + return sparse_handle_; +} + +Eigen::GpuDevice* ResourceManager::GetGpuEigenDevice() const { + return gpu_eigen_device_.get(); +} + +int ResourceManager::GetGpuComputeCapability() const { + return compute_capability_; +} + +int 
ResourceManager::GetGpuRuntimeVersion() const { return runtime_version_; } + +int ResourceManager::GetGpuDriverVersion() const { return driver_version_; } + +int ResourceManager::GetGPUMultiProcessors() const { return multi_process_; } + +int ResourceManager::GetGpuMaxThreadsPerMp() const { + return max_threads_per_mp_; +} + +int ResourceManager::GetGpuMaxThreadsPerBlock() const { + return max_threads_per_block_; +} + +std::array ResourceManager::GetGpuMaxGridDimSize() const { + return max_grid_dim_size_; +} + +#endif +} // namespace paddle diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h new file mode 100644 index 0000000000000..c41968dc58590 --- /dev/null +++ b/paddle/fluid/inference/api/resource_manager.h @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/cpu/forwards.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "paddle/phi/backends/gpu/forwards.h" +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" +#endif + +namespace paddle { +namespace internal { +class EigenGpuStreamDevice; +} // namespace internal + +class ResourceManager { + public: + explicit ResourceManager(const phi::Place& place, void* stream); + ~ResourceManager(); + + public: + Eigen::DefaultDevice* GetCpuEigenDevice(); + + private: + void InitCPUResource(); + + private: + phi::Place place_; + std::unique_ptr cpu_eigen_device_; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + public: + gpuStream_t GetStream() const; + dnnHandle_t GetDnnHandle() const; + blasHandle_t GetBlasHandle() const; + blasHandle_t GetBlasTensorCoreHandle() const; + blasHandle_t GetBlasTF32Handle() const; + blasLtHandle_t GetBlasLtHandle() const; + phi::solverHandle_t GetSolverDnHandle() const; + phi::sparseHandle_t GetSparseHandle() const; + Eigen::GpuDevice* GetGpuEigenDevice() const; + int GetGpuComputeCapability() const; + int GetGpuRuntimeVersion() const; + int GetGpuDriverVersion() const; + int GetGPUMultiProcessors() const; + int GetGpuMaxThreadsPerMp() const; + int GetGpuMaxThreadsPerBlock() const; + std::array GetGpuMaxGridDimSize() const; + + private: + void InitGPUResource(void* stream); + void DestroyGPUResource(); + void InitGpuProperties(); + void InitGpuEigenDevice(); + void InitDnnHanlde(); + void DestroyDnnHandle(); + void InitBlasHandle(); + void DestroyBlasHandle(); + void InitBlasLtHandle(); + void DestroyBlasLtHandle(); + void InitSolverHandle(); + void DestroySolverHandle(); + void InitSparseHandle(); + void DestroySparseHandle(); + + private: + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; + int max_threads_per_block_; + std::array max_grid_dim_size_; + + 
bool owned_stream_{true}; + gpuStream_t stream_; + std::unique_ptr gpu_eigen_device_; + std::unique_ptr eigen_stream_; + + blasHandle_t blas_handle_{nullptr}; + blasHandle_t blas_tensor_core_handle_{nullptr}; + blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; + dnnHandle_t dnn_handle_{nullptr}; + phi::solverHandle_t solver_handle_{nullptr}; + phi::sparseHandle_t sparse_handle_{nullptr}; +// DnnWorkspaceHandle +#endif +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map index 5bb9b8d75620b..05935701635d9 100644 --- a/paddle/fluid/inference/paddle_inference.map +++ b/paddle/fluid/inference/paddle_inference.map @@ -6,4 +6,3 @@ local: *; }; - diff --git a/paddle/fluid/inference/paddle_inference_custom_device.map b/paddle/fluid/inference/paddle_inference_custom_device.map new file mode 100644 index 0000000000000..52bc2870482e2 --- /dev/null +++ b/paddle/fluid/inference/paddle_inference_custom_device.map @@ -0,0 +1,10 @@ +{ + global: + *paddle*; + *Pass*; + *profile*; + *phi*; + *FLAGS_*; + local: + *; +}; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ec8c1b2fcd75c..1910e2f6eb906 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,13 +1,43 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc - batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc - pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc - emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc + SRCS matmul_op.cc + conv2d_op.cc + fc_op.cc + pool2d_op.cc + elementwise_op.cc + batch_norm_op.cc + activation_op.cc + unary_op.cc + softmax_op.cc + concat_op.cc + dropout_op.cc + group_norm_op.cc + pad_op.cc + split_op.cc + prelu_op.cc + leaky_relu_op.cc + gelu_op.cc + layer_norm_op.cc + multihead_matmul_op.cc + shuffle_channel_op.cc + swish_op.cc + instance_norm_op.cc + stack_op.cc + transpose_op.cc + flatten_op.cc + flatten_contiguous_range_op.cc + emb_eltwise_layernorm.cc + skip_layernorm.cc + scale_op.cc + slice_op.cc + hard_sigmoid_op.cc + hard_swish_op.cc + clip_op.cc gather_op.cc anchor_generator_op.cc yolo_box_op.cc + yolo_box_head_op.cc + arg_max_op.cc roi_align_op.cc affine_channel_op.cc multiclass_nms_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/arg_max_op.cc b/paddle/fluid/inference/tensorrt/convert/arg_max_op.cc new file mode 100644 index 0000000000000..14975e481644d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/arg_max_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ArgMaxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid arg_max op to tensorrt topk layer"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + auto input_dims = input->getDimensions(); + int rank = input_dims.nbDims; + int axis = op_desc.HasAttr("axis") + ? BOOST_GET_CONST(int64_t, op_desc.GetAttr("axis")) + : -1; + if (axis > 0) axis -= 1; + if (axis < 0) axis += rank; + auto* topk_layer = TRT_ENGINE_ADD_LAYER( + engine_, TopK, *input, nvinfer1::TopKOperation::kMAX, 1, 1 << axis); + + auto output_name = op_desc.Output("Out")[0]; + bool keepdims = BOOST_GET_CONST(bool, op_desc.GetAttr("keepdims")); + if (keepdims) { + RreplenishLayerAndOutput(topk_layer, "arg_max", + {output_name + "_value", output_name}, + test_mode); + } else { + auto squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *topk_layer->getOutput(1)); + auto dims = input_dims; + dims.nbDims -= 1; + for (int i = axis; i < dims.nbDims; i++) { + dims.d[i] = dims.d[i + 1]; + } + squeeze_layer->setReshapeDimensions(dims); + RreplenishLayerAndOutput(squeeze_layer, "arg_max", {output_name}, + test_mode); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(arg_max, ArgMaxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 8fd0e1bbd068d..35d3ead009720 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -53,20 +53,14 @@ class ElementwiseWeightOpConverter : public OpConverter { auto output_name = op_desc.Output("Out")[0]; weight_data = engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t); nvinfer1::Dims dims_x = X->getDimensions(); + std::vector dims_y = phi::vectorize(Y_t->dims()); auto regist_eltwise_weight = [&](nvinfer1::ScaleMode scale_mode) { - TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; - TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, - 0}; - TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, - 0}; - nvinfer1::IShuffleLayer* expand_layer = nullptr; nvinfer1::IShuffleLayer* squeeze_layer = nullptr; int dynamic_shape_offset = engine_->with_dynamic_shape() ? 
1 : 0; auto input_dim = X->getDimensions(); + // reshape if (input_dim.nbDims < 3 + dynamic_shape_offset) { nvinfer1::Dims expand_shape; expand_shape.nbDims = 3 + dynamic_shape_offset; @@ -85,17 +79,45 @@ class ElementwiseWeightOpConverter : public OpConverter { expand_layer->setName( ("Elewise: Shuffle: (Output: " + output_name + ")").c_str()); } + // eltwise_ops + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; if (op_type_ == "add") { - nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( - engine_, ScaleNd, *X, scale_mode, shift_weights.get(), - scale_weights.get(), power_weights.get(), dynamic_shape_offset); - layer = scale_layer; + shift_weights = TensorRTEngine::Weight( + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + static_cast(Y_t->numel())); + } else if (op_type_ == "sub") { + for (int i = 0; i < Y_t->numel(); i++) { + weight_data[i] = -weight_data[i]; + } + shift_weights = TensorRTEngine::Weight( + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + static_cast(Y_t->numel())); } else if (op_type_ == "mul") { - nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *X, scale_mode, scale_weights.get(), - shift_weights.get(), power_weights.get()); - layer = scale_layer; + scale_weights = TensorRTEngine::Weight( + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + static_cast(Y_t->numel())); + } else if (op_type_ == "div") { + for (int i = 0; i < Y_t->numel(); i++) { + weight_data[i] = 1.f / weight_data[i]; + } + scale_weights = TensorRTEngine::Weight( + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + static_cast(Y_t->numel())); + } else if (op_type_ == "pow") { + power_weights = TensorRTEngine::Weight( + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + static_cast(Y_t->numel())); } + nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, ScaleNd, *X, scale_mode, shift_weights.get(), + scale_weights.get(), power_weights.get(), dynamic_shape_offset); + layer = scale_layer; + // reshape if (input_dim.nbDims < 3 + dynamic_shape_offset) { nvinfer1::Dims squeeze_shape; squeeze_shape.nbDims = input_dim.nbDims; @@ -113,71 +135,43 @@ class ElementwiseWeightOpConverter : public OpConverter { } }; + // dynamic shape if (engine_->with_dynamic_shape()) { - if (Y_t->dims().size() == 1) { - auto scale_mode = nvinfer1::ScaleMode::kCHANNEL; - PADDLE_ENFORCE_EQ(Y_t->dims()[0], dims_x.d[1], - platform::errors::InvalidArgument( - "The Bias's size(%d) should be equal to the " - "first dim(%d) of the Input.", - Y_t->dims()[0], dims_x.d[1])); - regist_eltwise_weight(scale_mode); + if (dims_y.size() == 1 && dims_y[0] == dims_x.d[1]) { + regist_eltwise_weight(nvinfer1::ScaleMode::kCHANNEL); + } else if (dims_y.size() == 1 && dims_y[0] == 1) { + regist_eltwise_weight(nvinfer1::ScaleMode::kUNIFORM); + } else if (dims_y.size() == static_cast(dims_x.nbDims)) { + regist_eltwise_weight(nvinfer1::ScaleMode::kELEMENTWISE); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The size of input bias's dims is %d, but TensorRT dynamic shape " - "only support size = 1 for Elementwise op!", - Y_t->dims().size())); + "The size of input_y's dims is %d, but TensorRT dynamic shape " + "only support size = 1 or size = input_x.size() for Elementwise " + "op!", + dims_y.size())); } return; } + // static shape with dynamic batch std::vector 
no_batch_dims; int start_index = 0; - - for (; start_index < dims_x.nbDims; start_index++) + for (; start_index < dims_x.nbDims; start_index++) { no_batch_dims.push_back(dims_x.d[start_index]); - - auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - - std::vector dims_y = phi::vectorize(Y_t->dims()); - if (dims_y.size() == no_batch_dims.size() + 1) { - if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); } - if (dims_y.size() == 1 && dims_y[0] == no_batch_dims[0]) { - scale_mode = nvinfer1::ScaleMode::kCHANNEL; - } else if (dims_y.size() == no_batch_dims.size() && - dims_y[0] == no_batch_dims[0]) { - scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - for (size_t i = 1; i < no_batch_dims.size(); i++) { - if (dims_y[i] != no_batch_dims[i]) { - scale_mode = nvinfer1::ScaleMode::kCHANNEL; - break; - } - } - if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { - for (size_t i = 1; i < no_batch_dims.size(); i++) { - if (dims_y[i] != 1) - PADDLE_THROW(platform::errors::InvalidArgument( - "The bias's %d dim is %d, but TensorRT dynamic shape only " - "support it equals to 1 for Elementwise op!", - i, dims_y[i])); - } - } + regist_eltwise_weight(nvinfer1::ScaleMode::kCHANNEL); + } else if (dims_y.size() == 1 && dims_y[0] == 1) { + regist_eltwise_weight(nvinfer1::ScaleMode::kUNIFORM); + } else if (dims_y.size() == no_batch_dims.size() + 1) { + regist_eltwise_weight(nvinfer1::ScaleMode::kELEMENTWISE); } else { - if (dims_y.size() >= 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The size of bias's dims is %d and bias's size is %d. TensorRT " - "doesn't support this shape for Elementwise op!", - dims_y.size(), dims_y[0])); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "The size of bias's dims is %d. TensorRT doesn't support " - "this shape for Elementwise op!", - dims_y.size())); - } + PADDLE_THROW(platform::errors::InvalidArgument( + "The size of input_y's dims is %d, but TensorRT dynamic shape " + "only support size = 1 or size = input_x.size() for Elementwise " + "op!", + dims_y.size())); } - regist_eltwise_weight(scale_mode); } protected: @@ -215,7 +209,6 @@ class ElementwiseTensorOpConverter : public OpConverter { auto common_func = [&](nvinfer1::ILayer* layer) { RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); }; - if (dims_x.nbDims == dims_y.nbDims) { // The two input tensor should have the same dims VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; @@ -244,7 +237,6 @@ class ElementwiseTensorOpConverter : public OpConverter { auto* plugin_layer = engine_->AddPlugin( inputs.data(), inputs.size(), reinterpret_cast(plugin)); - layer = plugin_layer; } } @@ -278,6 +270,21 @@ class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter { ElementwiseWeightMulOpConverter() { op_type_ = "mul"; } }; +class ElementwiseWeightSubOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightSubOpConverter() { op_type_ = "sub"; } +}; + +class ElementwiseWeightDivOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightDivOpConverter() { op_type_ = "div"; } +}; + +class ElementwiseWeightPowOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightPowOpConverter() { op_type_ = "pow"; } +}; + class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { public: ElementwiseTensorAddOpConverter() { op_type_ = "add"; } @@ -321,6 +328,12 @@ REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, ElementwiseWeightAddOpConverter); 
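After this rewrite, every weight-side elementwise op (add, sub, mul, div, pow; their REGISTER_TRT_OP_CONVERTER entries appear just before and after this point) is lowered to a single TensorRT ScaleNd layer, out = (in * scale + shift) ^ power, by pre-transforming the constant operand on the CPU. A minimal standalone sketch of that folding, using hypothetical names rather than the converter's actual members, is:

#include <cstddef>
#include <string>

// TensorRT's scale layer computes out = (in * scale + shift) ^ power.
// Each weight-side elementwise op picks one slot and, for sub/div, rewrites
// the constant operand in place first. `op` and `w` stand in for the
// converter's op_type_ and weight_data.
enum class ScaleSlot { kShift, kScale, kPower };

ScaleSlot FoldElementwiseWeight(const std::string& op, float* w, size_t n) {
  if (op == "add") return ScaleSlot::kShift;           // shift = y
  if (op == "sub") {                                    // shift = -y
    for (size_t i = 0; i < n; ++i) w[i] = -w[i];
    return ScaleSlot::kShift;
  }
  if (op == "mul") return ScaleSlot::kScale;            // scale = y
  if (op == "div") {                                    // scale = 1 / y
    for (size_t i = 0; i < n; ++i) w[i] = 1.f / w[i];
    return ScaleSlot::kScale;
  }
  return ScaleSlot::kPower;                             // pow: power = y
}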
REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight, ElementwiseWeightMulOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_sub_weight, + ElementwiseWeightSubOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_div_weight, + ElementwiseWeightDivOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_pow_weight, + ElementwiseWeightPowOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, ElementwiseTensorAddOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 21c79f0edd27f..4b4ad01f5674a 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -56,7 +56,7 @@ class MultiheadMatMulOpConverter : public OpConverter { weight_t->numel() * sizeof(float)); // (hidden_in, 3, hidden_out) - auto weight_dims = weight_t->dims(); + const auto& weight_dims = weight_t->dims(); int hidden_in = weight_dims[0]; // channels_in int three = weight_dims[1]; // channels_out diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index f7eb7f859afaa..0a99b12edc25c 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -67,10 +67,8 @@ class OpConverter { if (op_desc.Type().find("elementwise") != std::string::npos) { static std::unordered_set add_tensor_op_set{ "add", "mul", "sub", "div", "max", "min", "pow"}; - // TODO(xingzhaolong): all mul, sub, div - // static std::unordered_set add_weight_op_set {"add", "mul", - // "sub", "div"}; - static std::unordered_set add_weight_op_set{"add", "mul"}; + static std::unordered_set add_weight_op_set{ + "add", "mul", "sub", "div", "pow"}; PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL, platform::errors::InvalidArgument( "The input op's Input(\"Y\")." 
diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc index 7c5eaa309ef18..13886f55dff01 100644 --- a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc @@ -41,7 +41,7 @@ class ReduceOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a paddle " << op_type << " op to tensorrt reduce layer"; framework::OpDesc op_desc(op, nullptr); - nvinfer1::ReduceOperation reduce_type; + nvinfer1::ReduceOperation reduce_type = nvinfer1::ReduceOperation::kSUM; if (op_type == "reduce_sum") { reduce_type = nvinfer1::ReduceOperation::kSUM; } else if (op_type == "reduce_mean") { diff --git a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc index 26046d38bcbd9..9680e90b2e29d 100644 --- a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc @@ -39,7 +39,7 @@ class StridedSliceOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); nvinfer1::Dims input_dims = input->getDimensions(); - + auto output_name = op_desc.Output("Out")[0]; std::vector axes = BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); std::vector starts = @@ -48,79 +48,116 @@ class StridedSliceOpConverter : public OpConverter { BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); std::vector strides = BOOST_GET_CONST(std::vector, op_desc.GetAttr("strides")); - - nvinfer1::Dims start; - start.nbDims = input_dims.nbDims; int axes_size = axes.size(); - for (int i = 0; i < start.nbDims; i++) { - start.d[i] = 0; - } - for (int i = 0; i < axes_size; i++) { - start.d[axes[i]] = starts[i]; - } - + nvinfer1::Dims start; nvinfer1::Dims stride; - stride.nbDims = input_dims.nbDims; - for (int i = 0; i < stride.nbDims; i++) { - stride.d[i] = 1; - } - for (int i = 0; i < axes_size; i++) { - stride.d[axes[i]] = strides[i]; - } - nvinfer1::Dims size; + start.nbDims = input_dims.nbDims; + stride.nbDims = input_dims.nbDims; size.nbDims = input_dims.nbDims; - for (int i = 0; i < size.nbDims; i++) { - size.d[i] = 1; + for (int i = 0; i < input_dims.nbDims; i++) { + start.d[i] = 0; + stride.d[i] = 1; + size.d[i] = input_dims.d[i]; } - auto output_name = op_desc.Output("Out")[0]; - - auto create_weights = [&](const std::vector& data, - const std::string& type) -> int* { - std::unique_ptr tmp_tensor(new framework::Tensor()); - int data_size = data.size(); - tmp_tensor->Resize({data_size}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = data[i]; + if (!engine_->with_dynamic_shape()) { + for (int i = 0; i < axes_size; i++) { + start.d[axes[i] - 1] = starts[i]; + } + for (int i = 0; i < axes_size; i++) { + stride.d[axes[i] - 1] = strides[i]; + } + for (int i = 0; i < axes_size; ++i) { + int dim = size.d[axes[i] - 1]; + if (dim > 0) { + int start = starts[i] < 0 ? (starts[i] + dim) : starts[i]; + int end = ends[i] < 0 ? 
(ends[i] + dim) : ends[i]; + int stride = std::abs(strides[i]); + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim); + size.d[axes[i] - 1] = (std::abs(end - start) + stride - 1) / stride; + } + } + auto* layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride); + RreplenishLayerAndOutput(layer, "strided_slice", {output_name}, + test_mode); + } else { + for (int i = 0; i < axes_size; i++) { + start.d[axes[i]] = starts[i]; + } + for (int i = 0; i < axes_size; i++) { + stride.d[axes[i]] = strides[i]; + } + for (int i = 0; i < axes_size; ++i) { + int dim = size.d[axes[i]]; + if (dim > 0) { + int start = starts[i] < 0 ? (starts[i] + dim) : starts[i]; + int end = ends[i] < 0 ? (ends[i] + dim) : ends[i]; + int stride = std::abs(strides[i]); + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim); + size.d[axes[i]] = (std::abs(end - start) + stride - 1) / stride; + } } - engine_->SetWeights(output_name + "_add_slice_op_" + type, - std::move(tmp_tensor)); - return tmp_data; - }; + auto create_weights = [&](const std::vector& data, + const std::string& type) -> int* { + std::unique_ptr tmp_tensor(new framework::Tensor()); + int data_size = data.size(); + tmp_tensor->Resize({data_size}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = data[i]; + } + + engine_->SetWeights(output_name + "_add_slice_op_" + type, + std::move(tmp_tensor)); + return tmp_data; + }; + + std::vector const_weight(input_dims.nbDims, 0); + for (int i = 0; i < axes_size; i++) { + int dim = input_dims.d[axes[i]]; + int start = starts[i] < 0 ? (starts[i] + dim) : starts[i]; + int end = ends[i] < 0 ? (ends[i] + dim) : ends[i]; + int stride = std::abs(strides[i]); + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim); + const_weight[axes[i]] = + dim - ((std::abs(end - start) + stride - 1) / stride); + } - std::vector const_weight(input_dims.nbDims, 1); - for (int i = 0; i < axes_size; i++) { - const_weight[axes[i]] = strides[i]; + int* weight_data = create_weights(const_weight, "size"); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32, + static_cast(weight_data), + static_cast(input_dims.nbDims)}; + + int input_dim_size = input_dims.nbDims; + nvinfer1::Dims input_shape; + input_shape.nbDims = 1; + input_shape.d[0] = input_dim_size; + + auto const_layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); + + auto shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + // slice layer + auto* layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride); + // elementwise layer for get size tensor + auto size_layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *shape_layer->getOutput(0), + *const_layer->getOutput(0), nvinfer1::ElementWiseOperation::kSUB); + layer->setInput(2, *size_layer->getOutput(0)); + RreplenishLayerAndOutput(layer, "strided_slice", {output_name}, + test_mode); } - - int* weight_data = create_weights(const_weight, "size"); - - TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32, - static_cast(weight_data), - static_cast(input_dims.nbDims)}; - - int input_dim_size = input_dims.nbDims; - nvinfer1::Dims input_shape; - input_shape.nbDims = 1; - input_shape.d[0] = input_dim_size; - - auto const_layer = - TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); - - auto shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); - - auto size_layer = TRT_ENGINE_ADD_LAYER( - 
engine_, ElementWise, *shape_layer->getOutput(0), - *const_layer->getOutput(0), nvinfer1::ElementWiseOperation::kDIV); - - auto* layer = - TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride); - layer->setInput(2, *size_layer->getOutput(0)); - - RreplenishLayerAndOutput(layer, "strided_slice", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index b2e394d14eba2..0b9a6917dd972 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -52,7 +52,7 @@ class SwishOpConverter : public OpConverter { PADDLE_ENFORCE_EQ( output_num, 1UL, platform::errors::InvalidArgument( - "The ouput Out's size must equal to 1 in TRT swish op. " + "The output Out's size must equal to 1 in TRT swish op. " "But received Out's size %u.", output_num)); // Get attrs @@ -75,7 +75,7 @@ class SwishOpConverter : public OpConverter { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta, with_fp16); - layer = engine_->AddPlugin(&input, input_num, plugin); + layer = engine_->AddPluginV2Ext(&input, input_num, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/unary_op.cc b/paddle/fluid/inference/tensorrt/convert/unary_op.cc new file mode 100644 index 0000000000000..aa3d38ebe2073 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/unary_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class UnaryOpConverter : public OpConverter { + public: + UnaryOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. 
+ framework::OpDesc op_desc(op, nullptr); + VLOG(3) << "convert a fluid unary op to tensorrt unary layer whose " + "type is " + << op_type_; + nvinfer1::ITensor* input_tensor = + engine_->GetITensor(op_desc.Input("X")[0]); + auto op_pair = ops.find(op_type_); + nvinfer1::IUnaryLayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Unary, *input_tensor, op_pair->second); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode); + } + + protected: + std::string op_type_; + static const std::unordered_map ops; +}; + +const std::unordered_map + UnaryOpConverter::ops = { + {"exp", nvinfer1::UnaryOperation::kEXP}, + {"log", nvinfer1::UnaryOperation::kLOG}, +}; + +class ExpOpConverter : public UnaryOpConverter { + public: + ExpOpConverter() { op_type_ = "exp"; } +}; + +class LogOpConverter : public UnaryOpConverter { + public: + LogOpConverter() { op_type_ = "log"; } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(exp, ExpOpConverter); +REGISTER_TRT_OP_CONVERTER(log, LogOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc new file mode 100644 index 0000000000000..04276d94bf5e1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_head_op.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class YoloBoxHeadOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a yolo_box_head op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + auto* x_tensor = engine_->GetITensor(op_desc.Input("X").front()); + std::vector anchors = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchors")); + int class_num = BOOST_GET_CONST(int, op_desc.GetAttr("class_num")); + + auto* yolo_box_plugin = new plugin::YoloBoxHeadPlugin(anchors, class_num); + std::vector yolo_box_inputs; + yolo_box_inputs.push_back(x_tensor); + auto* yolo_box_head_layer = engine_->network()->addPluginV2( + yolo_box_inputs.data(), yolo_box_inputs.size(), *yolo_box_plugin); + std::vector output_names; + output_names.push_back(op_desc.Output("Out").front()); + RreplenishLayerAndOutput(yolo_box_head_layer, "yolo_box_head", output_names, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(yolo_box_head, YoloBoxHeadOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 33386c746ae5a..00a6b2ffbf923 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -90,7 +90,9 @@ void TensorRTEngine::FreezeNetwork() { bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8); if (enable_int8) { - infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); + if (!use_dla_) { + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); + } infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); if (calibrator_) { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index b44450e7a8212..690bc173c77cf 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -65,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { "conv2d_fusion", "pool2d", "relu", + "exp", + "log", "softmax", "sigmoid", "hard_swish", @@ -77,6 +79,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_sub", "elementwise_mul", "elementwise_div", + "elementwise_pow", "dropout", "prelu", "conv2d_transpose", @@ -98,6 +101,8 @@ struct SimpleOpTypeSetTeller : public Teller { "gather", "gather_nd", "yolo_box", + "yolo_box_head", + "arg_max", "roi_align", "affine_channel", "nearest_interp", @@ -128,6 +133,8 @@ struct SimpleOpTypeSetTeller : public Teller { "conv2d_fusion", "pool2d", "relu", + "exp", + "log", "softmax", "sigmoid", "hard_swish", @@ -140,6 +147,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_sub", "elementwise_mul", "elementwise_div", + "elementwise_pow", "dropout", "prelu", "conv2d_transpose", @@ -161,6 +169,8 @@ struct SimpleOpTypeSetTeller : public Teller { "gather", "gather_nd", "yolo_box", + "yolo_box_head", + "arg_max", "roi_align", "affine_channel", "nearest_interp", @@ -200,7 +210,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, for (auto& teller : tellers_) { if (op_type == 
"relu" || op_type == "relu6" || op_type == "tanh" || - op_type == "sigmoid") { + op_type == "sigmoid" || op_type == "exp" || op_type == "log") { auto* block = desc.Block(); if (block == nullptr) { VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " @@ -630,6 +640,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!has_attrs) return false; } + if (op_type == "yolo_box_head") { + if (with_dynamic_shape) return false; + bool has_attrs = desc.HasAttr("class_num") && desc.HasAttr("anchors"); + if (!has_attrs) return false; + } + + if (op_type == "arg_max") { + if (with_dynamic_shape) return false; + int axis = desc.HasAttr("axis") + ? BOOST_GET_CONST(int64_t, desc.GetAttr("axis")) + : -1; + bool flatten = BOOST_GET_CONST(bool, desc.GetAttr("flatten")); + int dtype = BOOST_GET_CONST(int, desc.GetAttr("dtype")); + if (axis == 0 || flatten || dtype != 2) return false; + } + if (op_type == "affine_channel") { if (!desc.HasAttr("data_layout")) return false; auto data_layout = framework::StringToDataLayout( @@ -941,9 +967,11 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } if (op_type == "strided_slice") { - if (!with_dynamic_shape) { - return false; - } +#if !IS_TRT_VERSION_GE(7000) + VLOG(3) + << "strided_slice converter does not support trt versions below 7.0"; + return false; +#endif if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || !desc.HasAttr("ends") || !desc.HasAttr("strides")) { VLOG(3) @@ -1009,7 +1037,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } if (op_type == "elementwise_add" || op_type == "elementwise_mul" || - op_type == "elementwise_sub" || op_type == "elementwise_div") { + op_type == "elementwise_sub" || op_type == "elementwise_div" || + op_type == "elementwise_pow") { if (desc.Input("X").size() != 1) { VLOG(3) << "The input op's Input(\"X\").size() " "should equal to 1, but received Input(\"X\").size() = " @@ -1039,32 +1068,15 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); const auto x_shape = x_var_desc->GetShape(); const auto y_shape = y_var_desc->GetShape(); - if (op_type == "elementwise_add" && y_var_desc->Persistable()) { - if (y_shape.size() != 1) { - return false; - } - if (y_shape[0] != x_shape[1]) { - return false; - } - } if (x_shape.size() == 1 && y_shape.size() == 1) { VLOG(3) << "Now trt may not support two 1d tensor elementwise op."; return false; } - if (op_type == "elementwise_add" || op_type == "elementwise_mul") { - if (x_var_desc->Persistable()) { - VLOG(3) << "Input X is a parameter which is not supported for " - "elementwise_add/elementwise_mul in tensorrt, swap x and " - "y will work"; - return false; - } - } - if (op_type == "elementwise_sub" || op_type == "elementwise_div") { - if (x_var_desc->Persistable() || y_var_desc->Persistable()) { - VLOG(3) << "Input X or Input Y is a parameter which is not supported " - "for elementwise_sub/elementwise_div in tensorrt"; - return false; - } + if (x_var_desc->Persistable()) { + VLOG(3) << "Input X is a parameter which is not supported for " + "elementwise_add/elementwise_mul in tensorrt, swap x and " + "y will work"; + return false; } } diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index be6984d0f76b5..ff6a1cd60f720 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ 
b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -7,6 +7,7 @@ nv_library(tensorrt_plugin hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu anchor_generator_op_plugin.cu yolo_box_op_plugin.cu + yolo_box_head_op_plugin.cu roi_align_op_plugin.cu gather_nd_op_plugin.cu mish_op_plugin.cu diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index c9163e62a2e19..1070a88cee737 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -35,6 +35,19 @@ template struct Div { __device__ T operator()(const T &a, const T &b) const { return a / b; } }; + +template +struct Sub { + __device__ T operator()(const T &a, const T &b) const { return a - b; } +}; + +template +struct Pow { + __device__ T operator()(const T &a, const T &b) const { + return static_cast(::powf(static_cast(a), static_cast(b))); + } +}; + } // namespace details template @@ -139,6 +152,14 @@ int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs, elementwise_kernel<<>>( num, x, y, out, prev_size_, batch_size * midd_size_, post_size_, details::Div()); + } else if (type_ == "sub") { + elementwise_kernel<<>>( + num, x, y, out, prev_size_, batch_size * midd_size_, post_size_, + details::Sub()); + } else if (type_ == "pow") { + elementwise_kernel<<>>( + num, x, y, out, prev_size_, batch_size * midd_size_, post_size_, + details::Pow()); } else { PADDLE_THROW(platform::errors::Fatal( "The %s type elementwise is not implemented in trt plugin.", type_)); @@ -254,12 +275,18 @@ int ElementwisePluginDynamic::enqueue( } else if (type_ == "div") { elementwise_kernel<<>>( num, x, y, out, prev_size, midd_size, post_size, details::Div()); + } else if (type_ == "sub") { + elementwise_kernel<<>>( + num, x, y, out, prev_size, midd_size, post_size, details::Sub()); + } else if (type_ == "pow") { + elementwise_kernel<<>>( + num, x, y, out, prev_size, midd_size, post_size, details::Pow()); } else { - PADDLE_THROW( - platform::errors::Unimplemented("Paddle-TRT only support elementwise " - "operation: {add, mul, div} currently, " - "but got %s.", - type_)); + PADDLE_THROW(platform::errors::Unimplemented( + "Paddle-TRT only support elementwise " + "operation: {add, mul, div, sub, pow} currently, " + "but got %s.", + type_)); } return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 9720719fd0bca..2c2fad74b9a2d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -24,6 +24,16 @@ namespace tensorrt { namespace plugin { int SwishPlugin::initialize() TRT_NOEXCEPT { return 0; } +void SwishPlugin::terminate() TRT_NOEXCEPT {} + +bool SwishPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { + if (with_fp16_) { + return type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF; + } + return type == nvinfer1::DataType::kFLOAT; +} nvinfer1::Dims SwishPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, @@ -85,17 +95,29 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT { #endif - // input dims is CHW. 
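// Descriptive note for the hunk below: enqueue() now dispatches on
// getDataType(), launching a float variant of swish_kernel for kFLOAT and a
// half variant for kHALF, while the new supportsFormat() only advertises
// kHALF when with_fp16_ is enabled.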
const auto &input_dims = this->getInputDims(0); - const float *input = reinterpret_cast(inputs[0]); - float *output = reinterpret_cast(outputs)[0]; int num = batch_size; for (int i = 0; i < input_dims.nbDims; i++) { num *= input_dims.d[i]; } int threads = 1024; int blocks = (num + threads - 1) / threads; - swish_kernel<<>>(num, input, output, beta_); + auto type = getDataType(); + if (type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Swish-->fp32"; + const float *input = reinterpret_cast(inputs[0]); + float *output = reinterpret_cast(outputs)[0]; + swish_kernel<<>>(num, input, output, beta_); + } else if (type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. Swish-->fp16"; + const half *input = reinterpret_cast(inputs[0]); + half *output = reinterpret_cast(outputs)[0]; + swish_kernel<<>>(num, input, output, + (half)beta_); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Swish TRT Plugin's input type should be float or half.")); + } return cudaGetLastError() != cudaSuccess; } @@ -140,12 +162,15 @@ bool SwishPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { if (with_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + bool res = (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF); +// encounter trt crash bug +#if IS_TRT_VERSION_LT(8000) + res = res && (in.format == nvinfer1::TensorFormat::kLINEAR); +#endif + return res; } else { - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + return in.type == nvinfer1::DataType::kFLOAT; } } const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index c4bdc5f921509..aa8fdce23fa89 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -26,7 +26,7 @@ namespace inference { namespace tensorrt { namespace plugin { -class SwishPlugin : public PluginTensorRT { +class SwishPlugin : public PluginTensorRTV2Ext { private: float beta_; @@ -55,13 +55,24 @@ class SwishPlugin : public PluginTensorRT { int initialize() TRT_NOEXCEPT override; - SwishPlugin* clone() const TRT_NOEXCEPT override { - return new SwishPlugin(beta_, with_fp16_); + nvinfer1::IPluginV2Ext* clone() const TRT_NOEXCEPT override { + auto* plugin = new SwishPlugin(beta_, with_fp16_); + plugin->data_format_ = data_format_; + plugin->data_type_ = data_type_; + plugin->input_dims_ = input_dims_; + return plugin; } const char* getPluginType() const TRT_NOEXCEPT override { return "swish_plugin"; } + + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override { + return input_types[0]; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; @@ -71,6 +82,12 @@ class SwishPlugin : public PluginTensorRT { int enqueue(int batchSize, const void* const* inputs, void* const* outputs, #endif void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + + void terminate() TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override { delete this; } + const char* getPluginVersion() const 
TRT_NOEXCEPT override { return "2"; } + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; }; class SwishPluginCreator : public TensorRTPluginCreator { @@ -79,7 +96,7 @@ class SwishPluginCreator : public TensorRTPluginCreator { return "swish_plugin"; } - const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "2"; } nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serial_data, diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.cu new file mode 100644 index 0000000000000..755bb5aa28572 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.cu @@ -0,0 +1,91 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +inline __device__ float SigmoidGPU(const float& x) { + return 1.0f / (1.0f + __expf(-x)); +} + +__global__ void YoloBoxHeadKernel(const float* input, float* output, + const int grid_size_x, const int grid_size_y, + const int class_num, const int anchors_num) { + int x_id = blockIdx.x * blockDim.x + threadIdx.x; + int y_id = blockIdx.y * blockDim.y + threadIdx.y; + int z_id = blockIdx.z * blockDim.z + threadIdx.z; + if ((x_id >= grid_size_x) || (y_id >= grid_size_y) || (z_id >= anchors_num)) { + return; + } + const int grids_num = grid_size_x * grid_size_y; + const int bbindex = y_id * grid_size_x + x_id; + + // objectness + output[bbindex + grids_num * (z_id * (5 + class_num) + 4)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]); + // x + output[bbindex + grids_num * (z_id * (5 + class_num) + 0)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 0)]); + // y + output[bbindex + grids_num * (z_id * (5 + class_num) + 1)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 1)]); + // w + output[bbindex + grids_num * (z_id * (5 + class_num) + 2)] = + __expf(input[bbindex + grids_num * (z_id * (5 + class_num) + 2)]); + // h + output[bbindex + grids_num * (z_id * (5 + class_num) + 3)] = + __expf(input[bbindex + grids_num * (z_id * (5 + class_num) + 3)]); + // Probabilities of classes + for (int i = 0; i < class_num; ++i) { + output[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))] = + SigmoidGPU( + input[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))]); + } +} + +int YoloBoxHeadPlugin::enqueue(int batch_size, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) + void** outputs, +#else + void* const* outputs, +#endif + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + const int h = input_dims_[0].d[1]; + const int w = input_dims_[0].d[2]; + const int grid_size_x = w; + const int 
grid_size_y = h; + const int anchors_num = anchors_.size() / 2; + const float* input_data = static_cast(inputs[0]); + float* output_data = static_cast(outputs[0]); + const int volume = input_dims_[0].d[0] * h * w; + dim3 block(16, 16, 4); + dim3 grid((grid_size_x / block.x) + 1, (grid_size_y / block.y) + 1, + (anchors_num / block.z) + 1); + for (int n = 0; n < batch_size; n++) { + YoloBoxHeadKernel<<>>( + input_data + n * volume, output_data + n * volume, grid_size_x, + grid_size_y, class_num_, anchors_num); + } + return 0; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h new file mode 100644 index 0000000000000..2094dbfc9db4b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_head_op_plugin.h @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class YoloBoxHeadPlugin : public PluginTensorRT { + public: + explicit YoloBoxHeadPlugin(const std::vector& anchors, + const int class_num) + : anchors_(anchors), class_num_(class_num) {} + + YoloBoxHeadPlugin(const void* data, size_t length) { + deserializeBase(data, length); + DeserializeValue(&data, &length, &anchors_); + DeserializeValue(&data, &length, &class_num_); + } + + ~YoloBoxHeadPlugin() override{}; + + nvinfer1::IPluginV2* clone() const TRT_NOEXCEPT override { + return new YoloBoxHeadPlugin(anchors_, class_num_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "yolo_box_head_plugin"; + } + + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + + int initialize() TRT_NOEXCEPT override { return 0; } + + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) TRT_NOEXCEPT override { + assert(index == 0); + assert(nb_input_dims == 1); + return inputs[0]; + } + + int enqueue(int batch_size, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) + void** outputs, +#else + void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + + size_t getSerializationSize() const TRT_NOEXCEPT override { + return getBaseSerializationSize() + SerializedSize(anchors_) + + SerializedSize(class_num_); + } + + void serialize(void* buffer) const TRT_NOEXCEPT override { + serializeBase(buffer); + SerializeValue(&buffer, anchors_); + SerializeValue(&buffer, class_num_); + } + + private: + std::vector anchors_; + int class_num_; + std::string namespace_; +}; + +class YoloBoxHeadPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return 
"yolo_box_head_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new YoloBoxHeadPlugin(serial_data, serial_length); + } +}; + +REGISTER_TRT_PLUGIN_V2(YoloBoxHeadPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc index cca8ac2634c6c..141e60513eb95 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc @@ -147,10 +147,10 @@ void SetInput(std::vector> *inputs, file.read(reinterpret_cast(&total_words_num), sizeof(int64_t)); LOG(INFO) << "Total words in file: " << total_words_num; size_t lods_beginning_offset = static_cast(file.tellg()); - auto words_begining_offset = + auto words_beginning_offset = lods_beginning_offset + sizeof(size_t) * total_sentences_num; auto targets_beginning_offset = - words_begining_offset + sizeof(int64_t) * total_words_num; + words_beginning_offset + sizeof(int64_t) * total_words_num; std::vector lod_full = ReadSentenceLod(file, lods_beginning_offset, total_sentences_num); @@ -158,7 +158,7 @@ void SetInput(std::vector> *inputs, size_t lods_sum = std::accumulate(lod_full.begin(), lod_full.end(), 0UL); EXPECT_EQ(lods_sum, static_cast(total_words_num)); - TensorReader words_reader(file, words_begining_offset, "words"); + TensorReader words_reader(file, words_beginning_offset, "words"); TensorReader targets_reader(file, targets_beginning_offset, "targets"); // If FLAGS_iterations is set to 0, run all batches diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index 331608a2cbc01..a78bc2b85d281 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -309,7 +309,7 @@ echo " " if [[ -f ${exe_dir}/test_summary.txt ]];then echo " " - echo "Summary Failed Tests ..." + echo "Summary infer_ut Failed Tests ..." 
echo "=====================test summary======================" echo "The following tests Failed: " cat ${exe_dir}/test_summary.txt diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 76bb8993cbefa..53e7993945586 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context) cc_library(stats SRCS stats.cc DEPS enforce) cc_library(memory DEPS malloc memcpy stats) +cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory) cc_test(stats_test SRCS stats_test.cc DEPS stats) if (WITH_GPU) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 003d6988671b5..d51ec52a49e69 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -33,8 +33,8 @@ namespace allocation { // Exception when `Alloc`/`AllocShared` failed struct BadAlloc : public std::exception { inline explicit BadAlloc(std::string err_msg, const char* file, int line) - : err_str_(platform::GetTraceBackString(std::move(err_msg), file, line)) { - } + : err_str_(platform::GetCompleteTraceBackString(std::move(err_msg), file, + line)) {} const char* what() const noexcept override { return err_str_.c_str(); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 35ad27f4c62b5..99152607158eb 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -931,10 +931,7 @@ class AllocatorFacadePrivate { void WrapStatAllocator() { for (auto& pair : allocators_) { - // Now memory stats is only supported for GPU - if (platform::is_gpu_place(pair.first)) { - pair.second = std::make_shared(pair.second); - } + pair.second = std::make_shared(pair.second); } } diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 71569366c2446..68209bbaabeca 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -30,16 +30,28 @@ class StatAllocator : public Allocator { protected: void FreeImpl(phi::Allocation* allocation) override { - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + if (platform::is_cpu_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + } + underlying_allocator_->Free(allocation); } phi::Allocation* AllocateImpl(size_t size) override { phi::Allocator::AllocationPtr allocation = underlying_allocator_->Allocate(size); - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - allocation->size()); + + if (platform::is_cpu_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + allocation->size()); + } return allocation.release(); } diff --git a/paddle/fluid/memory/detail/memory_block_desc.cc b/paddle/fluid/memory/detail/memory_block_desc.cc index 4414fb07a7bf3..a6580c28f447a 100644 --- a/paddle/fluid/memory/detail/memory_block_desc.cc +++ b/paddle/fluid/memory/detail/memory_block_desc.cc @@ -62,18 +62,12 @@ inline size_t hash(const MemoryBlock::Desc& metadata, size_t 
initial_seed) { } // namespace void MemoryBlock::Desc::UpdateGuards() { -#ifdef PADDLE_WITH_TESTING guard_begin = hash(*this, 1); guard_end = hash(*this, 2); -#endif } bool MemoryBlock::Desc::CheckGuards() const { -#ifdef PADDLE_WITH_TESTING return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2); -#else - return true; -#endif } } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 37ac0b4483291..06038804e6efe 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/stats.h" + #ifdef _WIN32 #include #ifndef NOMINMAX @@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { } } + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + return p; } @@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { #else free(p); #endif + + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); } bool CPUAllocator::UseGpu() const { return false; } diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc new file mode 100644 index 0000000000000..b2fc602e401ed --- /dev/null +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
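Note on the memory-stat split above: stat_allocator.h and system_allocator.cc now route CPU allocations through HOST_MEMORY_STAT_UPDATE and GPU allocations through DEVICE_MEMORY_STAT_UPDATE instead of the old unified MEMORY_STAT_UPDATE. A minimal usage sketch of the new interface declared in paddle/fluid/memory/stats.h follows; the wrapper function below is hypothetical and only illustrates the calls this patch introduces (host stats accept device id 0 only).

#include <cstdint>
#include "paddle/fluid/memory/stats.h"

// Hypothetical helper: record a host-side allocation of `size` bytes and
// read the counters back through the string-based API.
void TrackHostAllocation(int64_t size) {
  paddle::memory::HostMemoryStatUpdate("Allocated", /*dev_id=*/0, size);
  int64_t current =
      paddle::memory::HostMemoryStatCurrentValue("Allocated", /*dev_id=*/0);
  int64_t peak =
      paddle::memory::HostMemoryStatPeakValue("Allocated", /*dev_id=*/0);
  (void)current;
  (void)peak;
  // On hot paths the macro form skips the string-keyed registry lookup.
  HOST_MEMORY_STAT_UPDATE(Allocated, 0, -size);
}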
+ +#include "paddle/fluid/memory/memory.h" +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { + +TEST(stat_allocator_test, host_memory_stat_test) { + std::vector alloc_sizes{ + 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, + 6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685, + 1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395, + 4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935, + 528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694, + 221, 309, 3617, 3793, 3334, 7281, 1302}; + + int64_t max_alloc_size = 0; + for (int64_t size : alloc_sizes) { + AllocationPtr allocation = Alloc(platform::CPUPlace(), size); + int64_t alloc_size = static_cast(allocation->size()); + max_alloc_size = std::max(max_alloc_size, alloc_size); + EXPECT_EQ(HostMemoryStatCurrentValue("Allocated", 0), alloc_size); + } + EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(stat_allocator_test, device_memory_stat_test) { + std::vector alloc_sizes{ + 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, + 6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685, + 1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395, + 4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935, + 528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694, + 221, 309, 3617, 3793, 3334, 7281, 1302}; + + int64_t max_alloc_size = 0; + for (int64_t size : alloc_sizes) { + AllocationPtr allocation = Alloc(platform::CUDAPlace(), size); + int64_t alloc_size = static_cast(allocation->size()); + max_alloc_size = std::max(max_alloc_size, alloc_size); + EXPECT_EQ(DeviceMemoryStatCurrentValue("Allocated", 0), alloc_size); + } + EXPECT_EQ(DeviceMemoryStatPeakValue("Allocated", 0), max_alloc_size); +} +#endif + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc index 31d776de40702..97197b495f5fc 100644 --- a/paddle/fluid/memory/stats.cc +++ b/paddle/fluid/memory/stats.cc @@ -38,7 +38,7 @@ class StatRegistry { } std::string GetStatKey(const std::string& stat_type, int dev_id) { - return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type; + return stat_type + std::to_string(dev_id); } int64_t GetCurrentValue(const std::string& stat_type, int dev_id) { @@ -49,6 +49,10 @@ class StatRegistry { return GetStat(stat_type, dev_id)->GetPeakValue(); } + void Update(const std::string& stat_type, int dev_id, int64_t increment) { + GetStat(stat_type, dev_id)->Update(increment); + } + void Register(const std::string& stat_type, int dev_id, StatBase* stat) { std::lock_guard lock_guard(stat_map_lock_); stat_map_[GetStatKey(stat_type, dev_id)] = stat; @@ -59,10 +63,6 @@ class StatRegistry { stat_map_.erase(GetStatKey(stat_type, dev_id)); } - void Update(const std::string& stat_type, int dev_id, int64_t increment) { - stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment); - } - private: StatRegistry() = default; @@ -72,43 +72,67 @@ class StatRegistry { SpinLock stat_map_lock_; }; -int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) { - return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id); +int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue("Device" + stat_type, + dev_id); } -int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) { - return 
StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id); +int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue("Device" + stat_type, + dev_id); } -void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) { - StatRegistry::GetInstance()->Update(stat_type, dev_id, increment); +void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment) { + StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment); } -#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \ - StatRegistry::GetInstance()->Register( \ - #item, id, Stat::GetInstance()); - -#define MEMORY_STAT_REGISTER(item) \ - MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 15) +int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type, + dev_id); +} + +int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue("Host" + stat_type, dev_id); +} + +void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment) { + StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); +} + +#define DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \ + StatRegistry::GetInstance()->Register( \ + "Device" #item, id, Stat::GetInstance()); + +#define DEVICE_MEMORY_STAT_REGISTER(item) \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 15) + +#define HOST_MEMORY_STAT_REGISTER(item) \ + StatRegistry::GetInstance()->Register( \ + "Host" #item, 0, Stat::GetInstance()); int RegisterAllStats() { - MEMORY_STAT_REGISTER(Allocated); - MEMORY_STAT_REGISTER(Reserved); + DEVICE_MEMORY_STAT_REGISTER(Allocated); + DEVICE_MEMORY_STAT_REGISTER(Reserved); + + HOST_MEMORY_STAT_REGISTER(Allocated); + HOST_MEMORY_STAT_REGISTER(Reserved); return 0; } diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index 0906567dbf6c1..bb6a3cca6644c 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -80,8 +80,8 @@ class Stat : public StatBase { while 
(prev_value < current_value && !peak_value_.compare_exchange_weak(prev_value, current_value)) { } - VLOG(8) << "Update peak_value, after update, peak_value = " << peak_value_ - << " , current value = " << current_value; + VLOG(8) << "Update peak_value, after update, peak_value = " + << peak_value_.load() << " , current value = " << current_value; } } @@ -91,82 +91,113 @@ class Stat : public StatBase { std::atomic peak_value_{0}; }; -// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT -// values by a string, however, they has worse performance than the macro -// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and -// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low -// performance overhead is required. -int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id); -int64_t StatGetPeakValue(const std::string& stat_type, int dev_id); -void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); - -#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ - case id: \ - stat = paddle::memory::Stat< \ - paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \ +// xxxMemoryStatCurrentValue, xxxMemoryStatPeakValue and xxxMemoryStatUpdate +// support to operate STAT values by a string, however, they has worse +// performance than the macro function xxx_MEMORY_STAT_CURRENT_VALUE, +// xxx_MEMORY_STAT_PEAK_VALUE, and xxx_MEMORY_STAT_UPDATE. Try to use the macro +// functions where ultra-low performance overhead is required. +int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); +int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id); +void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment); + +int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id); +int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); +void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment); + +#define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ + case id: \ + stat = paddle::memory::Stat< \ + paddle::memory::DeviceMemoryStat##item##id>::GetInstance(); \ break -#define MEMORY_STAT_FUNC(item, id, func, ...) \ - [&] { \ - paddle::memory::StatBase* stat = nullptr; \ - switch (id) { \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ - default: \ - PADDLE_THROW(paddle::platform::errors::OutOfRange( \ - "Only support device id between [0, 15] in memory stats," \ - "not support device id: %d", \ - id)); \ - break; \ - } \ - return stat->func(__VA_ARGS__); \ +#define DEVICE_MEMORY_STAT_FUNC(item, id, func, ...) 
\ + [&] { \ + paddle::memory::StatBase* stat = nullptr; \ + switch (id) { \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ + default: \ + PADDLE_THROW(paddle::platform::errors::OutOfRange( \ + "Only support device id between [0, 15] for device memory stats," \ + "not support device id: %d", \ + id)); \ + break; \ + } \ + return stat->func(__VA_ARGS__); \ }() -#define MEMORY_STAT_CURRENT_VALUE(item, id) \ - MEMORY_STAT_FUNC(item, id, GetCurrentValue) -#define MEMORY_STAT_PEAK_VALUE(item, id) \ - MEMORY_STAT_FUNC(item, id, GetPeakValue) -#define MEMORY_STAT_UPDATE(item, id, increment) \ - MEMORY_STAT_FUNC(item, id, Update, increment) - -#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \ - struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {}; - -#define MEMORY_STAT_DECLARE(item) \ - MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 15) +#define DEVICE_MEMORY_STAT_CURRENT_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define DEVICE_MEMORY_STAT_PEAK_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \ + DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment) + +#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) 
\ + [&] { \ + PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \ + "Only support device id 0 for host memory " \ + "stats, not support device id: %d", \ + id)); \ + return paddle::memory::Stat< \ + paddle::memory::HostMemoryStat##item##0>::GetInstance() \ + ->func(__VA_ARGS__); \ + }() + +#define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define HOST_MEMORY_STAT_PEAK_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \ + HOST_MEMORY_STAT_FUNC(item, id, Update, increment) + +#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \ + struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {} + +#define DEVICE_MEMORY_STAT_DECLARE(item) \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 15) + +// Only support id 0 for host memory stat +#define HOST_MEMORY_STAT_DECLARE(item) \ + struct HostMemoryStat##item##0 : public ThreadLocalStatBase{}; // To add a new STAT type, declare here and register in stats.cc -MEMORY_STAT_DECLARE(Allocated); -MEMORY_STAT_DECLARE(Reserved); +DEVICE_MEMORY_STAT_DECLARE(Allocated); +DEVICE_MEMORY_STAT_DECLARE(Reserved); + +HOST_MEMORY_STAT_DECLARE(Allocated); +HOST_MEMORY_STAT_DECLARE(Reserved); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc index 436c737916d9f..bcaba8e91080f 100644 --- a/paddle/fluid/memory/stats_test.cc +++ b/paddle/fluid/memory/stats_test.cc @@ -23,50 +23,77 @@ namespace paddle { namespace memory { -TEST(stats_test, MultiThreadReadWriteTest) { - std::string stat_type = "Allocated"; - size_t thread_num = 3; - size_t data_num = 10; - - std::condition_variable cv; - std::mutex mutex; - std::vector threads; - size_t ready_thread_num = 0; - - for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back( - [&stat_type, data_num, &cv, &mutex, &ready_thread_num]() { - for (size_t data = 0; data < data_num; ++data) { - StatUpdate(stat_type, 0, data); - } - /* lock guard*/ { - std::lock_guard lock_guard{mutex}; - ++ready_thread_num; - cv.notify_one(); - } - // Sleep here to not exit before the main thread checking stat - // results, because the thread-local stat data will be destroyed when - // the thread exit - std::this_thread::sleep_for(std::chrono::seconds(1)); - }); +class StatsTest : public ::testing::Test { + protected: + void SetStatType(const std::string& stat_type) { stat_type_ = stat_type; } + + void SetFunc( + std::function update_func, + std::function current_value_func, + std::function peak_value_func) { + update_func_ = update_func; + current_value_func_ = current_value_func; + peak_value_func_ = peak_value_func; + } + + void RunTests() { + MultiThreadReadWriteTest(); + PeakValueTest(); } - 
std::unique_lock unique_lock(mutex); - cv.wait(unique_lock, [&ready_thread_num, thread_num]() { - return ready_thread_num == thread_num; - }); + private: + void MultiThreadReadWriteTest() { + size_t thread_num = 3; + size_t data_num = 10; + + std::condition_variable cv; + std::mutex mutex; + std::vector threads; + size_t ready_thread_num = 0; + + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&]() { + for (size_t data = 0; data < data_num; ++data) { + update_func_(stat_type_, 0, data); + } + /* lock guard*/ { + std::lock_guard lock_guard{mutex}; + ++ready_thread_num; + cv.notify_one(); + } + // Sleep here to not exit before the main thread checking stat + // results, because the thread-local stat data will be destroyed when + // the thread exit + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); + } - EXPECT_EQ(StatGetCurrentValue(stat_type, 0), - int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + std::unique_lock unique_lock(mutex); + cv.wait(unique_lock, [&ready_thread_num, thread_num]() { + return ready_thread_num == thread_num; + }); - for (size_t i = 0; i < thread_num; ++i) { - threads[i].join(); + EXPECT_EQ(current_value_func_(stat_type_, 0), + int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + + for (size_t i = 0; i < thread_num; ++i) { + threads[i].join(); + } + } + + void PeakValueTest() { + int64_t peak_value = ((int64_t)1) << 63; + int64_t sum = 0; + for (int64_t data : datas_) { + update_func_(stat_type_, 0, data); + sum += data; + peak_value = std::max(peak_value, sum); + } + EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value); } -} -TEST(stats_test, PeakValueTest) { - std::string stat_type = "Allocated"; - std::vector datas = { + std::string stat_type_; + std::vector datas_{ 543149808935355, 634698327471328, 706215795436611, 577939367795333, 419479490054362, 21975227714595, 812939817942250, 984428837942082, 537304104446806, 685008544452453, 563352858161268, 690143831596330, @@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) { 746465732805300, -74049761897414, -65640372433924, 852009039806484, 305079802044257, -48409757869238, 266031781660228, 327287322379820}; - int64_t peak_value = ((int64_t)1) << 63; - int64_t sum = 0; - for (int64_t data : datas) { - StatUpdate(stat_type, 0, data); - sum += data; - peak_value = std::max(peak_value, sum); - } - EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value); + std::function update_func_; + std::function current_value_func_; + std::function peak_value_func_; +}; + +TEST_F(StatsTest, DeviceAllocatedTest) { + SetStatType("Allocated"); + SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue, + DeviceMemoryStatPeakValue); + RunTests(); +} + +TEST_F(StatsTest, DeviceReservedMacroTest) { + SetStatType("Reserved"); + SetFunc( + [](const std::string stat_type, int id, int64_t increment) { + return DEVICE_MEMORY_STAT_UPDATE(Reserved, id, increment); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, id); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id); + }); + RunTests(); +} + +TEST_F(StatsTest, HostAllocatedMacroTest) { + SetStatType("Allocated"); + SetFunc( + [](const std::string stat_type, int id, int64_t increment) { + return HOST_MEMORY_STAT_UPDATE(Allocated, id, increment); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, id); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id); + 
}); + RunTests(); +} + +TEST_F(StatsTest, HostReservedTest) { + SetStatType("Reserved"); + SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue, + HostMemoryStatPeakValue); + RunTests(); } } // namespace memory diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 6be872b028ca0..6905f3d79546e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1659,15 +1659,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(celu, CELU, CELUFunctor, CELUGradFunctor); -REGISTER_OP_CPU_KERNEL( - celu_grad_grad, ops::CELUDoubleGradKernel>, - ops::CELUDoubleGradKernel>, - ops::CELUDoubleGradKernel>); - /* ========================================================================== */ /* =========================== sqrt register ============================= */ @@ -1687,13 +1678,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - sqrt_grad_grad, ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>); /* ========================================================================== */ /* =========================== rsqrt register ============================= @@ -1714,14 +1698,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - rsqrt_grad_grad, - ops::RsqrtDoubleGradKernel>, - ops::RsqrtDoubleGradKernel>, - ops::RsqrtDoubleGradKernel>); /* ========================================================================== */ /* ========================== square register ============================ */ @@ -1742,18 +1718,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - square_grad_grad, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>); /* ========================================================================== */ /* ========================== pow register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 5448ed2a4bdad..5f3916a65e792 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -296,9 +296,14 @@ USE_PHI_FUNCTOR(Mish) USE_PHI_FUNCTOR(STanh) USE_PHI_FUNCTOR(Reciprocal) USE_PHI_FUNCTOR(Square) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Square) USE_PHI_FUNCTOR(Sqrt) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Sqrt) USE_PHI_FUNCTOR(Rsqrt) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Rsqrt) USE_PHI_FUNCTOR(Softplus) +USE_PHI_FUNCTOR(CELU) +USE_PHI_DOUBLE_GRAD_FUNCTOR(CELU) template using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; @@ -331,68 +336,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -template -struct SqrtGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, framework::Tensor* ddOut, - framework::Tensor* dOut, const framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SqrtGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, 
"Output", "Out", "SqrtGradGrad")); - // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx - // calculate dy first, so ddy can inplace ddx - if (dOut) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "SqrtGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "SqrtGradGrad")); - dout.device(*d) = dx * ddx * static_cast(-1) / out; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SqrtGradGrad")); - ddout.device(*d) = ddx * static_cast(0.5) / out; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct RsqrtGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, framework::Tensor* ddOut, - framework::Tensor* dOut, const framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "RsqrtGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "RsqrtGradGrad")); - - // rsqrt GradGrad: ddy = -0.5 * ddx * y * y * y, dy = (3/y) * dx * ddx - if (dOut) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "RsqrtGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "RsqrtGradGrad")); - dout.device(*d) = (static_cast(3.0) / out) * dx * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "RsqrtGradGrad")); - ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - // relu6(x) = min(max(0, x), 6) template struct Relu6Functor : public BaseActivationFunctor { @@ -498,51 +441,6 @@ class ELUGradKernel : public framework::OpKernel { } }; -template -struct CELUFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - (x < static_cast(0)) - .select(static_cast(alpha) * - ((x / static_cast(alpha)).exp() - static_cast(1)), - x); - } -}; - -template -struct CELUGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp_a_pos = static_cast(alpha > 0); - auto temp_a_neg = static_cast(alpha <= 0); - auto temp_x_pos = (x > static_cast(0)).template cast(); - auto temp_x_neg = (x <= static_cast(0)).template cast(); - - // dx = dout, if alpha > 0 and x > 0 - // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 - // dx = dout , if alpha < 0 and x > 0 - // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 - dx.device(d) = - dout * temp_a_pos * temp_x_pos + - dout * (x / static_cast(alpha)).exp() * temp_a_pos * temp_x_neg + - dout * temp_a_neg * temp_x_pos + - dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct AbsGradGradFunctor : public BaseActivationFunctor { template @@ -564,77 +462,9 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { static constexpr 
ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CELUGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* ddX, framework::Tensor* ddOut, - const framework::Tensor* dOut, framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "CELUGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "CELUGradGrad")); - - if (dX) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "CELUGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "CELUGradGrad")); - dx.device(*d) = ddx * dout / static_cast(alpha) * - (x / static_cast(alpha)).exp() * - (x <= static_cast(0)).template cast(); - } - - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "CELUGradGrad")); - ddout.device(*d) = ddx * - ((x > static_cast(0)).template cast() + - (x / static_cast(alpha)).exp() * - (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct SquareGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* ddX, framework::Tensor* ddOut, - const framework::Tensor* dOut, framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SquareGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "SquareGradGrad")); - // square GradGrad: ddy=2x*ddx, dx=2dy*ddx - // calculate dx first, so ddy can inplace ddx - if (dX) { - auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "SquareGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "SquareGradGrad")); - dx.device(*d) = ddx * static_cast(2) * dout; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); - ddout.device(*d) = ddx * static_cast(2) * x; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need // DOut(dy) as input(not output), tensor extraction is different from -// others. Impliment extraction kernel seperately here. +// others. Impliment extraction kernel separately here. 
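The grad-grad functors deleted above (Sqrt, Rsqrt, CELU, Square) are now pulled in from the phi functor library via USE_PHI_DOUBLE_GRAD_FUNCTOR. For reference, the per-element identities they implemented, taken verbatim from the removed comments, are sketched below in plain scalar form; the struct and helper names are illustrative only, not part of the patch.

struct DoubleGradOut {
  double ddout;  // second-order output gradient (DDOut)
  double dres;   // extra gradient written back (DOut or DX, depending on the op)
};

// sqrt:   ddy = 0.5 * ddx / y,        dy = -1 * dx * ddx / y
DoubleGradOut SqrtGradGrad(double out, double dx, double ddx) {
  return {0.5 * ddx / out, -1.0 * dx * ddx / out};
}

// rsqrt:  ddy = -0.5 * ddx * y^3,     dy = (3 / y) * dx * ddx
DoubleGradOut RsqrtGradGrad(double out, double dx, double ddx) {
  return {-0.5 * ddx * out * out * out, (3.0 / out) * dx * ddx};
}

// square: ddy = 2 * x * ddx,          dx = 2 * dout * ddx
DoubleGradOut SquareGradGrad(double x, double dout, double ddx) {
  return {2.0 * x * ddx, 2.0 * dout * ddx};
}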
inline void ExtractDoubleGradTensorWithInputDOut( const framework::ExecutionContext& ctx, const framework::Tensor** X, const framework::Tensor** ddX, framework::Tensor** dX, @@ -675,29 +505,6 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } -template -class SquareDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *ddX, *dOut; - X = ddX = dOut = nullptr; - framework::Tensor *dX, *ddOut; - dX = ddOut = nullptr; - - ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut); - - if (dX) dX->mutable_data(X->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - functor(place, X, ddX, ddOut, dOut, dX); - } -}; - template struct SoftsignFunctor : public BaseActivationFunctor { template @@ -721,153 +528,6 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -class CELUDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *ddX, *dOut; - X = ddX = dOut = nullptr; - framework::Tensor *dX, *ddOut; - dX = ddOut = nullptr; - - ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut); - - if (dX) dX->mutable_data(X->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - functor(place, X, ddX, ddOut, dOut, dX); - } -}; - -template -class SqrtDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *dX, *ddX; - Out = dX = ddX = nullptr; - framework::Tensor *ddOut, *dOut; - ddOut = dOut = nullptr; - - // extract ddx(input), ddout(output) - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable DDX, variable name = %s", - ctx.InputName("DDX"))); - ddX = ctx.Input("DDX"); - if (ddo_var) { - ddOut = ctx.Output("DDOut"); - } - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable DDX, variable name = %s", - ctx.InputName("DDX"))); - - // extract out(input), dout(output) - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - Out = ctx.Input("Out"); - if (dout_var) { - dOut = ctx.Output("DOut"); - } - - // extract dx(input) - auto dx_var = ctx.InputVar("DX"); - PADDLE_ENFORCE_NOT_NULL( - dx_var, platform::errors::NotFound( - "Cannot get input Variable DX, variable name = %s", - ctx.InputName("DX"))); - if (dx_var) { - dX = ctx.Input("DX"); - } - - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - functor(place, Out, ddX, ddOut, dOut, dX); - } -}; - -// rsqrt Grad: dx = -0.5 * dy * y 
* y * y -// rsqrt GradGrad: ddy = -0.5 * ddx * y * y * y, dy = (3 / y) * dx * ddx -template -class RsqrtDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *dX, *ddX; - Out = dX = ddX = nullptr; - framework::Tensor *ddOut, *dOut; - ddOut = dOut = nullptr; - - // extract ddx(input), ddout(output) - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable DDX, variable name = %s", - ctx.InputName("DDX"))); - ddX = ctx.Input("DDX"); - if (ddo_var) { - ddOut = ctx.Output("DDOut"); - } - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable DDX, variable name = %s", - ctx.InputName("DDX"))); - - // extract out(input), dout(output) - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - Out = ctx.Input("Out"); - if (dout_var) { - dOut = ctx.Output("DOut"); - } - - // extract dx(input) - auto dx_var = ctx.InputVar("DX"); - PADDLE_ENFORCE_NOT_NULL( - dx_var, platform::errors::NotFound( - "Cannot get input Variable DX, variable name = %s", - ctx.InputName("DX"))); - if (dx_var) { - dX = ctx.Input("DX"); - } - - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - functor(place, Out, ddX, ddOut, dOut, dX); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index e33351520e6dd..7298a05827889 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -126,59 +126,6 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaCELUFunctor : public BaseActivationFunctor { - using CT = typename details::MPTypeTrait::Type; - CT zero = static_cast(0.0f); - CT one = static_cast(1.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1)) - __device__ __forceinline__ T operator()(const T arg_x) const { - CT x = static_cast(arg_x); - CT temp = static_cast(alpha) * (exp(x / static_cast(alpha)) - one); - CT res = (x > zero ? x : zero) + (temp > zero ? 
zero : temp); - return static_cast(res); - } -}; - -template -struct CudaCELUGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - MPType one = static_cast(1.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // dx = dout, if alpha > 0 and x > 0 - // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 - // dx = dout , if alpha < 0 and x > 0 - // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType a = static_cast(alpha); - MPType temp_a_pos = static_cast(alpha > 0.0f); - MPType temp_a_neg = static_cast(alpha <= 0.0f); - MPType temp_x_pos = static_cast(x > zero); - MPType temp_x_neg = static_cast(x <= zero); - return static_cast( - dout * - (temp_a_pos * temp_x_pos + temp_a_pos * temp_x_neg * exp(x / a) + - temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template class ActivationCudaKernel : public framework::OpKernel { @@ -357,79 +304,35 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ========================================================================== */ - -/* ======================== celu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(celu, CELU, CudaCELUFunctor, - CudaCELUGradFunctor); - REGISTER_OP_CUDA_KERNEL( - celu_grad_grad, ops::CELUDoubleGradKernel>, - ops::CELUDoubleGradKernel>, - ops::CELUDoubleGradKernel>); -/* ========================================================================== */ - -/* =========================== sqrt register ============================= */ - -REGISTER_OP_CUDA_KERNEL( - sqrt_grad_grad, - ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>); -/* ========================================================================== */ - -/* =========================== rsqrt register ============================= - */ - + relu6, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( - rsqrt_grad_grad, - ops::RsqrtDoubleGradKernel>, - ops::RsqrtDoubleGradKernel>, - ops::RsqrtDoubleGradKernel>); -/* ========================================================================== */ - -/* =========================== square register ============================ */ - -REGISTER_OP_CUDA_KERNEL( - square_grad_grad, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>); -/* ========================================================================== */ - -/* ========================== logit register ============================ */ -namespace ops = paddle::operators; -/* ========================================================================== */ - -/* ========================== exp register ============================ */ -/* ========================================================================== */ - -/* ========================== expm1 register ============================ */ -/* ========================================================================== */ 
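The CudaCELUFunctor / CudaCELUGradFunctor removed from activation_op.kps implemented the CELU math quoted in their comments; the KP registrations later in this diff reference phi::funcs::CudaCELUGradFunctor instead. A scalar sketch of that math, for illustration only (function names are not part of the patch):

#include <algorithm>
#include <cmath>

// celu(x) = max(0, x) + min(0, alpha * (exp(x / alpha) - 1))
float CeluForward(float x, float alpha) {
  const float tmp = alpha * (std::exp(x / alpha) - 1.0f);
  return std::max(0.0f, x) + std::min(0.0f, tmp);
}

// Backward rule from the removed grad functor: dx = dout for x > 0 and
// dx = dout * exp(x / alpha) for x <= 0, regardless of the sign of alpha.
float CeluBackward(float x, float alpha, float dout) {
  return x > 0.0f ? dout : dout * std::exp(x / alpha);
}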
+ relu6_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ - __macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor); \ __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) @@ -452,13 +355,14 @@ REGISTER_OP_KERNEL( ops::ActivationGradCudaKernel>); -REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, - ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); REGISTER_OP_KERNEL( celu_grad, KP, plat::XPUPlace, ops::ActivationGradCudaKernel>); + phi::funcs::CudaCELUGradFunctor>); REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, ops::ActivationCudaKernel("align_corners", "(bool, default false) Whether to align the corners of input" - "and ouput.") + "and output.") .SetDefault(true); AddAttr>( "output_shape", diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 5808841333f08..f9a93a47ff2be 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -161,8 +161,8 @@ class LazyZerosNPU { } auto place = dev_ctx.GetPlace(); auto stream = dev_ctx.stream(); - Tensor* zero_tensor; - void* zero_ptr; + Tensor* zero_tensor = nullptr; + void* zero_ptr = nullptr; if (found_inf_vec[0]) { int max_num = -1; for (size_t i = 0; i < xs.size(); ++i) { diff --git a/paddle/fluid/operators/assign_op_mlu.cc b/paddle/fluid/operators/assign_op_mlu.cc new file mode 100644 index 0000000000000..85092c516955d --- /dev/null +++ b/paddle/fluid/operators/assign_op_mlu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { +template +class AssignMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Assign(ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), + GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(assign, ops::AssignMLUKernel, + ops::AssignMLUKernel, + ops::AssignMLUKernel, + ops::AssignMLUKernel) diff --git a/paddle/fluid/operators/assign_value_op_mlu.cc b/paddle/fluid/operators/assign_value_op_mlu.cc new file mode 100644 index 0000000000000..651e129ccb17a --- /dev/null +++ b/paddle/fluid/operators/assign_value_op_mlu.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel, + ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 36a0d53e05245..2663a08101157 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -64,7 +64,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { (x_dims[i] == -1) || (x_dims[i] > 0), true, platform::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. Input's shape is [%s].", + "positive number, but received %d. 
Input's shape is [%s].", x_dims[i], x_dims)); } diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index bc6cf9d831ff0..76e0f23df2168 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -156,7 +156,7 @@ REGISTER_OP_CPU_KERNEL( ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, - ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel>, diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index 5d006a947be19..0a9b66bc92c15 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -136,7 +136,7 @@ class CinnLaunchOpMaker : public framework::OpProtoAndCheckerMaker { "(vector)" "which are the output of graph inside the CinnLaunchOp.") .AsDuplicable(); - AddAttr( + AddAttr( kCompilationKey, "(string)" "a hash key used to get the graph object or its computation result."); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 024bf2bceb3d0..f40b788dfb5b3 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -27,6 +28,7 @@ #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/profiler.h" DECLARE_bool(enable_pe_launch_cinn); namespace paddle { @@ -60,13 +62,14 @@ class CinnLaunchOpKernel : public framework::OpKernel { const auto& scope = ctx.scope(); const auto& place = ctx.GetPlace(); void* stream = details::GetStream(ctx); + platform::RecordEvent record_event_1( + "Step 1. Find graph object and prepare input"); // Step 1. Find graph object and prepare input PADDLE_ENFORCE_EQ(ctx.HasAttr(kCompilationKey), true, platform::errors::NotFound( "No Attribute(%s) found for CinnLaunchOp operator.", kCompilationKey)); - const auto& compilation_key = - ctx.template Attr(kCompilationKey); + const auto& compilation_key = ctx.template Attr(kCompilationKey); VLOG(4) << "CinnLaunchOp attribute(" << kCompilationKey << ") " << "value:\n" << CinnCompiler::GetInstance()->ReadableKey(compilation_key); @@ -99,24 +102,44 @@ class CinnLaunchOpKernel : public framework::OpKernel { input_no_need_buffer_tensors); } + platform::RecordEvent record_event_2( + "Step 2. Get compilation result of the graph"); // Step 2. Get compilation result of the graph auto target = details::PlaceToCinnTarget(place); + using ClockType = std::chrono::steady_clock; + std::chrono::time_point start_t, end_t; + if (VLOG_IS_ON(1)) { + VLOG(1) << "Starts to compile at thread " << std::this_thread::get_id(); + start_t = ClockType::now(); + } const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( compilation_key, inputs_name2tensor, target, stream); + if (VLOG_IS_ON(1)) { + end_t = ClockType::now(); + auto time_sec = std::chrono::duration_cast( + end_t - start_t); + VLOG(1) << "Ends to compile at thread " << std::this_thread::get_id() + << " , time cost : " << time_sec.count() << " ms"; + } details::DebugCinnCompiledResult(cinn_compiled_object); auto* launch_context = cinn_compiled_object.launch_context.get(); + platform::RecordEvent record_event_3("Step 3. Set CINN runtime FLAGS."); // Step 3. 
Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); // Step 4. Execute the compiled CINN instructions by a PE or // by the CINN compiled program in sequential order if (FLAGS_enable_pe_launch_cinn) { + platform::RecordEvent record_event_4( + "Step 4. Execute the runtime graph by PE."); VLOG(4) << "Execute the runtime graph by PE"; framework::Scope& exec_scope = scope.NewScope(); auto* pe = launch_context->InitializePE(place, &exec_scope); pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); } else { + platform::RecordEvent record_event_4( + "Step 4. Execute the compiled executable program."); VLOG(4) << "Execute the compiled executable program"; launch_context->UpdateCapturedEnv(scope, place); LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index 1064c77cc0041..a23cf2815d8fe 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -416,14 +416,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { 1) * vec_size; int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && (!fix_seed)) { + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); + if (!fix_seed) { auto seed_offset = gen_cuda->IncrementOffset(offset); seed_data = seed_offset.first; increment = seed_offset.second; } else { - std::random_device rnd; - seed_data = fix_seed ? seed + rank : rnd(); + seed_data = seed + rank; increment = offset; } RandomSampleClassCenter<< set_constant; set_constant(dev_ctx_, tensor_, static_cast(value_)); } +#elif defined(PADDLE_WITH_MLU) + if (platform::is_mlu_place(context_.GetPlace())) { + FillMLUTensorWithHostValue(context_, static_cast(value_), tensor_); + } else { + phi::funcs::SetConstant set_constant; + set_constant(dev_ctx_, tensor_, static_cast(value_)); + } #else phi::funcs::SetConstant set_constant; set_constant(dev_ctx_, tensor_, static_cast(value_)); @@ -509,6 +519,15 @@ REGISTER_OP_NPU_KERNEL( ops::CoalesceTensorOpKernel); #endif +#if defined(PADDLE_WITH_MLU) +REGISTER_OP_MLU_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); +#endif + REGISTER_OP_VERSION(coalesce_tensor) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 4bed282ace8d1..eeae16a0d71f3 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -77,7 +77,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. 
received " << phi::product(out->dims()); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index d1e269fb5a4fe..8f07480aaab14 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -62,7 +62,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << phi::product(out->dims()); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc index 31961d8a246a9..a065e49ff72be 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -59,7 +59,7 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel { PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << phi::product(out->dims()); dev_ctx->Wait(); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 088366dbc8f69..6ad22ff8b19eb 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -11,27 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h" namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override {} - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(framework::proto::VarType::FP32, - ctx.GetPlace()); - } -}; - class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { @@ -45,53 +29,6 @@ Call calculation stream synchronization. 
} }; -template -class CSyncCalcStreamKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - - auto place = ctx.GetPlace(); - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - - platform::GpuStreamSync(dev_ctx->stream()); - -#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on npu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - platform::NPUStreamSync(dev_ctx->stream()); - -#elif defined(PADDLE_WITH_CNCL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on mlu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - platform::MLUStreamSync(dev_ctx->stream()); -#elif defined(PADDLE_WITH_XPU_BKCL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on xpu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - dev_ctx->Wait(); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif - } -}; - } // namespace operators } // namespace paddle @@ -105,5 +42,3 @@ REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); REGISTER_OP_MLU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); - -REGISTER_OP_XPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h new file mode 100644 index 0000000000000..b07367f801fa3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CSyncCalcStreamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +template +class CSyncCalcStreamKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + + platform::GpuStreamSync(dev_ctx->stream()); + +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + platform::NPUStreamSync(dev_ctx->stream()); + +#elif defined(PADDLE_WITH_CNCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on mlu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + platform::MLUStreamSync(dev_ctx->stream()); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on xpu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + dev_ctx->Wait(); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/phi/core/storage.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc similarity index 65% rename from paddle/phi/core/storage.cc rename to paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc index 0ddf5084464cc..04a83ea64f076 100644 --- a/paddle/phi/core/storage.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,14 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/core/storage.h" +#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h" -namespace phi { +namespace ops = paddle::operators; +namespace plat = paddle::platform; -void TensorStorage::Realloc(size_t size) { - this->Clear(); - data_ = alloc_->Allocate(size); - size_ = size; -} - -} // namespace phi +REGISTER_OP_XPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel) diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 3e8fa631507ab..494665544f0d3 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -44,15 +44,21 @@ class RecvOpV2 : public framework::OperatorWithKernel { "The size of the output shape must be greater than 0 " "but the value given is %d.", out_shape.size())); - for (size_t i = 0; i < out_shape.size(); ++i) { - PADDLE_ENFORCE_GE(out_shape[i], 1, - platform::errors::InvalidArgument( - "The shape attribute for recv_v2 must be set " - "explicitly, but the %dth element is %d which " - "is less than 1.", - i, out_shape[i])); + bool dynamic_shape = ctx->Attrs().Get("dynamic_shape"); + if (!dynamic_shape) { + // No need to check out shape if with dynamic_shape, + // since the shape will be recv from send_v2 + for (size_t i = 0; i < out_shape.size(); ++i) { + PADDLE_ENFORCE_GE(out_shape[i], 1, + platform::errors::InvalidArgument( + "The shape attribute for recv_v2 must be set " + "explicitly, but the %dth element is %d which " + "is less than 1. Or dynamic_shape should be " + "set to True for both send_v2 and recv_v2.", + i, out_shape[i])); + } + ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); } - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); } } @@ -87,6 +93,10 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "dynamic_shape", + "(bool default false) the send/recv will be done with dynamic shape.") + .SetDefault(false); AddComment(R"DOC( Recv Operator diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 96b27a833fba3..f7a2e198db938 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -25,6 +25,85 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 +framework::DDim recv_shape_info(const platform::Place &place, + const gpuStream_t &stream, + platform::NCCLComm *comm, const int &peer, + distributed::ProcessGroup *group) { + if (!group) { + PADDLE_ENFORCE_EQ((stream != nullptr && comm != nullptr), true, + platform::errors::InvalidArgument( + "NCCLComm and Stream should be provided if use NCCL " + "to send the shape info.")); + } + + paddle::experimental::DataType shape_dytpe = + paddle::experimental::DataType::INT32; + ncclDataType_t nccl_dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dytpe)); + + // step1: recv the shape size + framework::Tensor gpu_shape_size_tensor(shape_dytpe); + if (!group) { + gpu_shape_size_tensor.Resize({1}); + gpu_shape_size_tensor.mutable_data(place, shape_dytpe); + auto *gpu_data = gpu_shape_size_tensor.data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + gpu_data, 1, nccl_dtype, peer, comm->comm(), stream)); + } + + // copy the shape size tensor to cpu + framework::Tensor *cpu_shape_size_tensor = new framework::Tensor(shape_dytpe); + cpu_shape_size_tensor->Resize({1}); + cpu_shape_size_tensor->mutable_data(platform::CPUPlace(), shape_dytpe); + if (group) { + std::vector shape_size_tensor; + shape_size_tensor.emplace_back(*cpu_shape_size_tensor); + auto shape_size_task = group->Recv(shape_size_tensor, peer); + } else { + framework::TensorCopySync(gpu_shape_size_tensor, platform::CPUPlace(), + cpu_shape_size_tensor); + } + auto *cpu_data = cpu_shape_size_tensor->data(); + int shape_size = cpu_data[0]; + VLOG(3) << "recv the shape size: " << shape_size << " from peer"; + + // step2: recv the shape + framework::Tensor gpu_shape_tensor(shape_dytpe); + if (!group) { + gpu_shape_tensor.Resize({shape_size}); + gpu_shape_tensor.mutable_data(place, shape_dytpe); + auto *gpu_shape_data = gpu_shape_tensor.data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + gpu_shape_data, shape_size, nccl_dtype, peer, comm->comm(), stream)); + } + + // copy the shape tensor to cpu + framework::Tensor *cpu_shape_tensor = new framework::Tensor(shape_dytpe); + cpu_shape_tensor->Resize({shape_size}); + cpu_shape_tensor->mutable_data(platform::CPUPlace(), shape_dytpe); + if (group) { + std::vector shape_tensor; + shape_tensor.emplace_back(*cpu_shape_tensor); + auto shape_task = group->Recv(shape_tensor, peer); + } else { + framework::TensorCopySync(gpu_shape_tensor, platform::CPUPlace(), + cpu_shape_tensor); + } + auto *cpu_shape_data = cpu_shape_tensor->data(); + std::vector all_shape; + for (int i = 0; i < shape_size; ++i) { + all_shape.emplace_back(cpu_shape_data[i]); + } + framework::DDim new_dim; + new_dim = new_dim.reshape(all_shape); + VLOG(3) << "recv the shape: (" << new_dim << ") from peer"; + + return new_dim; +} +#endif + template class RecvOpV2CUDAKernel : public framework::OpKernel { public: @@ -32,6 +111,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { #if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ NCCL_VERSION_CODE >= 2703 int rid = ctx.Attr("ring_id"); + bool dynamic_shape = ctx.Attr("dynamic_shape"); PADDLE_ENFORCE_GE( rid, 0, platform::errors::InvalidArgument( @@ -53,7 +133,18 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto out_shape = ctx.Attr>("out_shape"); auto out = ctx.Output("Out"); auto out_dims = out->dims(); - out->mutable_data(out_dims, place); + + if (dynamic_shape) { + 
VLOG(3) << "recv_v2 will use dynamic shape with send_v2 for switch"; + framework::DDim new_dim = + recv_shape_info(ctx.GetPlace(), + /* gpuStream_t */ nullptr, + /* NCCLComm* */ nullptr, peer, pg); + out->Resize(new_dim); + out->mutable_data(new_dim, place); + } else { + out->mutable_data(out_dims, place); + } out_tensor.emplace_back(*out); auto task = pg->Recv(out_tensor, peer); @@ -79,6 +170,10 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto *out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { + PADDLE_ENFORCE_EQ( + dynamic_shape, false, + platform::errors::InvalidArgument("Dynamic shape for send/recv not " + "support LoDTensorArray for now.")); auto out_array = out_var->GetMutable(); for (size_t idx = 0; idx < out_array->size(); ++idx) { VLOG(3) << "LodTensorArray: idx(" << idx << ")"; @@ -99,7 +194,16 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto out_dims = out->dims(); auto numel = out->numel(); - out->mutable_data(out_dims, place); + if (dynamic_shape) { + VLOG(3) << "recv_v2 will use dynamic shape with send_v2"; + framework::DDim new_dim = recv_shape_info(place, stream, comm, peer, + /* ProcessGroup* */ nullptr); + out->Resize(new_dim); + numel = out->numel(); + out->mutable_data(new_dim, place); + } else { + out->mutable_data(out_dims, place); + } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << phi::product(out->dims()) @@ -122,4 +226,5 @@ REGISTER_OP_CUDA_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, + ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index 753a33268cc95..d685dd561bc74 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -70,6 +70,10 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "dynamic_shape", + "(bool default false) the send/recv will be done with dynamic shape.") + .SetDefault(false); AddComment(R"DOC( Send Operator diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index add352306fa28..8878b7c3449b9 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -24,6 +24,76 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 +void send_shape_info(const framework::Tensor& x, const platform::Place& place, + const gpuStream_t& stream, platform::NCCLComm* comm, + const int& peer, distributed::ProcessGroup* group) { + if (!group) { + PADDLE_ENFORCE_EQ((stream != nullptr && comm != nullptr), true, + platform::errors::InvalidArgument( + "NCCLComm and Stream should be provided if use NCCL " + "to send the shape info.")); + } + paddle::experimental::DataType shape_dytpe = + paddle::experimental::DataType::INT32; + ncclDataType_t nccl_dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dytpe)); + auto dims = x.dims(); + int shape_size = dims.size(); + + // step1: send the shape size + framework::Tensor cpu_shape_size_tensor(shape_dytpe); + cpu_shape_size_tensor.Resize({1}); + cpu_shape_size_tensor.mutable_data(platform::CPUPlace(), shape_dytpe); + auto* cpu_data = cpu_shape_size_tensor.data(); + cpu_data[0] = shape_size; + + if (group) { + std::vector shape_size_tensor; + shape_size_tensor.template emplace_back(cpu_shape_size_tensor); + auto shape_size_task = group->Send(shape_size_tensor, peer); + } else { + // copy the shape size tensor to gpu and send + framework::Tensor* gpu_shape_size_tensor = + new framework::Tensor(shape_dytpe); + gpu_shape_size_tensor->Resize({1}); + gpu_shape_size_tensor->mutable_data(place, shape_dytpe); + framework::TensorCopySync(cpu_shape_size_tensor, place, + gpu_shape_size_tensor); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclSend(gpu_shape_size_tensor->data(), 1, + nccl_dtype, peer, comm->comm(), stream)); + } + VLOG(3) << "send the shape size: " << shape_size << " to peer"; + + // step2: send the shape + framework::Tensor cpu_shape_tensor(shape_dytpe); + cpu_shape_tensor.Resize({shape_size}); + cpu_shape_tensor.mutable_data(platform::CPUPlace(), shape_dytpe); + auto* cpu_shape_data = cpu_shape_tensor.data(); + for (int i = 0; i < shape_size; ++i) { + cpu_shape_data[i] = dims[i]; + } + + if (group) { + std::vector shape_tensor; + shape_tensor.template emplace_back(cpu_shape_tensor); + auto shape_task = group->Send(shape_tensor, peer); + } else { + // copy the shape tensor to gpu and send + framework::Tensor* gpu_shape_tensor = new framework::Tensor(shape_dytpe); + gpu_shape_tensor->Resize({shape_size}); + gpu_shape_tensor->mutable_data(place, shape_dytpe); + framework::TensorCopySync(cpu_shape_tensor, place, gpu_shape_tensor); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclSend(gpu_shape_tensor->data(), shape_size, + nccl_dtype, peer, comm->comm(), stream)); + } + VLOG(3) << "send the shape: (" << dims << ") to peer"; +} +#endif + template class SendOpV2CUDAKernel : public framework::OpKernel { public: @@ -31,6 +101,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { #if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ NCCL_VERSION_CODE >= 2703 int rid = ctx.Attr("ring_id"); + bool dynamic_shape = ctx.Attr("dynamic_shape"); PADDLE_ENFORCE_GE( rid, 0, platform::errors::InvalidArgument( @@ -45,8 +116,17 @@ class SendOpV2CUDAKernel : public framework::OpKernel { if (map->has(rid)) { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(rid); - std::vector in_tensor; auto x = ctx.Input("X"); + + if (dynamic_shape) { + // dynamic shape for switch send/recv + VLOG(3) << "send_v2 will use dynamic shape with recv_v2 for switch"; + send_shape_info(*x, ctx.GetPlace(), + /* gpuStream_t */ 
nullptr, + /* NCCLComm* */ nullptr, peer, pg); + } + + std::vector in_tensor; in_tensor.push_back(*x); auto task = pg->Send(in_tensor, peer); return; @@ -68,6 +148,10 @@ class SendOpV2CUDAKernel : public framework::OpKernel { auto* x_var = ctx.InputVar("X"); if (x_var->IsType()) { + PADDLE_ENFORCE_EQ( + dynamic_shape, false, + platform::errors::InvalidArgument("Dynamic shape for send/recv not " + "support LoDTensorArray for now.")); auto& x_array = x_var->Get(); for (size_t idx = 0; idx < x_array.size(); idx++) { VLOG(3) << "LodTensorArray: idx(" << idx << ")"; @@ -85,6 +169,12 @@ class SendOpV2CUDAKernel : public framework::OpKernel { auto x = ctx.Input("X"); int numel = x->numel(); + if (dynamic_shape) { + VLOG(3) << "send_v2 will use dynamic shape with recv_v2"; + send_shape_info(*x, place, stream, comm, peer, + /* ProcessGroup* */ nullptr); + } + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( @@ -109,4 +199,5 @@ REGISTER_OP_CUDA_KERNEL(send_v2, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, + ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 2d7382f3dfd70..882630467a012 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -41,7 +41,6 @@ class CSendOpASCENDKernel : public framework::OpKernel { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(ring_id); std::vector in_tensor; - auto x = ctx.Input("X"); in_tensor.push_back(*x); auto task = pg->Send(in_tensor, 1); return; diff --git a/paddle/fluid/operators/compat/fill_constant.pbtxt b/paddle/fluid/operators/compat/fill_constant.pbtxt index 26fecf623c19c..62701eeb396da 100644 --- a/paddle/fluid/operators/compat/fill_constant.pbtxt +++ b/paddle/fluid/operators/compat/fill_constant.pbtxt @@ -58,4 +58,8 @@ extra { name: "op_device" type: STRING } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } } diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 6bf419c47a566..fd06e33a6bb6e 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -17,6 +17,10 @@ limitations under the License. */ #include "paddle/fluid/operators/assign_op.h" #include "paddle/phi/kernels/funcs/math_function.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -65,6 +69,12 @@ class ConditionalBlockOp : public ConditionalOp { scopes->resize(1); scopes->front() = &scope.NewScope(); auto &cur_scope = *scopes->front(); +#ifdef PADDLE_WITH_MKLDNN + // (jczaja) Executor on being destroyed clears oneDNN cache and + // reset registered model data layout. 
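The send_shape_info/recv_shape_info helpers added above implement a small handshake: the sender first transmits the tensor rank as a single int32, then the dims, and only afterwards the payload, so the receiver can Resize the output before the data arrives. The sketch below reproduces only that ordering; a std::deque stands in for the NCCL/ProcessGroup channel, and all names in it are illustrative, not Paddle API.

// Minimal single-process illustration of the dynamic_shape send/recv handshake.
#include <cstdio>
#include <deque>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::deque<int> channel;  // stand-in for the peer-to-peer link

  // Sender: rank of the tensor first, then the dims, then the flattened data.
  std::vector<int> dims = {2, 3};
  channel.push_back(static_cast<int>(dims.size()));
  for (int d : dims) channel.push_back(d);
  for (int i = 0; i < 2 * 3; ++i) channel.push_back(i);

  // Receiver: read the rank, rebuild the dims, size the output, take the payload.
  int shape_size = channel.front();
  channel.pop_front();
  std::vector<int> recv_dims(shape_size);
  for (int& d : recv_dims) {
    d = channel.front();
    channel.pop_front();
  }
  int numel = std::accumulate(recv_dims.begin(), recv_dims.end(), 1,
                              std::multiplies<int>());
  std::vector<int> out(numel);
  for (int& v : out) {
    v = channel.front();
    channel.pop_front();
  }

  std::printf("received a %d-D tensor with %d elements\n", shape_size, numel);
  return 0;
}

Exchanging the rank and dims ahead of the payload is what allows recv_v2's InferShape to skip the static out_shape check when dynamic_shape is set.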
This is unwanted for nested + // Executors (executors declared inside control ops) + platform::DontClearMKLDNNCache(dev_place); +#endif framework::Executor exec(dev_place); auto *block = Attr("sub_block"); VLOG(3) << "Conditional block.idx = " << block->ID() diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index eb44655c88f18..d8daa25f31be8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -17,6 +17,9 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace framework { class InferShapeContext; @@ -66,6 +69,12 @@ class WhileOp : public framework::OperatorBase { "the Condition's shape is ", cond.dims().to_str(), ".\n")); +#ifdef PADDLE_WITH_MKLDNN + // (jczaja) Executor on being destroyed clears oneDNN cache and + // resets registered model data layout. This is unwanted for nested + // Executors (executors declared inside control ops) + platform::DontClearMKLDNNCache(dev_place); +#endif framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 419fb8a4ca703..3044aa6cf6c5a 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() { static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { if (!use_fixed_workspace) { int device_id = platform::GetCurrentDeviceId(); - int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); - int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", device_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", device_id); int64_t availble = platform::GpuAvailableMemToAlloc(); VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) << " MB, reserved=" << ToMegaBytes(reserved) diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index a5d888765bf37..58f2eeee256db 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -43,7 +43,7 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation, output_size, 0, platform::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. The output's size is computed by " "((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / " "stride + 1), where input_size is %d, padding is %d, " "filter_size is %d, dilation is %d, stride is %d.", @@ -60,7 +60,7 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation, output_size, 0, platform::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. 
The output's size is computed by " "((input_size + padding_1 + padding_2 - (dilation * (filter_size - " "1) + 1)) / stride + 1), where input_size is %d, padding is " "(%d, %d), filter_size is %d, dilation is %d, stride is %d.", @@ -90,7 +90,7 @@ inline void UpdatePaddingAndDilation(std::vector* paddings, platform::errors::InvalidArgument( "Attribute padding's size should be the same or twice as the " "input's dimension. " - "But recieved: padding's size is %d, padding is [%s]; input's " + "But received: padding's size is %d, padding is [%s]; input's " "dimension is %d, input's shape is [%s].", paddings->size(), phi::make_ddim(*paddings), data_dims.size(), data_dims)); diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc index 1ee772ec72950..c1517dbe16f84 100644 --- a/paddle/fluid/operators/conv_op_mlu.cc +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -98,7 +98,7 @@ class MLUConvOpKernel : public framework::OpKernel { output_desc.get(), GetBasePtr(&output_tensor)); if (!channel_last) { - // transpose ouput from NHWC to NCHW + // transpose output from NHWC to NCHW const std::vector perm_to_nchw = {0, 3, 1, 2}; TransposeFromMLUTensor(ctx, perm_to_nchw, &output_tensor, output, false /*need_reshape_or_alloc*/); diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index f2beb4cec212e..9de5bc6ea3636 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -97,7 +97,7 @@ Crop Operator. Crop input into output, as specified by offsets and shape. There are two ways to set the offsets: -1. In runtime: Using the input 'Offsets', which is a Vairbale and can be +1. In runtime: Using the input 'Offsets', which is a Variable and can be output of other operators. This way is suitable for dynamic offsets. 2. In network configuration: Using the attribute 'offsets', which will be diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index b1f2e61ef3930..ba90c677570c5 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -26,19 +26,19 @@ __global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, const size_t num_seq, size_t* lod0, const int blank, const int merge_repeated, size_t* out_lod0, T* output) { - int ouput_idx = 0; + int output_idx = 0; out_lod0[0] = 0; for (int i = 0; i < num_seq; ++i) { T pre_token = -1; for (int j = lod0[i]; j < lod0[i + 1]; ++j) { if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { - output[ouput_idx] = tokens[j]; - ++ouput_idx; + output[output_idx] = tokens[j]; + ++output_idx; } pre_token = tokens[j]; } - out_lod0[i + 1] = ouput_idx; + out_lod0[i + 1] = output_idx; } } diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 972dea38f5746..798fd93006620 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -172,17 +172,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int seed = ctx.Attr("seed"); if (!is_test) { - int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed == 0) { - // If perform `manual_seed` in python and inner seed is not specified - // (equals 0), use global generator generated seed. + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. 
+ int device_id = ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::DefaultCUDAGenerator(device_id); seed = static_cast(gen_cuda->Random64()); - } else if (seed == 0) { - // use random generated seed - std::random_device rd; - seed = rd(); - } // else use `ctx.Attr("seed")` specified seed + } + // else use `ctx.Attr("seed")` specified seed } bool has_seq_length = ctx.HasInput("SequenceLength"); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 448f67a4bad7a..873950b2d2f65 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -200,7 +200,7 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { num_rois, out->dims()[0], platform::errors::InvalidArgument( "The number of Input(ROIs) should be same with the number of " - "Ouput(Output), but received ROIs number is:%d, Output number " + "Output(Output), but received ROIs number is:%d, Output number " "is:%d.", num_rois, out->dims()[0])); const int count = num_rois * output_dim * pooled_height * pooled_width; diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 51a0fe4172ca2..3deabce54ed0b 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -175,7 +175,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { num_rois, out->dims()[0], platform::errors::InvalidArgument( "The number of Input(ROIs) should be same with the number of " - "Ouput(Output), but received ROIs number is:%d, Output number " + "Output(Output), but received ROIs number is:%d, Output number " "is:%d.", num_rois, out->dims()[0])); framework::Tensor roi_batch_id_list; diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 713c2dc7fe9c1..3353739b01bf6 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -385,7 +385,7 @@ independently for each class. The outputs is a 2-D LoDTenosr, for each image, the offsets in first dimension of LoDTensor are called LoD, the number of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, means there is no detected bbox for this image. Now this operator has one more -ouput, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of +output, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of detected bbox for this image. For more information on Matrix NMS, please refer to: diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index ac9c440076257..b1bf5e2778167 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -15,9 +15,11 @@ limitations under the License. 
*/ #pragma once #include #include "dgc/dgc.h" + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { @@ -153,18 +155,18 @@ class DGCOpKernel : public framework::OpKernel { u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e); // v = u + v + g - ElementwiseComputeEx, DeviceContext, T>( - ctx, u, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, phi::funcs::AddFunctor(), v_out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, g, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, g, v, 0, phi::funcs::AddFunctor(), v_out); } else { // u = m * u + g u_out_e.device(eigen_ctx) = m * u_e + grad_out_e; // v = u + v - ElementwiseComputeEx, DeviceContext, T>( - ctx, u, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, phi::funcs::AddFunctor(), v_out); } T* v_out_data = v_out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/dirichlet_op.cu b/paddle/fluid/operators/dirichlet_op.cu index 63f9c7339bfc5..ac6480a8fa1c6 100644 --- a/paddle/fluid/operators/dirichlet_op.cu +++ b/paddle/fluid/operators/dirichlet_op.cu @@ -77,7 +77,7 @@ struct DirichletSampler { // init state, seed & offset for all threads int device_id = ctx.GetPlace().GetDeviceId(); - auto p_gen = framework::GetDefaultCUDAGenerator(device_id); + auto p_gen = framework::DefaultCUDAGenerator(device_id); auto seed_and_offset = p_gen->IncrementOffset(10); // hard-coded offset auto seed = seed_and_offset.first; auto offset = seed_and_offset.second; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index c62d45570ba29..571a1c97c52e8 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -26,7 +26,7 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const int offset, uint64_t* seed_data, uint64_t* increment) { int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); if (seed) { framework::Tensor seed_cpu_tensor; @@ -34,13 +34,12 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, &seed_cpu_tensor); *seed_data = static_cast(seed_cpu_tensor.data()[0]); *increment = offset; - } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { + } else if (!is_fix_seed) { auto seed_offset = gen_cuda->IncrementOffset(offset); *seed_data = seed_offset.first; *increment = seed_offset.second; } else { - std::random_device rnd; - *seed_data = is_fix_seed ? 
seed_val : rnd(); + *seed_data = seed_val; *increment = offset; } } diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc index b88974a51ceff..f4dbbae05532e 100644 --- a/paddle/fluid/operators/dropout_op_mlu.cc +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -82,7 +82,7 @@ class DropoutMLUKernel : public framework::OpKernel { *x, ctx.GetPlace(), ctx.template device_context(), out); } else { - float scale = static_cast(1.0f - dropout_prob); + auto scale = static_cast(1.0f - dropout_prob); Tensor scale_tensor(x->dtype()); scale_tensor.mutable_data({1}, ctx.GetPlace()); MLUCnnlTensorDesc scale_desc(scale_tensor); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 07b3b53811625..104ab1b504640 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -54,7 +54,7 @@ class DropoutNPUKernel : public framework::OpKernel { return; } - // only achive the default `upscale_in_train` method + // only achieve the default `upscale_in_train` method if (!is_test) { Tensor tmp_x(x->dtype()); Tensor tmp_out(out->dtype()); diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 8fdde1ccdc058..6da0045443ccc 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -33,6 +33,13 @@ class EinsumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Operands", "(TensorList), The input tensor of einsum op.") .AsDuplicable(); AddOutput("Out", "(Tensor), The output tensor of einsum op."); + AddOutput( + "InnerCache", + "(Tensor), The cache of the forward transpose tensors: tA and tB.") + .AsDuplicable() + .AsExtra() + .AsIntermediate(); + AddAttr("equation", "(string) A einsum equation. such as `ij,jk->ik`" "There must have `->` and the number of operands in " @@ -72,6 +79,7 @@ class EinsumGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr retv) const override { retv->SetType("einsum_grad"); retv->SetInput("Operands", this->Input("Operands")); + retv->SetInput("InnerCache", this->Output("InnerCache")); retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); retv->SetAttrMap(this->Attrs()); retv->SetOutput(framework::GradVarName("Operands"), diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 53037c1fa6536..ed9b98a128a21 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
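Several hunks above (class_center_sample, cudnn_lstm, dirichlet, dropout_impl_util) drop the GetIsInitPy()/std::random_device branches and always take the per-device default generator when no fixed seed is supplied. The toy class below illustrates only the contract those call sites rely on — the generator hands back its seed plus a starting offset and reserves a block of random states per launch — and is not the framework::Generator implementation.

// Toy model of the seed/offset bookkeeping used by the updated kernels.
#include <cstdint>
#include <cstdio>
#include <utility>

class ToyGenerator {
 public:
  explicit ToyGenerator(uint64_t seed) : seed_(seed), offset_(0) {}

  // Returns (seed, offset this launch starts from) and reserves `increment`
  // states so the next launch begins past them.
  std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t increment) {
    uint64_t start = offset_;
    offset_ += increment;
    return {seed_, start};
  }

 private:
  uint64_t seed_;
  uint64_t offset_;
};

int main() {
  ToyGenerator gen(/*seed=*/42);
  auto first = gen.IncrementOffset(16);   // e.g. one dropout launch
  auto second = gen.IncrementOffset(16);  // the next launch never overlaps
  std::printf("launch 1: seed=%llu offset=%llu\n",
              static_cast<unsigned long long>(first.first),
              static_cast<unsigned long long>(first.second));
  std::printf("launch 2: seed=%llu offset=%llu\n",
              static_cast<unsigned long long>(second.first),
              static_cast<unsigned long long>(second.second));
  return 0;
}

Because the offset only moves forward, repeated launches draw disjoint ranges of the random stream, which removes the need for the std::random_device fallback deleted in these hunks.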
*/ -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -125,17 +123,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - grad_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); - REGISTER_OP_VERSION(elementwise_add) .AddCheckpoint( R"ROC(Register elementwise_add for adding the attribute of diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h deleted file mode 100644 index d77d4ed036394..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef __xpu__ -#include -#include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#else -#include -#include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -// only can include the headers in paddle/phi/include dirs -#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" -#include "paddle/phi/kernels/elementwise_add_kernel.h" -#endif - -namespace paddle { -namespace operators { - -template -class ElementwiseAddKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef __xpu__ - std::vector ins; - std::vector outs; - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - const auto& xpu_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel, 1>( - xpu_ctx, ins, &outs, axis, kps::AddFunctor()); -#else - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - *x, *y, axis, z); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps deleted file mode 100644 index ecd52a310acdb..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_XPU_KP - -// Please do not modify the following code -#if defined(__CUDA_ARCH__) -#undef __CUDA_ARCH__ -#endif - -#if defined(__CUDACC__) -#undef __CUDACC__ -#endif - -#if defined(__CUDA__) -#undef __CUDA__ -#endif - -#if defined(__NVCC__) -#undef __NVCC__ -#endif - -#include // NOLINT -#include "xpu/kernel/cluster_header.h" // NOLINT -#include "xpu/kernel/debug.h" // NOLINT -#include "xpu/kernel/math.h" // NOLINT - -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#else -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise_grad.h" -#endif - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace, - ops::ElementwiseAddKernel); -#else -REGISTER_OP_CUDA_KERNEL( - grad_add, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); -#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index d35e3f6641b45..178aa329577b7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index feb73abf3ff08..22a5de4c60941 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc new file mode 100644 index 0000000000000..e003a43b5c56b --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseHeavisideOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Heaviside"; } + std::string GetEquation() const override { return "Out = Heaviside(X, Y)"; } + + void AddInputX() override { + AddInput("X", + "(Tensor), The input tensor of Heaviside step function. " + "Its dtype can be int32, int64, float32 and float64"); + } + + void AddInputY() override { + AddInput("Y", + "(Tensor), The tensor determining a Heaviside step function, " + "which is the value when X = 0. Its dtype should be same as X."); + } + + std::string GetOpFuntionality() const override { + return "Computes the Heaviside step function determined by Y " + "for each element in X."; + } +}; + +template +class ElementwiseHeavisideGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("elementwise_heaviside_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Y", this->Input("Y")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + elementwise_heaviside, ops::ElementwiseOp, ops::ElementwiseHeavisideOpMaker, + ops::ElementwiseHeavisideGradOpMaker, + ops::ElementwiseHeavisideGradOpMaker); + +REGISTER_OPERATOR(elementwise_heaviside_grad, ops::ElementwiseOpGrad); diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index 156cea81c0f63..ff1e12103be91 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -165,7 +165,7 @@ template void MLUUnary(const framework::ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t ouput_desc, void* output); + const cnnlTensorDescriptor_t output_desc, void* output); template <> inline void MLUUnary(const framework::ExecutionContext& ctx, diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index adc0842fb3882..95753bb336354 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -103,11 +103,12 @@ class ElementwiseOp : public framework::OperatorWithKernel { std::vector out_dims_array(max_dim); #ifdef PADDLE_WITH_MKLDNN // (jczaja): Broadcasting of dims has to be done on Paddle shapes (NHWC) - // if model is using NHWC. 
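// A minimal scalar sketch of the semantics described above for the
// elementwise_heaviside operator (the Heaviside step of X, with Y supplying
// the value at X == 0, as the AddInputY description states). Illustrative
// only, assuming that reading; it is not the registered kernel.
#include <cassert>

template <typename T>
T HeavisideRef(T x, T y) {
  if (x < static_cast<T>(0)) return static_cast<T>(0);  // negative inputs map to 0
  if (x > static_cast<T>(0)) return static_cast<T>(1);  // positive inputs map to 1
  return y;                                             // at exactly 0, Y decides the value
}

int main() {
  assert(HeavisideRef(-2.0f, 0.5f) == 0.0f);
  assert(HeavisideRef(0.0f, 0.5f) == 0.5f);
  assert(HeavisideRef(3.0f, 0.5f) == 1.0f);
  return 0;
}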
+ // if model is using NHWC and any of the shapes is at least 3-D bool should_rotate = ctx->IsRunMKLDNNKernel() && (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == - framework::DataLayout::kNHWC); + framework::DataLayout::kNHWC) && + (x_dims.size() >= 3 || y_dims.size() >= 3); if (should_rotate) { // Pick bigger shape and rotate this one bool x_over_y = (x_dims.size() > y_dims.size()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 3e9263fe93acd..39a80e9571b29 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -75,7 +75,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx, paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); - float expected; + float expected = 0.0; if (op_type == "elementwise_add") { expected = 3.0; } else if (op_type == "elementwise_sub") { @@ -133,7 +133,7 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec); ctx.Wait(); - float expected_x, expected_y; + float expected_x = 0, expected_y = 0; if (op_type == "elementwise_add_grad") { expected_x = 1.0; expected_y = 6.0; diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 33518953004ae..6e646f0d4bf26 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -75,7 +75,7 @@ class FCOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute in_num_col_dims used to flatten Input to " "a 2-D tensor, is expected to be less than the number of " - "Input's dimensions. But recieved in_num_col_dims is %d, " + "Input's dimensions. But received in_num_col_dims is %d, " "the number of Input's dimensions is %d, Input's shape is %s.", in_num_col_dims, in_dims.size(), in_dims)); @@ -93,7 +93,7 @@ class FCOp : public framework::OperatorWithKernel { in_dims.size() >= 2 && in_dims.size() <= 4, true, platform::errors::Unimplemented( "The Input of fc is expected to be a 2-D, 3-D or 4-D tensor when " - "use_mkldnn is set. But recieved the number of Input's " + "use_mkldnn is set. But received the number of Input's " "dimensions is %d, Input's shape is %s.", in_dims.size(), in_dims)); } diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 6d3b531ce0aa6..47c7128603587 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -36,7 +36,7 @@ inline void FCOutputSize(const framework::DDim& in_dims, in_mat_dims[1], w_dims0, platform::errors::InvalidArgument( "The input's second dimension and weight's first dimension is " - "expected to be the same. But recieved input's second dimension is " + "expected to be the same. 
But received input's second dimension is " "%d, input's shape is %s; weight's first dimension is %d, weight's " "shape is %s.", in_mat_dims[1], in_mat_dims, w_dims0, diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index ca46a3db1ecd5..07593a70f05b7 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -22,17 +22,17 @@ class FillConstantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FillConstant"); - auto& shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { for (size_t i = 0; i < shape.size(); ++i) { PADDLE_ENFORCE_GE( shape[i], 0, platform::errors::InvalidArgument( "Each value of attribute 'shape' is expected to be no less " - "than 0. But recieved: shape[%u] = %d; shape = [%s].", + "than 0. But received: shape[%u] = %d; shape = [%s].", i, shape[i], phi::make_ddim(shape))); } } @@ -52,8 +52,8 @@ class FillConstantOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { return expected_kernel_type; } else { @@ -63,7 +63,7 @@ class FillConstantOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( framework::proto::VarType::Type(ctx.Attr("dtype")), ctx.GetPlace()); @@ -97,13 +97,24 @@ class FillConstantOp : public framework::OperatorWithKernel { } } +#ifdef PADDLE_WITH_MKLDNN + auto input_data_type = + framework::proto::VarType::Type(ctx.Attr("dtype")); + + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return kt; } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext* ctx) const override { + void operator()(framework::InferVarTypeContext *ctx) const override { auto data_type = static_cast( BOOST_GET_CONST(int, ctx->GetAttr("dtype"))); ctx->SetOutputDataType("Out", data_type); @@ -156,6 +167,10 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { "3: XPUPlace. " "4: NPUPlace. 
") .SetDefault(-1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); AddOutput("Out", "(Tensor) Tensor of specified shape will be filled " "with the specified value"); diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index cb03add314327..edd8613ba525d 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; -#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001) +#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504000) runner.SetType("FillD") .AddInput(tensor_value) .AddOutput(*out_var) diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 92f59e118c3b7..9c9183c8fafa4 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -76,47 +76,47 @@ class FoldOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(kernel_height, 0, platform::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); PADDLE_ENFORCE_GT(kernel_width, 0, platform::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); // check strides PADDLE_ENFORCE_GT(stride_height, 0, platform::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); PADDLE_ENFORCE_GT(stride_width, 0, platform::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations PADDLE_ENFORCE_GT(output_height, 1, platform::errors::InvalidArgument( "The `output_height` should be greater than one, " - "but recieved output_height: %d .", + "but received output_height: %d .", output_height)); PADDLE_ENFORCE_GT(output_width, 1, platform::errors::InvalidArgument( "The `output_width` should be greater than one, " - "but recieved output_width: %d .", + "but received output_width: %d .", output_width)); // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); PADDLE_ENFORCE_GT( dilation_width, 0, platform::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); std::vector out_dims; diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 68b9051d85831..03351dbca09e5 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -11,6 +11,8 @@ register_operators(EXCLUDES 
fused_fc_elementwise_layernorm_op multihead_matmul_op skip_layernorm_op + yolo_box_head_op + yolo_box_post_op fused_embedding_eltwise_layernorm_op fusion_group_op fusion_gru_op @@ -53,6 +55,8 @@ if (WITH_GPU OR WITH_ROCM) # multihead_matmul_op op_library(multihead_matmul_op) op_library(skip_layernorm_op) + op_library(yolo_box_head_op) + op_library(yolo_box_post_op) op_library(fused_embedding_eltwise_layernorm_op) # fusion_group if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 3a2de0c4a0935..b059223eaf6e7 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -51,8 +51,7 @@ template use_broadcast, uint32_t numel, - phi::Array, MAX_INPUT_NUM> - configlists, + phi::Array configlists, int main_tid, int tail_tid, Functor func) { int fix = blockIdx.x * blockDim.x * VecSize; int num = tail_tid; @@ -65,14 +64,14 @@ __global__ void BroadcastKernelBinary( // load in0 if (use_broadcast[0]) { - kernel_primitives::ReadDataBc( + kernel_primitives::ReadDataBc( arg0, in0, fix, configlists[0], numel); } else { kernel_primitives::ReadData(arg0, in0 + fix, num); } // load in1 if (use_broadcast[1]) { - kernel_primitives::ReadDataBc( + kernel_primitives::ReadDataBc( arg1, in1, fix, configlists[1], numel); } else { kernel_primitives::ReadData(arg1, in1 + fix, num); @@ -104,7 +103,7 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, int main_tid = numel / (data_per_thread * vec_size * threads); int tail_tid = numel % (data_per_thread * vec_size * threads); - phi::Array, MAX_INPUT_NUM> configlists; + phi::Array configlists; phi::Array use_broadcast; use_broadcast[0] = false; @@ -115,7 +114,7 @@ void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, // Here, dims are transposed due to the logic in BroadcastConfig. std::vector input1_dims = {n, 1}; std::vector out_dims = {n, m}; - configlists[1] = kps::details::BroadcastConfig<2>(out_dims, input1_dims, 2); + configlists[1] = kps::details::BroadcastConfig(out_dims, input1_dims, 2); auto func = AddFunctor(); auto stream = ctx.stream(); diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index c4e73c6bf97fd..9542f0742ea34 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -14,9 +14,10 @@ limitations under the License. 
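// A minimal CPU sketch of the broadcast bias add that LaunchBiasAddFwKernel
// above performs on the GPU through BroadcastConfig: a length-n bias vector
// is added to every row of an m x n, row-major matrix. Illustrative only;
// the function name below is hypothetical.
#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> BiasAddRef(const std::vector<T>& x,     // m * n, row-major
                          const std::vector<T>& bias,  // n
                          int m, int n) {
  std::vector<T> out(static_cast<size_t>(m) * n);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      // bias is broadcast along the row dimension
      out[static_cast<size_t>(i) * n + j] = x[static_cast<size_t>(i) * n + j] + bias[j];
    }
  }
  return out;
}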
*/ #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { @@ -67,9 +68,8 @@ class AttnMatMul { ins.emplace_back(bias); outs.emplace_back(bias_out); int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, elewise_add_axis, phi::funcs::AddFunctor()); } } diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index e60fc44e9a6ff..671e94061cb5c 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -80,7 +80,7 @@ class Conv2DFusionOp : public operators::ConvOp { data_format, "NHWC", platform::errors::PermissionDenied( "Operator(Conv2DFusion) only supports data format of " - "channel first (NCHW) now. But recieved: data_format = '%s'.", + "channel first (NCHW) now. But received: data_format = '%s'.", data_format)); std::vector output_shape = ComputeOutputShape(ctx); @@ -113,7 +113,7 @@ class Conv2DFusionOp : public operators::ConvOp { split_channels_sum, output_shape[1], platform::errors::InvalidArgument( "The sum of Attr(split_channels) is expected to be equal to the " - "total output channels. But recieved: the sum of " + "total output channels. But received: the sum of " "Attr(split_channels) = %d, the total output channels = %d.", split_channels_sum, output_shape[1])); diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 5dbf4fb88b2a7..8191c85f2a120 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -130,7 +130,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { default: PADDLE_THROW(platform::errors::PermissionDenied( "Operator Conv2DFusion expects Input to be a 4-D or 5-D Tensor. " - "But recieved the actual dimension = %d, shape = [%s].", + "But received the actual dimension = %d, shape = [%s].", rank, transformed_input_channel.dims())); } @@ -355,7 +355,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_in_bytes, workspace_size_limit, platform::errors::InvalidArgument( "The actual workspace size to be allocated for cuDNN is expected " - "to be less than the limit. But recieved: the actual workspace " + "to be less than the limit. But received: the actual workspace " "size = %d, limit = %d.", workspace_size_in_bytes, workspace_size_limit)); @@ -414,7 +414,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } else { // TODO(qingiqng): do copy when batch size large than 1 PADDLE_THROW(platform::errors::Unimplemented( - "Input with batch size greater than 1 is unsupported. The recieved " + "Input with batch size greater than 1 is unsupported. 
The received " "batch size is %d, Input's shape is [%s].", x_dims[0], phi::make_ddim(x_dims))); } diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index c5adee547bdac..516b10fa021c1 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -103,7 +103,7 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { // x is in NHWC format. - auto dims = cpu_x.dims(); + const auto &dims = cpu_x.dims(); int64_t c = dims[3]; const T *cpu_x_ptr = cpu_x.data(); diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 74cc92eb8ab62..4b3ed56890e18 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -40,7 +40,7 @@ struct BNStatsFinalizeArgs { PADDLE_ENFORCE_EQ( param_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of param_shape is expected to 4. But recieved " + "The size of param_shape is expected to 4. But received " "param_shape's size is %d, param_shape is [%s].", param_shape.size(), phi::make_ddim(param_shape))); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index f63fe4b96cbeb..b32f2e40933ac 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -45,13 +45,13 @@ struct NormConvolutionArgs { PADDLE_ENFORCE_EQ( input_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of input_shape is expected to 4. But recieved " + "The size of input_shape is expected to 4. But received " "input_shape's size is %d, input_shape is [%s].", input_shape.size(), phi::make_ddim(input_shape))); PADDLE_ENFORCE_EQ( filter_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of filter_shape is expected to 4. But recieved " + "The size of filter_shape is expected to 4. But received " "filter_shape's size is %d, filter_shape is [%s].", filter_shape.size(), phi::make_ddim(filter_shape))); PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] && @@ -59,20 +59,20 @@ struct NormConvolutionArgs { true, platform::errors::InvalidArgument( "The filter_shape is expected to store as nhwc, and " - "h = w = 1 or 3. But recieved filter_shape is [%s].", + "h = w = 1 or 3. But received filter_shape is [%s].", phi::make_ddim(filter_shape))); PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0), true, platform::errors::InvalidArgument( "The input channel is expected to be multiple of 8, " "and the output channel is expected to be multiple " - "of 32. But recieved input channel is %d, output " + "of 32. But received input channel is %d, output " "channel is %d.", filter_shape[3], filter_shape[0])); PADDLE_ENFORCE_EQ( output_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of output_shape is expected to 4. But recieved " + "The size of output_shape is expected to 4. But received " "filter_shape's size is %d, filter_shape is [%s].", output_shape.size(), phi::make_ddim(output_shape))); is_support = IsSupport(ctx, filter_shape, stride, dilation, group); @@ -83,7 +83,7 @@ struct NormConvolutionArgs { "compatiblity greater than or equal to 70 and the kernel size " "must be equal to 1 or 3. 
When the kernel size is 1, " "the stride must be 1 if the compatiblity is equal to 70. " - "Besides, the dilation and group must be equal to 1. But recieved " + "Besides, the dilation and group must be equal to 1. But received " "compatiblity is %d, kernel size is %d, stride is %d, " "dilation is %d, group is %d", ctx.GetComputeCapability(), filter_shape[1], stride, dilation, diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 884fca2c1b0b8..5881322007add 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -51,7 +51,7 @@ void InitRandomTensor(const std::vector &dims, template void TransposeNchwToNhwc(const framework::Tensor &cpu_in, framework::Tensor *cpu_out) { - auto in_dims = cpu_in.dims(); + const auto &in_dims = cpu_in.dims(); EXPECT_EQ(cpu_in.dims().size(), 4); const T *cpu_in_ptr = cpu_in.data(); @@ -184,7 +184,7 @@ template void ComputeSumAndSquareSum(const framework::Tensor &cpu_out, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { - auto dims = cpu_out.dims(); + const auto &dims = cpu_out.dims(); int64_t c = dims[3]; const T *cpu_out_ptr = cpu_out.data(); diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 9d3090a7179f0..c8588b0c02e9d 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -43,19 +43,19 @@ struct ScaleBiasAddReluArgs { PADDLE_ENFORCE_EQ( data_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of data_shape is expected to 4. But recieved " + "The size of data_shape is expected to 4. But received " "data_shape's size is %d, data_shape is [%s].", data_shape.size(), phi::make_ddim(data_shape))); PADDLE_ENFORCE_EQ( param_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of param_shape is expected to 4. But recieved " + "The size of param_shape is expected to 4. But received " "param_shape's size is %d, param_shape is [%s].", param_shape.size(), phi::make_ddim(param_shape))); PADDLE_ENFORCE_EQ( bitmask_shape.size(), 3U, platform::errors::InvalidArgument( - "The size of bitmask_shape is expected to 3. But recieved " + "The size of bitmask_shape is expected to 3. But received " "bitmask_shape's size is %d, bitmask_shape is [%s].", bitmask_shape.size(), phi::make_ddim(bitmask_shape))); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 6eb5881112f89..0e9fba73933b7 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -12,11 +12,12 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" @@ -148,18 +149,24 @@ class FMHARef { stride_b); int softmax_axis = -1; if (src_mask_tensor != nullptr) { - std::vector ins; - std::vector outs; - ins.emplace_back(qk_out_tensor); - ins.emplace_back(src_mask_tensor); - outs.emplace_back(src_mask_out_tensor); - int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); + if (src_mask_out_tensor == nullptr && seq_len_ == out_seq_len) { + LaunchFusedSoftmaxMaskKernel(qk_out_data, src_mask_tensor->data(), + softmax_out_data, batch_size_, + num_head_, seq_len_, dev_ctx_.stream()); + } else { + std::vector ins; + std::vector outs; + ins.emplace_back(qk_out_tensor); + ins.emplace_back(src_mask_tensor); + outs.emplace_back(src_mask_out_tensor); + int elewise_add_axis = -1; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, elewise_add_axis, + phi::funcs::AddFunctor()); - phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, - softmax_axis, softmax_out_tensor); + phi::SoftmaxForwardCUDAKernelDriver( + dev_ctx_, *src_mask_out_tensor, softmax_axis, softmax_out_tensor); + } } else { phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, softmax_axis, softmax_out_tensor); diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index e473f8ff0662c..1f377810a2287 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -163,11 +163,15 @@ class FusedAttentionOp : public framework::OperatorWithKernel { "The third dim of CacheKV must be equal with num " "head %d, but got %d", y_dim[1], c_dim[2])); // num_head - PADDLE_ENFORCE_GE( - c_dim[3], 0, - paddle::platform::errors::InvalidArgument( - "The forth dim of CacheKV must be greater than 0, but got %d", - c_dim[3])); // cache_seq_len + // In compile stage, input seq_len can be -1, in that case + // c_dim[3] may < 0 in while + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE( + c_dim[3], 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + } PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2], paddle::platform::errors::InvalidArgument( "The fifth dim of CacheKV must be equal with head " diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index d26577f06fe68..ec8a4d962e808 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -19,7 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/fluid/operators/fused/attention_layer_norm.h" @@ -543,10 +544,9 @@ class FusedAttentionGradKernel : public framework::OpKernel { ins.emplace_back(d_x); outs.emplace_back(d_x); int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( + phi::funcs::BroadcastKernel( ctx.cuda_device_context(), ins, &outs, elewise_add_axis, - AddFunctor()); + phi::funcs::AddFunctor()); } }; diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc index 27dae27751681..1b5b074ef1c71 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc @@ -76,7 +76,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute x_num_col_dims used to flatten input X to " "a 2-D tensor, is expected to be less than the number of " - "input X's dimensions. But recieved x_num_col_dims is %d, " + "input X's dimensions. But received x_num_col_dims is %d, " "the number of input X's dimensions is %d, input X's shape is %s.", x_num_col_dims, x_dims.size(), x_dims)); @@ -85,7 +85,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { x_mat_dims[1], w_dims[0], platform::errors::InvalidArgument( "The input's second dimension and weight's first dimension is " - "expected to be the same. But recieved input's second dimension is " + "expected to be the same. But received input's second dimension is " "%d, input's shape is %s; weight's first dimension is %d, weight's " "shape is %s.", x_mat_dims[1], x_mat_dims, w_dims[0], w_dims)); @@ -100,7 +100,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(phi::make_ddim(fc_out_dims), y_dims, platform::errors::InvalidArgument( "The output's shape of fc is expected to be equal to " - "that of input Y. But recieved output's shape of fc " + "that of input Y. But received output's shape of fc " "is %s, input Y's shape is %s.", phi::make_ddim(fc_out_dims), y_dims)); @@ -110,7 +110,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute begin_norm_axis used to flatten input Y to a 2-D " "tensor, is expected to be less than the number of input Y's " - "dimensions. But recieved begin_norm_axis is %d, the number of " + "dimensions. But received begin_norm_axis is %d, the number of " "input Y's dimensions is %d, input Y's shape is %s.", begin_norm_axis, y_dims.size(), y_dims)); @@ -122,7 +122,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(scale_dims.size(), 1, platform::errors::InvalidArgument( "The input Scale is expected to be an 1-D tensor. 
" - "But recieved the number of input Scale's " + "But received the number of input Scale's " "dimensions is %d, input Scale's shape is %s.", scale_dims.size(), scale_dims)); @@ -132,7 +132,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The first dimension of input Scale is expected to be equal to " "the second dimension of input Y after flattened. " - "But recieved the first dimension of input Scale is %d, input " + "But received the first dimension of input Scale is %d, input " "Scale's shape is %s; the second dimension of flattened input " "Y is %d, input Y's shape is %s, flattened axis is %d.", scale_dims[0], scale_dims, dim_1, y_dims, begin_norm_axis)); @@ -144,7 +144,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { bias1_dims.size(), 1, platform::errors::InvalidArgument( "The input Bias1 is expected to be an 1-D tensor. " - "But recieved the number of input Bias1's dimension is %d, " + "But received the number of input Bias1's dimension is %d, " "input Bias1's shape is %s.", bias1_dims.size(), bias1_dims)); @@ -154,7 +154,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The first dimension of input Bias1 is expected to be equal to " "the second dimension of input Y after flattened. " - "But recieved the first dimension of input Bias1 is %d, input " + "But received the first dimension of input Bias1 is %d, input " "Bias1's shape is %s; the second dimension of flatten input " "Y is %d, input Y's shape is %s, flattened axis is %d.", bias1_dims[0], bias1_dims, dim_1, y_dims, begin_norm_axis)); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index c38d9f7d4bcbd..2eb9885286dab 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -17,9 +17,10 @@ limitations under the License. */ #include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" @@ -345,9 +346,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { ins[1] = d_x; outs[0] = d_x; int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - ctx, ins, &outs, elewise_add_axis, AddFunctor()); + phi::funcs::BroadcastKernel( + ctx, ins, &outs, elewise_add_axis, phi::funcs::AddFunctor()); } void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index e38ac9a0ad2da..fe93d323c59bc 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/fluid/operators/fused/attention_layer_norm.h" @@ -1084,11 +1083,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto *qk_out_data = qk_out.mutable_data({bsz, num_head, seq_len, out_seq_len}, place); - Tensor src_mask_out, softmax_out; + Tensor softmax_out; Tensor attn_dropout_mask_out, attn_dropout_out; Tensor qktv_out, fmha_out; - auto *src_mask_out_data = src_mask_out.mutable_data( - {bsz, num_head, seq_len, out_seq_len}, place); auto *softmax_out_data = softmax_out.mutable_data( {bsz, num_head, seq_len, out_seq_len}, place); @@ -1219,10 +1216,10 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { 1. / sqrt(dim_head)); } else if (cache_kv_out) { // generation context stage // TODO(wangxi): can remove dropout in inference - fmha_compute.ComputeForward( - qkv_out, nullptr, src_mask, &transpose_out_2, nullptr, &qk_out, - &src_mask_out, &softmax_out, &attn_dropout_mask_out, - &attn_dropout_out, &qktv_out, &fmha_out); + fmha_compute.ComputeForward(qkv_out, nullptr, src_mask, + &transpose_out_2, nullptr, &qk_out, nullptr, + &softmax_out, &attn_dropout_mask_out, + &attn_dropout_out, &qktv_out, &fmha_out); // [3, bsz, num_head, seq_len, head_dim] T *qkv_data = transpose_out_2_data; int64_t q_size = bsz * seq_len * num_head * dim_head; @@ -1245,7 +1242,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // TODO(wangxi): can remove dropout in inference fmha_compute.ComputeForward( qkv_out, cache_kv, src_mask, &transpose_out_2, cache_kv_out, - &qk_out, &src_mask_out, &softmax_out, &attn_dropout_mask_out, + &qk_out, nullptr, &softmax_out, &attn_dropout_mask_out, &attn_dropout_out, &qktv_out, &fmha_out); } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h new file mode 100644 index 0000000000000..11f1011dec3a2 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -0,0 +1,204 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +namespace plat = paddle::platform; + +#define FINAL_MASK 0xffffffff +#define DIV_UP(x, y) (((x) + (y)-1) / (y)) + +template +__inline__ __device__ T warpReduceSum(T val) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) + val += __shfl_xor_sync(FINAL_MASK, val, mask, 32); + return val; +} + +template +__inline__ __device__ T warpReduceMax(T val) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) + val = max(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); + return val; +} + +inline int ElementsCeil(int seq_len) { + int elements = 1; + while (elements * 32 < seq_len) elements *= 2; + return elements; +} + +template +__global__ void FusedSoftmaxMaskVecKernel(T* dst, const T* src, const T* mask, + int seq_len) { + constexpr int block_size = 128; + constexpr int warp_size = 32; + constexpr int warps_per_block = block_size / warp_size; + + // blockDim/threadIdx = (warp_size, warps_per_block) + // gridDim/blockIdx = (DIV_UP(seq_len, warps_per_block), batch_size, head_num) + // every block processes 4(warps_per_block) sequences + // seq_id = seq_id * 4 + warp_id, eg.seq_len=128, 127=31*4+3 + int seq_id = blockIdx.x * warps_per_block + threadIdx.y; + if (seq_id >= seq_len) return; + + // ((bid*head_num + hid)*seq_len + seq_id) * seq_len + int offset = + ((blockIdx.y * gridDim.z + blockIdx.z) * seq_len + seq_id) * seq_len; + // (bid * seq_len + seq_id) * seq_len + int mask_offset = (blockIdx.y * seq_len + seq_id) * seq_len; + src += offset; + dst += offset; + mask += mask_offset; + + static_assert(ELEMENTS_PER_THREADS % VEC_SIZE == 0, ""); + constexpr int VEC_NUMS = ELEMENTS_PER_THREADS / VEC_SIZE; + using VecT = phi::AlignedVector; + + VecT elements[VEC_NUMS]; + VecT tmp_mask; + float max_val = -std::numeric_limits::infinity(); + + for (int i = 0; (i * warp_size + threadIdx.x) * VEC_SIZE < seq_len; ++i) { + phi::Load(src + (i * warp_size + threadIdx.x) * VEC_SIZE, &elements[i]); + phi::Load(mask + (i * warp_size + threadIdx.x) * VEC_SIZE, &tmp_mask); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + // TODO(wangxi): vec add + elements[i][j] += tmp_mask[j]; + max_val = max(max_val, static_cast(elements[i][j])); + } + } + max_val = warpReduceMax(max_val); + + float sum_val = 0; + for (int i = 0; (i * warp_size + threadIdx.x) * VEC_SIZE < seq_len; ++i) { +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + float tmp = __expf(static_cast(elements[i][j]) - max_val); + sum_val += tmp; + elements[i][j] = static_cast(tmp); + } + } + sum_val = warpReduceSum(sum_val); + float mean_val = __fdividef(1.0f, sum_val + 1e-6f); + + for (int i = 0; (i * warp_size + threadIdx.x) * VEC_SIZE < seq_len; ++i) { +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + float tmp = static_cast(elements[i][j]) * mean_val; + elements[i][j] = static_cast(tmp); + } + phi::Store(elements[i], dst + (i * warp_size + threadIdx.x) * VEC_SIZE); + } +} + +#define SOFTMAX_MASK_KERNEL(VEC_SIZE, ELEMENTS) \ + FusedSoftmaxMaskVecKernel<<>>( \ + dst, src, mask, seq_len) + +// FIXME(wangxi): It is found that the performance of VEC_SIZE=2 is better +// than that of =4 and =8. Further analysis of the kernel is needed later. 
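// A minimal CPU sketch of what FusedSoftmaxMaskVecKernel above computes for
// one row of length seq_len, namely dst = softmax(src + mask), with the row
// max subtracted for numerical stability and the same 1e-6 guard on the
// denominator. Illustrative only; the function name below is hypothetical.
#include <algorithm>
#include <cmath>
#include <limits>

inline void SoftmaxMaskRowRef(const float* src, const float* mask, float* dst,
                              int seq_len) {
  float max_val = -std::numeric_limits<float>::infinity();
  for (int i = 0; i < seq_len; ++i) {
    dst[i] = src[i] + mask[i];            // add the attention mask first
    max_val = std::max(max_val, dst[i]);
  }
  float sum_val = 0.f;
  for (int i = 0; i < seq_len; ++i) {
    dst[i] = std::exp(dst[i] - max_val);  // subtract the row max for stability
    sum_val += dst[i];
  }
  const float inv = 1.f / (sum_val + 1e-6f);
  for (int i = 0; i < seq_len; ++i) {
    dst[i] *= inv;
  }
}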
+// #define SELECT_SOFTMAX_MASK_KERNEL(ELEMENTS) \ +// do { \ +// if (sizeof(T) == 2 && seq_len % 8 == 0) { \ +// FusedSoftmaxMaskVecKernel \ +// <<>>( \ +// (plat::float16*)dst, (const plat::float16*)src, mask, seq_len); \ +// } \ +// else if (seq_len % 4 == 0) SOFTMAX_MASK_KERNEL(4, ELEMENTS); \ +// else if (seq_len % 2 == 0) SOFTMAX_MASK_KERNEL(2, ELEMENTS); \ +// else SOFTMAX_MASK_KERNEL(1, ELEMENTS); \ +// } while(0) + +#define SELECT_SOFTMAX_MASK_KERNEL(ELEMENTS) \ + do { \ + if (seq_len % 2 == 0) { \ + SOFTMAX_MASK_KERNEL(2, ELEMENTS); \ + } else { \ + SOFTMAX_MASK_KERNEL(1, ELEMENTS); \ + } \ + } while (0) + +#define CASE_SOFTMAX_MASK_KERNEL(ELEMENTS) \ + case ELEMENTS: { \ + SELECT_SOFTMAX_MASK_KERNEL(ELEMENTS); \ + break; \ + } + +// template +template +void LaunchFusedSoftmaxMaskKernel(const T* src, const T* mask, T* dst, + const int batch_size, const int head_num, + const int seq_len, cudaStream_t stream) { + PADDLE_ENFORCE_EQ( + seq_len > 0 && seq_len <= 4096, true, + platform::errors::InvalidArgument("seq_len must be between (0, 4096] " + "received the seq_len is %d", + seq_len)); + + constexpr int block_size = 128; + constexpr int warp_size = 32; + constexpr int warps_per_block = block_size / warp_size; + + // put head_num to the outside for mask + dim3 block(warp_size, warps_per_block); + dim3 grid(DIV_UP(seq_len, warps_per_block), batch_size, head_num); + + // clang-format off + int elements = ElementsCeil(seq_len); + switch (elements) { + case 1: { // <=32 + SOFTMAX_MASK_KERNEL(1, 1); + break; + } + case 2: { // <=64 + // if (seq_len % 2 == 0) SOFTMAX_MASK_KERNEL(2, 2); + // else SOFTMAX_MASK_KERNEL(1, 2); + SELECT_SOFTMAX_MASK_KERNEL(2); + break; + } + case 4: { // <=128 + // if (seq_len % 4 == 0) SOFTMAX_MASK_KERNEL(4, 4); + // else if (seq_len % 2 == 0) SOFTMAX_MASK_KERNEL(2, 4); + // else SOFTMAX_MASK_KERNEL(1, 4); + SELECT_SOFTMAX_MASK_KERNEL(4); + break; + } + CASE_SOFTMAX_MASK_KERNEL(8); // <=256 + CASE_SOFTMAX_MASK_KERNEL(16); // <=512 + CASE_SOFTMAX_MASK_KERNEL(32); // <=1024 + CASE_SOFTMAX_MASK_KERNEL(64); // <=2048 + CASE_SOFTMAX_MASK_KERNEL(128); // <=4096 + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "seq_len must be between (0, 4096], received the seq_len is %d", + seq_len)); + } + // clang-format on +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc index 738e069081511..1ebbdf792df85 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cc @@ -52,7 +52,7 @@ class FusionGroupOp : public framework::OperatorWithKernel { x_dims[0], x_dims[i], platform::errors::InvalidArgument( "All the inputs' dims is expected to be the same. 
" - "But recieved [%s] (name: %s) vs [%s] (name: %s).", + "But received [%s] (name: %s) vs [%s] (name: %s).", x_dims[0], input_names[0], x_dims[i], input_names[i])); } std::vector out_dims; diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index acb94e20df8cb..bed5125b99583 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -130,7 +130,7 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { int weight_sz = static_cast(weights.size()); auto i_dims = in->dims(); - auto w_dims = weights[0]->dims(); + const auto& w_dims = weights[0]->dims(); jit::matmul_attr_t attr; attr.m = i_dims[0]; attr.n = w_dims[1]; @@ -140,8 +140,8 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { relus[0]->mutable_data(place), attr); for (int i = 1; i < weight_sz - 1; ++i) { - auto i_dims = relus[i - 1]->dims(); - auto w_dims = weights[i]->dims(); + const auto& i_dims = relus[i - 1]->dims(); + const auto& w_dims = weights[i]->dims(); attr.m = i_dims[0]; attr.n = w_dims[1]; attr.k = w_dims[0]; @@ -150,8 +150,8 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { biases[i]->data(), relus[i]->mutable_data(place), attr); } - auto i_dims_last = relus[weight_sz - 2]->dims(); - auto w_dims_last = weights[weight_sz - 1]->dims(); + const auto& i_dims_last = relus[weight_sz - 2]->dims(); + const auto& w_dims_last = weights[weight_sz - 1]->dims(); attr.m = i_dims_last[0]; attr.n = w_dims_last[1]; attr.k = w_dims_last[0]; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index 91bc855d43c83..e574d67e3982c 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -91,8 +91,8 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); std::string pooltype = ctx.Attr("pooltype"); auto x0_lod = ins[0]->lod(); - auto x0_dims = ins[0]->dims(); - auto y_dims = out->dims(); + const auto& x0_dims = ins[0]->dims(); + const auto& y_dims = out->dims(); size_t bs = x0_lod[0].size() - 1; out->Resize({static_cast(bs), y_dims[1]}); framework::LoD y_lod(1); @@ -122,7 +122,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { - auto x_dims = ins[i]->dims(); + const auto& x_dims = ins[i]->dims(); auto x_lod = ins[i]->lod()[0]; const T* src = ins[i]->data(); T* dst = y_data + i * w; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index 123c4c885ead8..c74cc504840d3 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -92,8 +92,8 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); std::string pooltype = ctx.Attr("pooltype"); auto x0_lod = ins[0]->lod(); - auto x0_dims = ins[0]->dims(); - auto y_dims = out->dims(); + const auto& x0_dims = ins[0]->dims(); + const auto& y_dims = out->dims(); size_t bs = x0_lod[0].size() - 1; out->Resize({static_cast(bs), y_dims[1]}); framework::LoD y_lod(1); @@ -121,7 +121,7 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { size_t n = ins.size(); size_t dst_step_size = n * w; for 
(size_t i = 0; i < n; ++i) { - auto x_dims = ins[i]->dims(); + const auto& x_dims = ins[i]->dims(); auto x_lod = ins[i]->lod()[0]; const T* src = ins[i]->data(); T* dst = y_data + i * w; diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cc b/paddle/fluid/operators/fused/yolo_box_head_op.cc new file mode 100644 index 0000000000000..58df4e61bbbdf --- /dev/null +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class YoloBoxHeadOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "yolo_box_head"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "yolo_box_head"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; + +class YoloBoxHeadOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "The input tensor"); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair."); + AddAttr("class_num", "The number of classes to predict."); + AddOutput("Out", "The output tensor"); + AddComment(R"DOC( + yolo_box_head Operator. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolo_box_head, ops::YoloBoxHeadOp, ops::YoloBoxHeadOpMaker); diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cu b/paddle/fluid/operators/fused/yolo_box_head_op.cu new file mode 100644 index 0000000000000..4c79e22d1a536 --- /dev/null +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cu @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
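// A minimal CPU sketch of the per-channel activation applied by the
// yolo_box_head CUDA kernel in the .cu file below, assuming the usual YOLO
// layout of (5 + class_num) channels per anchor, [x, y, w, h, objectness,
// class scores...]: sigmoid on x, y, objectness and the class scores, exp on
// w and h. Illustrative only; the names below are hypothetical.
#include <cmath>

inline float SigmoidRef(float x) { return 1.f / (1.f + std::exp(-x)); }

inline void YoloHeadDecodeCellRef(const float* in, float* out, int grids_num,
                                  int bbindex, int anchor, int class_num) {
  // same channel indexing as the CUDA kernel:
  // index = bbindex + grids_num * (anchor * (5 + class_num) + channel)
  auto idx = [&](int c) {
    return bbindex + grids_num * (anchor * (5 + class_num) + c);
  };
  out[idx(0)] = SigmoidRef(in[idx(0)]);  // x
  out[idx(1)] = SigmoidRef(in[idx(1)]);  // y
  out[idx(2)] = std::exp(in[idx(2)]);    // w
  out[idx(3)] = std::exp(in[idx(3)]);    // h
  out[idx(4)] = SigmoidRef(in[idx(4)]);  // objectness
  for (int c = 0; c < class_num; ++c) {
    out[idx(5 + c)] = SigmoidRef(in[idx(5 + c)]);  // class scores
  }
}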
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +template +inline __device__ T SigmoidGPU(const T& x) { + return 1.0f / (1.0f + __expf(-x)); +} + +template +__global__ void YoloBoxHeadCudaKernel(const T* input, T* output, + const int grid_size_x, + const int grid_size_y, + const int class_num, + const int anchors_num) { + int x_id = blockIdx.x * blockDim.x + threadIdx.x; + int y_id = blockIdx.y * blockDim.y + threadIdx.y; + int z_id = blockIdx.z * blockDim.z + threadIdx.z; + if ((x_id >= grid_size_x) || (y_id >= grid_size_y) || (z_id >= anchors_num)) { + return; + } + const int grids_num = grid_size_x * grid_size_y; + const int bbindex = y_id * grid_size_x + x_id; + + // objectness + output[bbindex + grids_num * (z_id * (5 + class_num) + 4)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]); + // x + output[bbindex + grids_num * (z_id * (5 + class_num) + 0)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 0)]); + // y + output[bbindex + grids_num * (z_id * (5 + class_num) + 1)] = + SigmoidGPU(input[bbindex + grids_num * (z_id * (5 + class_num) + 1)]); + // w + output[bbindex + grids_num * (z_id * (5 + class_num) + 2)] = + __expf(input[bbindex + grids_num * (z_id * (5 + class_num) + 2)]); + // h + output[bbindex + grids_num * (z_id * (5 + class_num) + 3)] = + __expf(input[bbindex + grids_num * (z_id * (5 + class_num) + 3)]); + // Probabilities of classes + for (int i = 0; i < class_num; ++i) { + output[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))] = + SigmoidGPU( + input[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))]); + } +} + +template +class YoloBoxHeadKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using Tensor = framework::Tensor; + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto anchors = context.Attr>("anchors"); + auto class_num = context.Attr("class_num"); + auto& device_ctx = + context.template device_context(); + auto x_dims = x->dims(); + const int batch_size = x_dims[0]; + const int h = x_dims[2]; + const int w = x_dims[3]; + const int grid_size_x = w; + const int grid_size_y = h; + const int anchors_num = anchors.size() / 2; + const T* input_data = x->data(); + T* output_data = out->mutable_data(context.GetPlace()); + auto stream = device_ctx.stream(); + const int volume = x_dims[1] * h * w; + dim3 block(16, 16, 4); + dim3 grid((grid_size_x / block.x) + 1, (grid_size_y / block.y) + 1, + (anchors_num / block.z) + 1); + for (int n = 0; n < batch_size; n++) { + YoloBoxHeadCudaKernel<<>>( + input_data + n * volume, output_data + n * volume, grid_size_x, + grid_size_y, class_num, anchors_num); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(yolo_box_head, ops::YoloBoxHeadKernel); diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cc b/paddle/fluid/operators/fused/yolo_box_post_op.cc new file mode 100644 index 0000000000000..674944173698b --- /dev/null +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class YoloBoxPostOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("Boxes0"), "Input", "Boxes0", "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasInput("Boxes1"), "Input", "Boxes1", "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasInput("Boxes2"), "Input", "Boxes2", "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasInput("ImageShape"), "Input", "ImageShape", + "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasInput("ImageScale"), "Input", "ImageScale", + "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "yolo_box_post"); + OP_INOUT_CHECK(ctx->HasOutput("NmsRoisNum"), "Output", "NmsRoisNum", + "yolo_box_post"); + } +}; + +class YoloBoxPostOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("Boxes0", "The Boxes0 tensor"); + AddInput("Boxes1", "The Boxes1 tensor"); + AddInput("Boxes2", "The Boxes2 tensor"); + AddInput("ImageShape", "The height and width of each input image."); + AddInput("ImageScale", "The scale factor of ImageShape."); + AddAttr>("anchors0", "The anchors of Boxes0."); + AddAttr>("anchors1", "The anchors of Boxes1."); + AddAttr>("anchors2", "The anchors of Boxes2."); + AddAttr("class_num", "The number of classes to predict."); + AddAttr("conf_thresh", + "The confidence scores threshold of detection boxes. " + "Boxes with confidence scores under threshold should " + "be ignored."); + AddAttr("downsample_ratio0", "The downsample ratio of Boxes0."); + AddAttr("downsample_ratio1", "The downsample ratio of Boxes1."); + AddAttr("downsample_ratio2", "The downsample ratio of Boxes2."); + AddAttr("clip_bbox", + "Whether to clip the output bounding box in the Input(ImgSize) " + "boundary. Default true."); + AddAttr("scale_x_y", + "Scale the center point of decoded bounding " + "box. Default 1.0"); + AddAttr("nms_threshold", "The threshold to be used in NMS."); + AddOutput("Out", "The output tensor"); + AddOutput("NmsRoisNum", "The output RoIs tensor"); + AddComment(R"DOC( + yolo_box_post Operator. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolo_box_post, ops::YoloBoxPostOp, ops::YoloBoxPostOpMaker); diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu new file mode 100644 index 0000000000000..4438a4c7dd812 --- /dev/null +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -0,0 +1,519 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +struct Box { + float x, y, w, h; +}; + +struct Detection { + Box bbox; + int classes; + float* prob; + float* mask; + float objectness; + int sort_class; + int max_prob_class_index; +}; + +struct TensorInfo { + int bbox_count_host; // record bbox numbers + int bbox_count_max_alloc{50}; + float* bboxes_dev_ptr; + float* bboxes_host_ptr; + int* bbox_count_device_ptr; // Box counter in gpu memory, used by atomicAdd +}; + +static int NMSComparator(const void* pa, const void* pb) { + const Detection a = *reinterpret_cast(pa); + const Detection b = *reinterpret_cast(pb); + if (a.max_prob_class_index > b.max_prob_class_index) + return 1; + else if (a.max_prob_class_index < b.max_prob_class_index) + return -1; + + float diff = 0; + if (b.sort_class >= 0) { + diff = a.prob[b.sort_class] - b.prob[b.sort_class]; + } else { + diff = a.objectness - b.objectness; + } + + if (diff < 0) + return 1; + else if (diff > 0) + return -1; + return 0; +} + +static float Overlap(float x1, float w1, float x2, float w2) { + float l1 = x1 - w1 / 2; + float l2 = x2 - w2 / 2; + float left = l1 > l2 ? l1 : l2; + float r1 = x1 + w1 / 2; + float r2 = x2 + w2 / 2; + float right = r1 < r2 ? r1 : r2; + return right - left; +} + +static float BoxIntersection(Box a, Box b) { + float w = Overlap(a.x, a.w, b.x, b.w); + float h = Overlap(a.y, a.h, b.y, b.h); + if (w < 0 || h < 0) return 0; + float area = w * h; + return area; +} + +static float BoxUnion(Box a, Box b) { + float i = BoxIntersection(a, b); + float u = a.w * a.h + b.w * b.h - i; + return u; +} + +static float BoxIOU(Box a, Box b) { + return BoxIntersection(a, b) / BoxUnion(a, b); +} + +static void PostNMS(std::vector* det_bboxes, float thresh, + int classes) { + int total = det_bboxes->size(); + if (total <= 0) { + return; + } + + Detection* dets = det_bboxes->data(); + int i, j, k; + k = total - 1; + for (i = 0; i <= k; ++i) { + if (dets[i].objectness == 0) { + Detection swap = dets[i]; + dets[i] = dets[k]; + dets[k] = swap; + --k; + --i; + } + } + total = k + 1; + + qsort(dets, total, sizeof(Detection), NMSComparator); + + for (i = 0; i < total; ++i) { + if (dets[i].objectness == 0) continue; + Box a = dets[i].bbox; + for (j = i + 1; j < total; ++j) { + if (dets[j].objectness == 0) continue; + if (dets[j].max_prob_class_index != dets[i].max_prob_class_index) break; + Box b = dets[j].bbox; + if (BoxIOU(a, b) > thresh) { + dets[j].objectness = 0; + for (k = 0; k < classes; ++k) { + dets[j].prob[k] = 0; + } + } + } + } +} + +__global__ void YoloBoxNum(const float* input, int* bbox_count, + const int grid_size, const int class_num, + const int anchors_num, float prob_thresh) { + int x_id = blockIdx.x * blockDim.x + threadIdx.x; + int y_id = blockIdx.y * blockDim.y + threadIdx.y; + int z_id = blockIdx.z * blockDim.z + threadIdx.z; + if ((x_id >= grid_size) || (y_id >= grid_size) || (z_id >= anchors_num)) { + return; + } + + const int grids_num = grid_size * grid_size; + const int bbindex = y_id * grid_size + 
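The Box helpers above work in centre/width/height coordinates: Overlap returns the 1-D intersection length, BoxIntersection and BoxUnion combine both axes, and PostNMS walks detections sorted by class and score, zeroing any box whose IoU with an already-kept box of the same predicted class exceeds the threshold. A small worked example of BoxIOU with illustrative values:

Box a{0.5f, 0.5f, 1.0f, 1.0f};  // unit square centred at (0.5, 0.5)
Box b{1.0f, 1.0f, 1.0f, 1.0f};  // unit square centred at (1.0, 1.0)
// intersection = 0.5 * 0.5 = 0.25, union = 1 + 1 - 0.25 = 1.75
float iou = BoxIOU(a, b);       // 0.25 / 1.75 ≈ 0.143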
x_id; + float objectness = input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]; + if (objectness < prob_thresh) { + return; + } + + atomicAdd(bbox_count, 1); +} + +__global__ void YoloTensorParseKernel( + const float* input, const float* im_shape_data, const float* im_scale_data, + float* output, int* bbox_index, const int grid_size, const int class_num, + const int anchors_num, const int netw, const int neth, int* biases, + float prob_thresh) { + int x_id = blockIdx.x * blockDim.x + threadIdx.x; + int y_id = blockIdx.y * blockDim.y + threadIdx.y; + int z_id = blockIdx.z * blockDim.z + threadIdx.z; + if ((x_id >= grid_size) || (y_id >= grid_size) || (z_id >= anchors_num)) { + return; + } + + const float pic_h = im_shape_data[0] / im_scale_data[0]; + const float pic_w = im_shape_data[1] / im_scale_data[1]; + const int grids_num = grid_size * grid_size; + const int bbindex = y_id * grid_size + x_id; + float objectness = input[bbindex + grids_num * (z_id * (5 + class_num) + 4)]; + if (objectness < prob_thresh) { + return; + } + + int cur_bbox_index = atomicAdd(bbox_index, 1); + int tensor_index = cur_bbox_index * (5 + class_num); + + // x + float x = input[bbindex + grids_num * (z_id * (5 + class_num) + 0)]; + x = (x + static_cast(x_id)) * static_cast(pic_w) / + static_cast(grid_size); + // y + float y = input[bbindex + grids_num * (z_id * (5 + class_num) + 1)]; + y = (y + static_cast(y_id)) * static_cast(pic_h) / + static_cast(grid_size); + // w + float w = input[bbindex + grids_num * (z_id * (5 + class_num) + 2)]; + w = w * biases[2 * z_id] * pic_w / netw; + // h + float h = input[bbindex + grids_num * (z_id * (5 + class_num) + 3)]; + h = h * biases[2 * z_id + 1] * pic_h / neth; + + output[tensor_index] = objectness; + output[tensor_index + 1] = x - w / 2; + output[tensor_index + 2] = y - h / 2; + output[tensor_index + 3] = x + w / 2; + output[tensor_index + 4] = y + h / 2; + output[tensor_index + 1] = + output[tensor_index + 1] > 0 ? output[tensor_index + 1] : 0.f; + output[tensor_index + 2] = + output[tensor_index + 2] > 0 ? output[tensor_index + 2] : 0.f; + output[tensor_index + 3] = output[tensor_index + 3] < pic_w - 1 + ? output[tensor_index + 3] + : pic_w - 1; + output[tensor_index + 4] = output[tensor_index + 4] < pic_h - 1 + ? 
output[tensor_index + 4] + : pic_h - 1; + + // Probabilities of classes + for (int i = 0; i < class_num; ++i) { + float prob = + input[bbindex + grids_num * (z_id * (5 + class_num) + (5 + i))] * + objectness; + output[tensor_index + 5 + i] = prob; + } +} + +static void YoloTensorParseCuda( + const float* input_data, // [in] YOLO_BOX_HEAD layer output + const float* image_shape_data, const float* image_scale_data, + float** bboxes_tensor_ptr, // [out] Bounding boxes output tensor + int* bbox_count_max_alloc, // [in/out] maximum bounding Box number + // allocated in dev + int* bbox_count_host, // [in/out] bounding boxes number recorded in host + int* bbox_count_device_ptr, // [in/out] bounding boxes number calculated + // in + // device side + int* bbox_index_device_ptr, // [in] bounding Box index for kernel threads + // shared access + int grid_size, int class_num, int anchors_num, int netw, int neth, + int* biases_device, float prob_thresh) { + dim3 threads_per_block(16, 16, 4); + dim3 number_of_blocks((grid_size / threads_per_block.x) + 1, + (grid_size / threads_per_block.y) + 1, + (anchors_num / threads_per_block.z) + 1); + + // Estimate how many boxes will be choosed + int bbox_count = 0; +#ifdef PADDLE_WITH_HIP + hipMemcpy(bbox_count_device_ptr, &bbox_count, sizeof(int), + hipMemcpyHostToDevice); +#else + cudaMemcpy(bbox_count_device_ptr, &bbox_count, sizeof(int), + cudaMemcpyHostToDevice); +#endif + YoloBoxNum<<>>( + input_data, bbox_count_device_ptr, grid_size, class_num, anchors_num, + prob_thresh); +#ifdef PADDLE_WITH_HIP + hipMemcpy(&bbox_count, bbox_count_device_ptr, sizeof(int), + hipMemcpyDeviceToHost); +#else + cudaMemcpy(&bbox_count, bbox_count_device_ptr, sizeof(int), + cudaMemcpyDeviceToHost); +#endif + + // Record actual bbox number + *bbox_count_host = bbox_count; + + // Obtain previous allocated bbox tensor in device side + float* bbox_tensor = *bboxes_tensor_ptr; + // Update previous maximum bbox number + if (bbox_count > *bbox_count_max_alloc) { +#ifdef PADDLE_WITH_HIP + hipFree(bbox_tensor); + hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#else + cudaFree(bbox_tensor); + cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); +#endif + *bbox_count_max_alloc = bbox_count; + *bboxes_tensor_ptr = bbox_tensor; + } + + // Now generate bboxes + int bbox_index = 0; +#ifdef PADDLE_WITH_HIP + hipMemcpy(bbox_index_device_ptr, &bbox_index, sizeof(int), + hipMemcpyHostToDevice); +#else + cudaMemcpy(bbox_index_device_ptr, &bbox_index, sizeof(int), + cudaMemcpyHostToDevice); +#endif + YoloTensorParseKernel<<>>( + input_data, image_shape_data, image_scale_data, bbox_tensor, + bbox_index_device_ptr, grid_size, class_num, anchors_num, netw, neth, + biases_device, prob_thresh); +} + +template +class YoloBoxPostKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using Tensor = framework::Tensor; + // prepare inputs + std::vector boxes_input(3); + std::vector> boxes_input_dims(3); + for (int i = 0; i < 3; i++) { + auto* boxes_tensor = + context.Input("Boxes" + std::to_string(i)); + boxes_input[i] = boxes_tensor->data(); + auto dims = boxes_tensor->dims(); + for (int j = 0; j < dims.size(); j++) { + boxes_input_dims[i].push_back(dims[j]); + } + } + const float* image_shape_data = + context.Input("ImageShape")->data(); + const float* image_scale_data = + context.Input("ImageScale")->data(); + + // prepare outputs + auto* boxes_scores_tensor = context.Output("Out"); + auto* 
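YoloTensorParseCuda above follows a count / (re)allocate / fill pattern, so the device-side box buffer only grows when the pre-allocated estimate is exceeded. A condensed sketch of that pattern (CountKernel, FillKernel, and the parameter names are illustrative stand-ins for YoloBoxNum and YoloTensorParseKernel; error checking omitted):

#include <cuda_runtime.h>

__global__ void CountKernel(const float* input, int* count);
__global__ void FillKernel(const float* input, float* boxes, int* index);

void TwoPassParse(const float* input, float** boxes_dev, int* capacity,
                  int* count_dev, int* index_dev, int box_stride,
                  dim3 grid, dim3 block) {
  int count = 0;
  cudaMemcpy(count_dev, &count, sizeof(int), cudaMemcpyHostToDevice);
  CountKernel<<<grid, block>>>(input, count_dev);   // pass 1: atomicAdd per box above conf_thresh
  cudaMemcpy(&count, count_dev, sizeof(int), cudaMemcpyDeviceToHost);
  if (count > *capacity) {                          // grow the output buffer if the estimate was too small
    cudaFree(*boxes_dev);
    cudaMalloc(reinterpret_cast<void**>(boxes_dev),
               count * box_stride * sizeof(float));
    *capacity = count;
  }
  int write_idx = 0;
  cudaMemcpy(index_dev, &write_idx, sizeof(int), cudaMemcpyHostToDevice);
  FillKernel<<<grid, block>>>(input, *boxes_dev, index_dev);  // pass 2: atomicAdd reserves one slot per kept box
}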
boxes_num_tensor = context.Output("NmsRoisNum"); + + // prepare anchors + std::vector anchors; + auto anchors0 = context.Attr>("anchors0"); + auto anchors1 = context.Attr>("anchors1"); + auto anchors2 = context.Attr>("anchors2"); + anchors.insert(anchors.end(), anchors0.begin(), anchors0.end()); + anchors.insert(anchors.end(), anchors1.begin(), anchors1.end()); + anchors.insert(anchors.end(), anchors2.begin(), anchors2.end()); + int* device_anchors; +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + hipMemcpy(device_anchors, anchors.data(), anchors.size() * sizeof(int), + hipMemcpyHostToDevice); +#else + cudaMalloc(reinterpret_cast(&device_anchors), + anchors.size() * sizeof(int)); + cudaMemcpy(device_anchors, anchors.data(), anchors.size() * sizeof(int), + cudaMemcpyHostToDevice); +#endif + int* device_anchors_ptr[3]; + device_anchors_ptr[0] = device_anchors; + device_anchors_ptr[1] = device_anchors_ptr[0] + anchors0.size(); + device_anchors_ptr[2] = device_anchors_ptr[1] + anchors1.size(); + std::vector anchors_num{static_cast(anchors0.size()) / 2, + static_cast(anchors1.size()) / 2, + static_cast(anchors2.size()) / 2}; + + // prepare other attrs + int class_num = context.Attr("class_num"); + float conf_thresh = context.Attr("conf_thresh"); + std::vector downsample_ratio{context.Attr("downsample_ratio0"), + context.Attr("downsample_ratio1"), + context.Attr("downsample_ratio2")}; + // clip_bbox and scale_x_y is not used now! + float nms_threshold = context.Attr("nms_threshold"); + + int batch = context.Input("ImageShape")->dims()[0]; + TensorInfo* ts_info = new TensorInfo[batch * boxes_input.size()]; + for (int i = 0; i < batch * static_cast(boxes_input.size()); i++) { +#ifdef PADDLE_WITH_HIP + hipMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#else + cudaMalloc( + reinterpret_cast(&ts_info[i].bboxes_dev_ptr), + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); +#endif + ts_info[i].bboxes_host_ptr = reinterpret_cast(malloc( + ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float))); +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); +#else + cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), + sizeof(int)); +#endif + } + + // Box index counter in gpu memory + // *bbox_index_device_ptr used by atomicAdd + int* bbox_index_device_ptr; +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#else + cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); +#endif + + int total_bbox = 0; + for (int batch_id = 0; batch_id < batch; batch_id++) { + for (int input_id = 0; input_id < boxes_input.size(); input_id++) { + int c = boxes_input_dims[input_id][1]; + int h = boxes_input_dims[input_id][2]; + int w = boxes_input_dims[input_id][3]; + int ts_id = batch_id * boxes_input.size() + input_id; + int bbox_count_max_alloc = ts_info[ts_id].bbox_count_max_alloc; + + YoloTensorParseCuda( + boxes_input[input_id] + batch_id * c * h * w, + image_shape_data + batch_id * 2, image_scale_data + batch_id * 2, + &(ts_info[ts_id].bboxes_dev_ptr), // output in gpu,must use 2-level + // pointer, because we may + // re-malloc + &bbox_count_max_alloc, // bbox_count_alloc_ptr boxes we + // pre-allocate + &(ts_info[ts_id].bbox_count_host), // record bbox numbers + ts_info[ts_id].bbox_count_device_ptr, // for atomicAdd + bbox_index_device_ptr, // 
for atomicAdd + h, class_num, anchors_num[input_id], downsample_ratio[input_id] * h, + downsample_ratio[input_id] * w, device_anchors_ptr[input_id], + conf_thresh); + + // batch info update + if (bbox_count_max_alloc > ts_info[ts_id].bbox_count_max_alloc) { + ts_info[ts_id].bbox_count_max_alloc = bbox_count_max_alloc; + ts_info[ts_id].bboxes_host_ptr = reinterpret_cast( + realloc(ts_info[ts_id].bboxes_host_ptr, + bbox_count_max_alloc * (5 + class_num) * sizeof(float))); + } +// we need copy bbox_count_host boxes to cpu memory +#ifdef PADDLE_WITH_HIP + hipMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + hipMemcpyDeviceToHost); +#else + cudaMemcpyAsync( + ts_info[ts_id].bboxes_host_ptr, ts_info[ts_id].bboxes_dev_ptr, + ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), + cudaMemcpyDeviceToHost); +#endif + total_bbox += ts_info[ts_id].bbox_count_host; + } + } + + boxes_scores_tensor->Resize({total_bbox > 0 ? total_bbox : 1, 6}); + float* boxes_scores_data = + boxes_scores_tensor->mutable_data(platform::CPUPlace()); + memset(boxes_scores_data, 0, sizeof(float) * 6); + boxes_num_tensor->Resize({batch}); + int* boxes_num_data = + boxes_num_tensor->mutable_data(platform::CPUPlace()); + int boxes_scores_id = 0; + + // NMS + for (int batch_id = 0; batch_id < batch; batch_id++) { + std::vector bbox_det_vec; + for (int input_id = 0; input_id < boxes_input.size(); input_id++) { + int ts_id = batch_id * boxes_input.size() + input_id; + int bbox_count = ts_info[ts_id].bbox_count_host; + if (bbox_count <= 0) { + continue; + } + + float* bbox_host_ptr = ts_info[ts_id].bboxes_host_ptr; + for (int bbox_index = 0; bbox_index < bbox_count; ++bbox_index) { + Detection bbox_det; + memset(&bbox_det, 0, sizeof(Detection)); + bbox_det.objectness = bbox_host_ptr[bbox_index * (5 + class_num) + 0]; + bbox_det.bbox.x = bbox_host_ptr[bbox_index * (5 + class_num) + 1]; + bbox_det.bbox.y = bbox_host_ptr[bbox_index * (5 + class_num) + 2]; + bbox_det.bbox.w = + bbox_host_ptr[bbox_index * (5 + class_num) + 3] - bbox_det.bbox.x; + bbox_det.bbox.h = + bbox_host_ptr[bbox_index * (5 + class_num) + 4] - bbox_det.bbox.y; + bbox_det.classes = class_num; + bbox_det.prob = + reinterpret_cast(malloc(class_num * sizeof(float))); + int max_prob_class_id = -1; + float max_class_prob = 0.0; + for (int class_id = 0; class_id < class_num; class_id++) { + float prob = + bbox_host_ptr[bbox_index * (5 + class_num) + 5 + class_id]; + bbox_det.prob[class_id] = prob; + if (prob > max_class_prob) { + max_class_prob = prob; + max_prob_class_id = class_id; + } + } + bbox_det.max_prob_class_index = max_prob_class_id; + bbox_det.sort_class = max_prob_class_id; + bbox_det_vec.push_back(bbox_det); + } + } + PostNMS(&bbox_det_vec, nms_threshold, class_num); + for (int i = 0; i < bbox_det_vec.size(); i++) { + boxes_scores_data[boxes_scores_id++] = + bbox_det_vec[i].max_prob_class_index; + boxes_scores_data[boxes_scores_id++] = bbox_det_vec[i].objectness; + boxes_scores_data[boxes_scores_id++] = bbox_det_vec[i].bbox.x; + boxes_scores_data[boxes_scores_id++] = bbox_det_vec[i].bbox.y; + boxes_scores_data[boxes_scores_id++] = + bbox_det_vec[i].bbox.w + bbox_det_vec[i].bbox.x; + boxes_scores_data[boxes_scores_id++] = + bbox_det_vec[i].bbox.h + bbox_det_vec[i].bbox.y; + free(bbox_det_vec[i].prob); + } + boxes_num_data[batch_id] = bbox_det_vec.size(); + } + +#ifdef PADDLE_WITH_HIP + hipFree(bbox_index_device_ptr); +#else + 
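After per-class NMS, the Compute method above packs each kept detection into Out as six floats and writes the per-image detection counts into NmsRoisNum. A sketch of how a consumer might walk the two outputs (function and parameter names here are illustrative):

// Out        : [total_boxes, 6] float data, already on the CPU
// NmsRoisNum : [batch] int data
void ReadYoloBoxPostOutputs(const float* out_data, const int* rois_num,
                            int batch) {
  int offset = 0;
  for (int b = 0; b < batch; ++b) {
    for (int i = 0; i < rois_num[b]; ++i) {
      const float* det = out_data + (offset + i) * 6;
      // det[0] = class id, det[1] = objectness score,
      // det[2..5] = xmin, ymin, xmax, ymax in the image frame recovered
      //            as ImageShape / ImageScale
      (void)det;
    }
    offset += rois_num[b];
  }
}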
cudaFree(bbox_index_device_ptr); +#endif + for (int i = 0; i < batch * boxes_input.size(); i++) { +#ifdef PADDLE_WITH_HIP + hipFree(ts_info[i].bboxes_dev_ptr); + hipFree(ts_info[i].bbox_count_device_ptr); +#else + cudaFree(ts_info[i].bboxes_dev_ptr); + cudaFree(ts_info[i].bbox_count_device_ptr); +#endif + free(ts_info[i].bboxes_host_ptr); + } + delete[] ts_info; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(yolo_box_post, ops::YoloBoxPostKernel); diff --git a/paddle/fluid/operators/gather_op_mlu.cc b/paddle/fluid/operators/gather_op_mlu.cc index 220d045952643..cf35e051edf87 100644 --- a/paddle/fluid/operators/gather_op_mlu.cc +++ b/paddle/fluid/operators/gather_op_mlu.cc @@ -27,11 +27,28 @@ class GatherOpMLUKernel : public framework::OpKernel { auto *index = ctx.Input("Index"); auto axis = ctx.Attr("axis"); + const auto index_dims = index->dims(); + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), 1, + platform::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + auto *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc x_desc(*x); - MLUCnnlTensorDesc index_desc(*index); + int index_shape_1d[1] = {static_cast(index_dims[0])}; + MLUCnnlTensorDesc index_desc(1, index_shape_1d, + ToCnnlDataType(index->dtype())); MLUCnnlTensorDesc out_desc(*out); MLUCnnl::GatherFunctor(ctx, axis, 0 /*batch_dims*/, x_desc.get(), GetBasePtr(x), index_desc.get(), GetBasePtr(index), @@ -46,6 +63,22 @@ class GatherGradOpMLUKernel : public framework::OpKernel { auto *index = ctx.Input("Index"); auto *dout = ctx.Input(framework::GradVarName("Out")); auto *dx = ctx.Output(framework::GradVarName("X")); + + const auto index_dims = index->dims(); + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), 1, + platform::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + dx->mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc dx_desc(*dx); @@ -53,7 +86,9 @@ class GatherGradOpMLUKernel : public framework::OpKernel { MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx)); - MLUCnnlTensorDesc index_desc(*index); + int index_shape_1d[1] = {static_cast(index_dims[0])}; + MLUCnnlTensorDesc index_desc(1, index_shape_1d, + ToCnnlDataType(index->dtype())); MLUCnnlTensorDesc dout_desc(*dout); const cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE; MLUCnnl::ScatterFunctor(ctx, dx_desc.get(), GetBasePtr(dx), dout_desc.get(), diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 552649279e911..deac932d59b80 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -54,26 +54,21 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = 
true; - } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); int64_t size = tensor->numel(); int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); auto& dev_cxt = context.template device_context(); - if (gen_cuda->GetIsInitPy() && seed_flag) { + if (seed == 0) { + // use global Generator seed auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = GaussianGenerator(mean, std, seed_offset.first, - seed_offset.second); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + auto func = GaussianGenerator(mean, std, seed, size * offset); phi::IndexKernel>(dev_cxt, tensor, func); } else { auto func = GaussianGenerator(mean, std, seed); diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index a7d96437e95c4..de92de453a354 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -17,93 +17,16 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean", - "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance", - "InstanceNorm"); - - const auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_NE(phi::product(x_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable X(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("X").front())); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input X must " - "greater than or equal to 2. But received: the shape of input " - "X = [%s], the dimension of input X =[%d]", - x_dims, x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input X must " - "smaller than or equal to 5, But received: the shape of input " - "X = [%s], the dimension of input X = [%d]", - x_dims, x_dims.size())); - auto N = x_dims[0]; - auto C = x_dims[1]; - auto NxC = N * C; - - if (ctx->HasInput("Scale")) { - auto scale_dim = ctx->GetInputDim("Scale"); - - PADDLE_ENFORCE_EQ( - scale_dim.size(), 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of scale must equal to 1." 
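With the gaussian_random change above, a seed attribute of 0 now means "follow the global generator": the kernel asks the device's default CUDA generator for a (seed, offset) pair and builds the functor from it, while a non-zero attribute keeps the fixed seed as before. Restated with the template arguments written out (inferred, for illustration only):

if (seed == 0) {
  // use the global Generator's state for this device
  auto seed_offset = gen_cuda->IncrementOffset(1);
  auto func = GaussianGenerator<T>(mean, std, seed_offset.first,
                                   size * seed_offset.second);
  phi::IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
} else {
  auto func = GaussianGenerator<T>(mean, std, seed);
  phi::IndexKernel<T, GaussianGenerator<T>>(dev_cxt, tensor, func);
}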
- "But received: the shape of scale is [%s], the dimension " - "of scale is [%d]", - scale_dim, scale_dim.size())); - - bool check = !((!ctx->IsRuntime()) && (phi::product(scale_dim) <= 0)); - - if (check) { - PADDLE_ENFORCE_EQ(scale_dim[0], C, - platform::errors::InvalidArgument( - "ShapeError: the shape of scale must equal to [%d]" - "But received: the shape of scale is [%d]", - C, scale_dim[0])); - } - } - if (ctx->HasInput("Bias")) { - auto bias_dim = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - bias_dim.size(), 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of bias must equal to 1." - "But received: the shape of bias is [%s],the dimension " - "of bias is [%d]", - bias_dim, bias_dim.size())); - - bool check = !((!ctx->IsRuntime()) && (phi::product(bias_dim) <= 0)); - if (check) { - PADDLE_ENFORCE_EQ(bias_dim[0], C, - platform::errors::InvalidArgument( - "ShapeError: the shape of bias must equal to [%d]" - "But received: the shape of bias is [%d]", - C, bias_dim[0])); - } - } - - ctx->SetOutputDim("Y", x_dims); - ctx->SetOutputDim("SavedMean", {NxC}); - ctx->SetOutputDim("SavedVariance", {NxC}); - ctx->ShareLoD("X", "Y"); -} - framework::OpKernelType InstanceNormOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -170,127 +93,6 @@ NCHW `[batch, in_channels, in_height, in_width]` )DOC"); } -template -class InstanceNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - T epsilon = static_cast(ctx.Attr("epsilon")); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int NxC = N * C; - - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - auto &dev_ctx = ctx.template device_context(); - auto *place = dev_ctx.eigen_device(); - - Eigen::DSizes shape(NxC, sample_size); -// Once eigen on Windows is updated, the if branch can be removed. 
-#ifndef EIGEN_HAS_INDEX_LIST - Eigen::DSizes bcast(1, sample_size); - Eigen::DSizes C_shape(C, 1); - Eigen::DSizes NxC_shape(NxC, 1); - Eigen::DSizes rdims(1); -#else - Eigen::IndexList, int> bcast; - bcast.set(1, sample_size); - Eigen::IndexList> C_shape; - C_shape.set(0, C); - Eigen::IndexList> NxC_shape; - NxC_shape.set(0, NxC); - Eigen::IndexList> rdims; -#endif - - phi::funcs::SetConstant set_constant; - - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, saved_mean, static_cast(0)); - set_constant(dev_ctx, saved_variance, static_cast(0)); - - auto saved_mean_a = framework::EigenVector::Flatten(*saved_mean); - auto saved_mean_e = saved_mean_a.reshape(NxC_shape); - auto saved_variance_a = framework::EigenVector::Flatten(*saved_variance); - auto saved_variance_e = saved_variance_a.reshape(NxC_shape); - - auto x_e = framework::EigenVector::Flatten(*x); - auto x_arr = x_e.reshape(shape); - - saved_mean_e.device(*place) = x_arr.mean(rdims); - auto saved_variance_arr = - (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; - - saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor scale_data; - Tensor bias_data; - if (!scale) { - scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_data, static_cast(1)); - } - - if (!bias) { - bias_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &bias_data, static_cast(0)); - } - auto scale_e = scale - ? framework::EigenVector::Flatten(*scale) - : framework::EigenVector::Flatten( - const_cast(scale_data)); - auto scale_arr = scale_e.reshape(C_shape); - auto bias_e = bias ? framework::EigenVector::Flatten(*bias) - : framework::EigenVector::Flatten( - const_cast(bias_data)); - auto bias_arr = bias_e.reshape(C_shape); - - y->mutable_data(ctx.GetPlace()); - auto y_e = framework::EigenVector::Flatten(*y); - auto y_arr = y_e.reshape(shape); - - // (x - mean) * inv_std * scale + bias - Eigen::DSizes bcast_param(N, sample_size); - y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * - saved_variance_e.broadcast(bcast) * - scale_arr.broadcast(bcast_param) + - bias_arr.broadcast(bcast_param); - } -}; - -void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", - framework::GradVarName("Y"), "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", - "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", - "InstanceNormGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - framework::GradVarName("X"), "InstanceNormGrad"); - const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - if (ctx->HasOutput(framework::GradVarName("Scale"))) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - } - if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); - } -} - framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar(framework::GradVarName("Y")); @@ -312,148 +114,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( 
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class InstanceNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - - const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int NxC = N * C; - const int sample_size = x->numel() / N / C; - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - d_x->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - auto *place = dev_ctx.eigen_device(); - - Eigen::DSizes rshape(NxC, sample_size); - Eigen::DSizes param_shape(N, C); - Eigen::DSizes shape(NxC, sample_size); -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::DSizes rdims(0); - Eigen::DSizes mean_rdims(1); - Eigen::DSizes bcast(1, sample_size); - Eigen::DSizes C_shape(C, 1); - Eigen::DSizes NxC_shape(NxC, 1); -#else - Eigen::IndexList> rdims; - Eigen::IndexList> mean_rdims; - Eigen::IndexList, int> bcast; - bcast.set(1, sample_size); - Eigen::IndexList> C_shape; - C_shape.set(0, C); - Eigen::IndexList> NxC_shape; - NxC_shape.set(0, NxC); -#endif - - phi::funcs::SetConstant set_constant; - - Tensor scale_data; - if (!scale) { - scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_data, static_cast(1)); - } - - auto scale_e = scale - ? framework::EigenVector::Flatten(*scale) - : framework::EigenVector::Flatten( - const_cast(scale_data)); - auto mean_e = framework::EigenVector::Flatten(*saved_mean); - auto inv_var_e = framework::EigenVector::Flatten(*saved_inv_variance); - auto dy_e = framework::EigenVector::Flatten(*d_y); - auto x_e = framework::EigenVector::Flatten(*x); - - auto scale_arr = scale_e.reshape(C_shape); - auto mean_arr = mean_e.reshape(NxC_shape); - auto inv_var_arr = inv_var_e.reshape(NxC_shape); - auto dy_arr = dy_e.reshape(shape); - auto x_arr = x_e.reshape(shape); - - auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) * - inv_var_arr.eval().broadcast(bcast); - - // math: d_bias = np.sum(d_y, axis=(n,h,w)) - // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, d_scale, static_cast(0)); - set_constant(dev_ctx, d_bias, static_cast(0)); - - auto d_scale_e = framework::EigenVector::Flatten(*d_scale); - auto d_scale_data = d_scale_e.reshape(C_shape); - auto d_bias_e = framework::EigenVector::Flatten(*d_bias); - auto d_bias_data = d_bias_e.reshape(C_shape); - d_bias_data.device(*place) = - dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); - d_scale_data.device(*place) = - (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); - } - - auto dy_mean = - dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast); - - Eigen::DSizes bcast_param(N, sample_size); - set_constant(dev_ctx, d_x, static_cast(0)); - // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, - // axis=(h,w)) - // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - - // mean), - // axis=(h,w)) - auto dx_e = framework::EigenVector::Flatten(*d_x); - auto dx_arr = 
dx_e.reshape(shape); - dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * - inv_var_arr.broadcast(bcast) * - (dy_arr - dy_mean - - tmp * - (dy_arr * tmp) - .mean(mean_rdims) - .reshape(NxC_shape) - .eval() - .broadcast(bcast)); - } -}; - -void InstanceNormDoubleGradOp::InferShape( - framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "InstanceNormDoubleGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", - "InstanceNormDoubleGrad"); - - const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; - if (ctx->HasOutput("DX")) { - ctx->SetOutputDim("DX", x_dims); - } - if (ctx->HasOutput("DScale")) { - ctx->SetOutputDim("DScale", {C}); - } - if (ctx->HasOutput("DDY")) { - ctx->ShareDim("X", "DDY"); - } -} - framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar("DY"); @@ -475,213 +135,6 @@ framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class InstanceNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; - - const auto &x_dims = X->dims(); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - const int sample_size = X->numel() / N / C; - const int NxC = N * C; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - Tensor mean_tensor; - Tensor inv_var_tensor; - ConstEigenArrayMap x_arr(X->data(), sample_size, NxC); - ConstEigenVectorArrayMap mean_arr(mean_data, NxC); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); - - Tensor mean_tile; - mean_tile.Resize({sample_size, NxC}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - sample_size, NxC); - - Tensor inv_var_tile; - inv_var_tile.Resize({sample_size, NxC}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - - mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); - inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? 
Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({sample_size, NxC}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - sample_size, NxC); - scale_tile_data = scale_arr.transpose().replicate(sample_size, N); - - ConstEigenArrayMap dy_arr(dY->data(), sample_size, NxC); - ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); - - // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, - // axis=(h,w)) * - // np.sum(dy, axis=(h,w)) - - // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), - // axis=(h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * - // np.sum(dy, - // axis=(h,w)) * (x - mean) * - // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var * - // np.mean(dy, axis=(h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(h,w))) - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({sample_size, NxC}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), sample_size, - NxC); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); - EigenArrayMap dx_arr(dX->mutable_data(ctx.GetPlace()), sample_size, - NxC); - - if (ddX) { - dx_arr += - x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / - sample_size * - (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - - (dy_arr * ddx_arr).colwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size); - - dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size * inv_var_tile_data * inv_var_tile_data * - (dy_arr.colwise().sum() / sample_size - dy_arr); - - dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size * inv_var_tile_data * inv_var_tile_data * - (ddx_arr.colwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - - Tensor ddscale_tile; - ddscale_tile.Resize({sample_size, NxC}); - ddscale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); - - dx_arr += (dy_arr * inv_var_tile_data - - dy_arr.colwise().sum() / sample_size * inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size) * - ddscale_tile_data; - } - } - if (dScale) { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx - dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - if (ddX) { - Tensor first_grad; - first_grad.Resize({sample_size, NxC}); - first_grad.mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, &first_grad, static_cast(0)); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), sample_size, NxC); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .colwise() - .sum() - .replicate(sample_size, 1) / - sample_size); - first_grad_arr = first_grad_arr * ddx_arr; - for (int nc = 0; nc < NxC; ++nc) { - int c = nc % C; - dscale_arr(c) += first_grad_arr.colwise().sum()(nc); - } - } - } - if (ddY) { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(h,w))) - ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); - EigenArrayMap ddy_arr(ddY->mutable_data(ctx.GetPlace()), - sample_size, NxC); - if (ddX) { - ddy_arr += scale_tile_data * inv_var_tile_data * - (ddx_arr - ddx_arr.colwise().sum() / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size); - } - if (ddScale && ddBias) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({sample_size, NxC}); - ddscale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); - - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({sample_size, NxC}); - ddbias_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - ddy_arr += ddbias_tile_data; - } - } - } -}; - 
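The Eigen CPU kernels removed above computed the statistics per (n, c) pair over the spatial elements; that computation now lives in the phi instance_norm kernels, with shapes produced by the infer-shape functors registered below. For reference, a plain C++ restatement of the forward pass (a sketch, not the phi implementation; scale and bias default to 1 and 0 when those inputs are absent):

#include <cmath>

// y[n, c, :] = (x[n, c, :] - mean(n, c)) / sqrt(var(n, c) + eps) * scale[c] + bias[c]
void InstanceNormForwardRef(const float* x, const float* scale,
                            const float* bias, float* y, int N, int C,
                            int sample_size, float eps) {
  for (int nc = 0; nc < N * C; ++nc) {
    const float* xi = x + nc * sample_size;
    float mean = 0.f, var = 0.f;
    for (int i = 0; i < sample_size; ++i) mean += xi[i];
    mean /= sample_size;
    for (int i = 0; i < sample_size; ++i) var += (xi[i] - mean) * (xi[i] - mean);
    var /= sample_size;
    const float inv_std = 1.f / std::sqrt(var + eps);
    const int c = nc % C;
    for (int i = 0; i < sample_size; ++i)
      y[nc * sample_size + i] = (xi[i] - mean) * inv_std * scale[c] + bias[c];
  }
}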
DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); @@ -689,30 +142,26 @@ DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(instance_norm, InstanceNormInferShapeFunctor, + PD_INFER_META(phi::InstanceNormInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(instance_norm_grad, + InstanceNormGradInferShapeFunctor, + PD_INFER_META(phi::InstanceNormGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( + instance_norm_grad_grad, InstanceNormDoubleGradInferShapeFunctor, + PD_INFER_META(phi::InstanceNormDoubleGradInferMeta)); REGISTER_OPERATOR(instance_norm, ops::InstanceNormOp, ops::InstanceNormOpMaker, ops::InstanceNormOpInferVarType, ops::InstanceNormGradMaker, - ops::InstanceNormGradMaker); + ops::InstanceNormGradMaker, + InstanceNormInferShapeFunctor); REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp, ops::InstanceNormDoubleGradMaker, - ops::InstanceNormDoubleGradMaker); + ops::InstanceNormDoubleGradMaker, + InstanceNormGradInferShapeFunctor); REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp, - ops::InstanceNormDoubleGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - instance_norm, - ops::InstanceNormKernel, - ops::InstanceNormKernel); -REGISTER_OP_CPU_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel, - ops::InstanceNormGradKernel); -REGISTER_OP_CPU_KERNEL( - instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel, - ops::InstanceNormDoubleGradKernel); + ops::InstanceNormDoubleGradOpInplaceInferer, + InstanceNormDoubleGradInferShapeFunctor); REGISTER_OP_VERSION(instance_norm) .AddCheckpoint( diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu deleted file mode 100644 index e51cd9835318a..0000000000000 --- a/paddle/fluid/operators/instance_norm_op.cu +++ /dev/null @@ -1,818 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/operators/instance_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; -template -using CudnnDataType = platform::CudnnDataType; -template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; - -template -static __global__ void repeat_param(const T *input, T *output, - const int repeat_num, const int C) { - CUDA_KERNEL_LOOP(i, repeat_num * C) { - int index = i % C; - output[i] = input[index]; - } -} - -template -static __global__ void add_param(const T *input, T *output, - const int repeat_num, const int C) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ou_storage; - for (int i = blockIdx.x; i < C; i += gridDim.x) { - T ou = static_cast(0); - for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { - const int index = j * C + i; - ou += static_cast(input[index]); - } - ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); - if (threadIdx.x == 0) { - output[i] = ou; - } - __syncthreads(); - - if (AVG) { - output[i] /= repeat_num; - } - } -} - -template -class InstanceNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must be CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - - auto *x = ctx.Input("X"); - auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of X's dimensions must greater than " - "or equal to 2. But received: " - "the size of X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), 5, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of X's dimensions must smaller than" - "or equal to 5. But received: " - "the size of X's dimensions is [%d]", - x_dims.size())); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - Tensor x_tmp; - x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - dims = {1, NxC, H, W, D}; - strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; - - auto &dev_ctx = ctx.template device_context(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - scale_tmp.mutable_data(ctx.GetPlace()); - Tensor bias_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - bias_tmp.mutable_data(ctx.GetPlace()); - - const int n = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min((NxC + block - 1) / block, max_blocks); - - phi::funcs::SetConstant set_constant; - if (scale) { - repeat_param<<>>( - scale->data(), scale_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); - } - if (bias) { - repeat_param<<>>( - bias->data(), bias_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &bias_tmp, static_cast(0)); - } - - auto handle = dev_ctx.cudnn_handle(); - - phi::funcs::SetConstant> - functor; - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, static_cast(x_tmp.template data()), - data_desc_, - static_cast(y->template mutable_data(ctx.GetPlace())), - in_param_desc_, - const_cast(static_cast( - scale_tmp.template data>())), - const_cast(static_cast( - bias_tmp.template data>())), - 0, nullptr, nullptr, epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast( - saved_variance->template mutable_data>( - ctx.GetPlace())))); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x_tmp.template data(), - data_desc_, y->template mutable_data(ctx.GetPlace()), - in_param_desc_, scale_tmp.template data>(), - bias_tmp.template data>(), 0, nullptr, - nullptr, epsilon, - saved_mean->template 
mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); -#endif - } -}; - -template -static __global__ void GradComputeDX(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int sample_size, - T *dx) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - BatchNormParamType mean_val = mean[ncid]; - BatchNormParamType inv_var_val = variance[ncid]; - - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - - for (int i = beg_idx; i < end_idx; i += BlockDim) { - BatchNormParamType dy_i = static_cast>(dy[i]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[i]) - mean_val); - } - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = - BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] = - (static_cast>(dy[i]) - - dy_sum_val / static_cast>(sample_size) - - (static_cast>(x[i]) - mean_val) * - dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * - scale[c] * inv_var_val; - } -} - -template -class InstanceNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const auto *scale = ctx.Input("Scale"); - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - - Tensor x_tmp, d_y_tmp; - x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); - d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D}); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - } - if (scale) { - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of scale's dimensions must be equal to 1. But " - "received: the size of scale's dimensions" - "is [%d]", - scale->dims().size())); - PADDLE_ENFORCE_EQ(scale->dims()[0], C, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the first dimension of scale must be equal to " - "Channels([%d]). 
But received: " - "the first dimension of scale is [%d]," - "the dimensions of scale is [%s], ", - C, scale->dims()[0], scale->dims())); - } - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; - - const int n = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(NxC, max_blocks); - const int grid1 = (C + block - 1) / block; - - Tensor scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - scale_tmp.mutable_data(ctx.GetPlace()); - Tensor d_scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - Tensor d_bias_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - if (scale) { - repeat_param<<>>( - scale->data(), scale_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); - } - - std::vector dims; - std::vector strides; - dims = {1, NxC, H, W, D}; - strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; - - if ((H * W * D) == 1) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - if (d_scale && d_bias) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x_tmp.template data(), - data_desc_, d_y_tmp.template data(), data_desc_, - d_x->template mutable_data(ctx.GetPlace()), in_param_desc_, - scale_tmp.template data>(), - d_scale_tmp.template mutable_data>( - ctx.GetPlace()), - d_bias_tmp.template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x_tmp.template data(), data_desc_, d_y_tmp.template data(), - data_desc_, d_x->template mutable_data(ctx.GetPlace()), - in_param_desc_, scale_tmp.template data>(), - d_scale_tmp.template mutable_data>( - ctx.GetPlace()), - d_bias_tmp.template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } else { - if (d_x) { - GradComputeDX<<>>( - d_y->data(), scale_tmp.data>(), - saved_mean_data, x->data(), saved_var_data, C, H * W * D, - d_x->data()); - } - } - - if (d_scale && d_bias) { - add_param<<>>( - d_scale_tmp.data(), d_scale->data(), N, C); - add_param<<>>( - d_bias_tmp.data(), d_bias->data(), N, C); - } - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); -#endif - } -}; - -static __device__ __forceinline__ float real_sqrt(float x) { - return 1. / sqrtf(x); -} -static __device__ __forceinline__ double real_sqrt(double x) { - return 1. 
/ sqrt(x); -} - -template -__global__ void DoubleGradComputeDX(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const T *scale, - const T *ddscale, int C, int sample_size, - const double epsilon, T *dx) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage ddx_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; - __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T dy_sum_val; - __shared__ T ddx_sum_val; - __shared__ T dy_mul_ddx_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T ddx_sum = 0; - T dy_mul_ddx_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - T dy_i = dy[i]; - T tmp = x[i] - mean_val; - - dy_sum += dy_i; - ddx_sum += ddx_i; - dy_mul_ddx_sum += (ddx_i * dy_i); - - dy_mul_x_sub_mean_sum += (dy_i * tmp); - ddx_mul_x_sub_mean_sum += (ddx_i * tmp); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); - dy_mul_ddx_sum = - BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); - dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) - .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); - ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) - .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - ddx_sum_val = ddx_sum; - dy_mul_ddx_sum_val = dy_mul_ddx_sum; - dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; - ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; - } - __syncthreads(); - - if (ddx != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += - ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * - (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + - 3. 
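Each CUDA block in DoubleGradComputeDX handles one (n, c) instance and uses cub::BlockReduce to accumulate five sums over its sample_size elements before applying the dx formula. A scalar sketch of those accumulations, useful as a reference when checking the kernel:

#include <cstddef>

struct InstanceSums {
  double dy_sum = 0, ddx_sum = 0, dy_mul_ddx_sum = 0;
  double dy_mul_x_sub_mean_sum = 0, ddx_mul_x_sub_mean_sum = 0;
};

// Sums over one (n, c) instance; x, dy, ddx each point at sample_size values.
InstanceSums ComputeInstanceSums(const float* x, const float* dy,
                                 const float* ddx, float mean,
                                 std::size_t sample_size) {
  InstanceSums s;
  for (std::size_t i = 0; i < sample_size; ++i) {
    const double x_sub_mean = x[i] - mean;
    s.dy_sum += dy[i];
    s.ddx_sum += ddx[i];
    s.dy_mul_ddx_sum += static_cast<double>(dy[i]) * ddx[i];
    s.dy_mul_x_sub_mean_sum += dy[i] * x_sub_mean;
    s.ddx_mul_x_sub_mean_sum += ddx[i] * x_sub_mean;
  }
  return s;
}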
* dy_mul_x_sub_mean_sum_val * var_val * - ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + - ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (dy_sum_val / sample_size - dy[i]) + - dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (ddx_sum_val / sample_size - ddx[i])) * - scale[c]; - } - } - __syncthreads(); - if (ddscale != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - - (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * - var_val / sample_size) * - ddscale[c]; - } - } -} - -template -__global__ void DoubleGradComputeDDY(const T *x, const T *mean, - const T *variance, const T *ddscale, - const T *ddbias, const T *ddx, - const T *scale, int C, int sample_size, - const double epsilon, T *ddy) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ddx_storage; - __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T ddx_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T ddx_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - ddx_sum += ddx_i; - ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); - } - ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); - ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) - .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - ddx_sum_val = ddx_sum; - ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; - } - __syncthreads(); - - if (ddx != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += scale[c] * var_val * - (ddx[i] - ddx_sum_val / sample_size - - (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * - var_val / sample_size); - } - } - __syncthreads(); - if (ddscale != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; - } - } - __syncthreads(); - if (ddbias != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += ddbias[c]; - } - } -} - -template -__global__ void DoubleGradComputeDScale(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, int C, int sample_size, - const double epsilon, T *dscale) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; - __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; - __shared__ T dy_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T dy_i = dy[i]; - dy_sum += dy_i; - dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); - } - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) - .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; 
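A scalar sketch of the ddy accumulation in DoubleGradComputeDDY for a single (n, c) instance. The saved `variance` value is multiplied rather than divided in the kernel, so it is assumed here to hold the inverse standard deviation 1/sqrt(var + eps); the sketch also assumes ddx is present and takes zeros for a missing ddscale or ddbias:

#include <cstddef>

// ddy[i] accumulation for one (n, c) instance, mirroring DoubleGradComputeDDY.
// inv_std is the saved inverse standard deviation; ddx_sum and
// ddx_mul_x_sub_mean_sum are per-instance sums over sample_size elements.
void AccumulateDDY(const float* x, const float* ddx, float mean, float inv_std,
                   float scale_c, float ddscale_c, float ddbias_c,
                   double ddx_sum, double ddx_mul_x_sub_mean_sum,
                   std::size_t sample_size, float* ddy) {
  const double m = static_cast<double>(sample_size);
  for (std::size_t i = 0; i < sample_size; ++i) {
    const double x_sub_mean = x[i] - mean;
    ddy[i] += scale_c * inv_std *
              (ddx[i] - ddx_sum / m -
               x_sub_mean * inv_std * ddx_mul_x_sub_mean_sum * inv_std / m);
    ddy[i] += x_sub_mean * inv_std * ddscale_c;  // ddscale contribution
    ddy[i] += ddbias_c;                          // ddbias contribution
  }
}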
- } - __syncthreads(); - - if (ddx != nullptr) { - T dscale_tmp = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dscale_tmp += - ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - - dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * - var_val * var_val / sample_size); - } - dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); - - if (threadIdx.x == 0) { - dscale[ncid] += dscale_tmp; - } - __syncthreads(); - } -} - -template -class InstanceNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - const T *x_data = X->data(); - const T *dy_data = dY->data(); - const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); - - const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); - const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data()); - - const T *mean_data = Saved_mean->data(); - const T *variance_data = Saved_variance->data(); - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - - auto &x_dims = X->dims(); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - const int n = X->numel(); - int sample_size = n / N / C; - - Tensor scale_tmp; - if (!Scale) { - scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_zero(dev_ctx, &scale_tmp, static_cast(1)); - } - const T *scale_data = Scale ? 
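InstanceNormDoubleGradKernel derives N, C and sample_size from the NCHW(D) input and falls back to an all-ones scale when the Scale input is absent. A minimal sketch of that bookkeeping (ExtractNCWHD itself is defined elsewhere; the shape layout below is an assumption consistent with how it is used here):

#include <cstdint>
#include <vector>

struct InstanceNormDims {
  int64_t N, C, sample_size;  // sample_size = H * W (* D)
};

// Assumes an NCHW / NCDHW layout: dims[0] = N, dims[1] = C, the rest spatial.
InstanceNormDims ExtractDims(const std::vector<int64_t>& dims) {
  InstanceNormDims d{dims[0], dims[1], 1};
  for (std::size_t i = 2; i < dims.size(); ++i) d.sample_size *= dims[i];
  return d;
}

// With no Scale input, the kernel behaves as if every channel's scale were 1.
std::vector<float> DefaultScale(int64_t C) {
  return std::vector<float>(static_cast<std::size_t>(C), 1.f);
}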
Scale->data() : scale_tmp.data(); - - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = NxC; - const int grid1 = (C + block - 1) / block; - - if (dX) { - T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dX, static_cast(0)); - DoubleGradComputeDX<<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, - ddscale_data, C, sample_size, epsilon, dx_data); - } - if (dScale) { - Tensor dscale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - set_zero(dev_ctx, &dscale_tmp, static_cast(0)); - T *dscale_tmp_data = dscale_tmp.mutable_data(ctx.GetPlace()); - - T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dScale, static_cast(0)); - DoubleGradComputeDScale<<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, C, sample_size, - epsilon, dscale_tmp_data); - add_param<<>>( - dscale_tmp.data(), dScale->data(), N, C); - } - if (ddY) { - T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, ddY, static_cast(0)); - DoubleGradComputeDDY<<>>( - x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, - scale_data, C, sample_size, epsilon, ddy_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - instance_norm, ops::InstanceNormKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel); -REGISTER_OP_CUDA_KERNEL(instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel< - paddle::platform::CUDADeviceContext, float>); -#else -REGISTER_OP_CUDA_KERNEL( - instance_norm, ops::InstanceNormKernel, - ops::InstanceNormKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel, - ops::InstanceNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel, - ops::InstanceNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 493f54ab3baa6..265e4acef0d7a 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -16,9 +16,7 @@ limitations under the License. 
*/ #include #include #include -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/norm_utils.h" namespace paddle { namespace operators { @@ -27,22 +25,9 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - class InstanceNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +37,6 @@ class InstanceNormOp : public framework::OperatorWithKernel { class InstanceNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -62,7 +46,6 @@ class InstanceNormGradOp : public framework::OperatorWithKernel { class InstanceNormDoubleGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -130,23 +113,5 @@ class InstanceNormOpInferVarType } }; -template -class InstanceNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class InstanceNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class InstanceNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc index 8c1fd34ae87d2..f5b817a0e11fa 100644 --- a/paddle/fluid/operators/inverse_op.cc +++ b/paddle/fluid/operators/inverse_op.cc @@ -33,21 +33,21 @@ class InverseOp : public framework::OperatorWithKernel { input_rank, 2, platform::errors::InvalidArgument( "The dimension of Input(Input) is expected to be no less than 2. " - "But recieved: Input(Input)'s dimension = %d, shape = [%s].", + "But received: Input(Input)'s dimension = %d, shape = [%s].", input_rank, input_dims)); for (int64_t i = 0; i < input_rank; ++i) { PADDLE_ENFORCE_EQ( (input_dims[i] == -1) || (input_dims[i] > 0), true, platform::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. Input's shape is [%s].", + "positive number, but received %d. Input's shape is [%s].", input_dims[i], input_dims)); } if (input_dims[input_rank - 2] > 0 && input_dims[input_rank - 1] > 0) { PADDLE_ENFORCE_EQ(input_dims[input_rank - 2], input_dims[input_rank - 1], platform::errors::InvalidArgument( "The last two dimensions are expected to be equal. 
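Beyond the "recieved" -> "received" message fixes, the surrounding InverseOp::InferShape enforces that the input is a batch of square matrices. A standalone sketch of the same checks, with exceptions standing in for PADDLE_ENFORCE:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Mirrors the InverseOp shape checks: rank >= 2, every dim either -1 (unknown)
// or positive, and the trailing two dims equal (square matrices).
void CheckInverseInputShape(const std::vector<int64_t>& dims) {
  const std::size_t rank = dims.size();
  if (rank < 2)
    throw std::invalid_argument("Input dimension must be no less than 2.");
  for (int64_t d : dims)
    if (d != -1 && d <= 0)
      throw std::invalid_argument("Each dimension must be -1 or positive.");
  if (dims[rank - 2] > 0 && dims[rank - 1] > 0 &&
      dims[rank - 2] != dims[rank - 1])
    throw std::invalid_argument("The last two dimensions must be equal.");
}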
" - "But recieved: %d and %d; " + "But received: %d and %d; " "Input(Input)'s shape = [%s].", input_dims[input_rank - 2], input_dims[input_rank - 1], input_dims)); diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 79fcb780feb93..ab8829b7baf5f 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -33,5 +33,6 @@ USE_JITKERNEL_GEN(kHMax) USE_JITKERNEL_GEN(kHSum) USE_JITKERNEL_GEN(kEmbSeqPool) USE_JITKERNEL_GEN(kAdam) +USE_JITKERNEL_GEN(kAdamW) USE_JITKERNEL_GEN(kSgd) USE_JITKERNEL_GEN(kVBroadcast) diff --git a/paddle/fluid/operators/jit/gen/adam.cc b/paddle/fluid/operators/jit/gen/adam.cc index 7e8cb7f59eed6..38ef6772f01ad 100644 --- a/paddle/fluid/operators/jit/gen/adam.cc +++ b/paddle/fluid/operators/jit/gen/adam.cc @@ -80,7 +80,7 @@ void AdamJitCode::mainCode() { // beta2 * mom2 + (1 - beta2) * g * g vmulps(ymm7 | k1, ymm7, ymm7); vmulps(ymm7 | k1, ymm_one_sub_beta2, ymm7); - vfmadd231ps(ymm7 | k1, ymm1, ptr[reg_mom2_ptr + reg_offset]); + vfmadd231ps(ymm7 | k1, ymm_beta2, ptr[reg_mom2_ptr + reg_offset]); // store mom1 and mom2 vmovups(ptr[reg_mom1_out_ptr + reg_offset] | k1, ymm8); @@ -88,11 +88,11 @@ void AdamJitCode::mainCode() { // sqrt(mom2) + eps vsqrtps(ymm7 | k1, ymm7); - vaddps(ymm7 | k1, ymm7, ymm3); + vaddps(ymm7 | k1, ymm7, ymm_eps); // p + (-lr) * (mom1 / sqrt(mom2) + eps) vdivps(ymm7 | k1, ymm8, ymm7); - vfmadd213ps(ymm7 | k1, ymm2, ptr[reg_param_ptr + reg_offset]); + vfmadd213ps(ymm7 | k1, ymm_lr, ptr[reg_param_ptr + reg_offset]); // store p vmovups(ptr[reg_param_out_ptr + reg_offset] | k1, ymm7); diff --git a/paddle/fluid/operators/jit/gen/adamw.cc b/paddle/fluid/operators/jit/gen/adamw.cc new file mode 100644 index 0000000000000..b470143fb7d8d --- /dev/null +++ b/paddle/fluid/operators/jit/gen/adamw.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#include "paddle/fluid/operators/jit/gen/adamw.h" + +#include // offsetof + +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void AdamWJitCode::loadArgs() { + static constexpr int32_t one_as_float = 0x3f800000; + static constexpr int32_t mask_all_ones = 0xFFFFFFFF; + static constexpr int64_t mask_8_divisible = 0xFFFFFFFFFFFFFFF8; + static constexpr int64_t abi_pushes_offset = num_g_abi_regs * 8; + + mov(reg_mom2_out_ptr, ptr[rsp + (abi_pushes_offset + 8)]); + mov(reg_param_out_ptr, ptr[rsp + (abi_pushes_offset + 16)]); + mov(eax, one_as_float); + movd(xmm_one, eax); + + vbroadcastss(ymm_one, xmm_one); // 1 + vbroadcastss(ymm_beta1, xmm_beta1); // beta1 + vbroadcastss(ymm_beta2, xmm_beta2); // beta2 + vbroadcastss(ymm_lr, xmm_lr); // -lr + vbroadcastss(ymm_eps, xmm_eps); // eps + vbroadcastss(ymm_old_lr, xmm_old_lr); // old lr + vbroadcastss(ymm_lr_ratio, xmm_lr_ratio); // lr_ratio + vbroadcastss(ymm_coeff, xmm_coeff); // coeff + vsubps(ymm_one_sub_beta1, ymm_one, ymm_beta1); // 1 - beta1 + vsubps(ymm_one_sub_beta2, ymm_one, ymm_beta2); // 1 - beta2 + + mov(reg_numel_without_tail, reg_numel); + and_(reg_numel_without_tail, mask_8_divisible); // make it 8-divisible + + shl(reg_numel_without_tail, 2); // * 4 to treat it as float offset + shl(reg_numel, 2); + + mov(eax, mask_all_ones); + kmovw(k1, eax); + + xor_(reg_offset, reg_offset); +} + +void AdamWJitCode::setTailOpmask() { + mov(r13, rcx); + + mov(rcx, reg_numel); + sub(rcx, reg_offset); // get tail numel as float size + shr(rcx, 2); // as elements + mov(r14, 1); + shl(r14, cl); // 2 ^ elements + dec(r14); // 2 ^ elements - 1, so numel first bits are set to 1 + kmovw(k1, r14d); + + mov(rcx, r13); +} + +void AdamWJitCode::mainCode() { + // load p + vmovups(ymm10 | k1, ptr[reg_param_ptr + reg_offset]); + + // ((lr * lr_ratio) * coeff) + vmulps(ymm11 | k1, ymm_old_lr, ymm_lr_ratio); + vmulps(ymm11 | k1, ymm11, ymm_coeff); + + // - (lr * lr_ratio) * coeff) * p + p + // p is stored in ymm11 + vfnmadd132ps(ymm11 | k1, ymm10, ymm10); + + // load grad + vmovups(ymm10 | k1, ptr[reg_grad_ptr + reg_offset]); + + // beta1 * mom1 + (1 - beta1) * g + vmulps(ymm12 | k1, ymm_one_sub_beta1, ymm10); + vfmadd231ps(ymm12 | k1, ymm_beta1, ptr[reg_mom1_ptr + reg_offset]); + + // beta2 * mom2 + (1 - beta2) * g * g + vmulps(ymm10 | k1, ymm10, ymm10); + vmulps(ymm10 | k1, ymm_one_sub_beta2, ymm10); + vfmadd231ps(ymm10 | k1, ymm_beta2, ptr[reg_mom2_ptr + reg_offset]); + + // store mom1 and mom2 + vmovups(ptr[reg_mom1_out_ptr + reg_offset] | k1, ymm12); + vmovups(ptr[reg_mom2_out_ptr + reg_offset] | k1, ymm10); + + // sqrt(mom2) + eps + vsqrtps(ymm10 | k1, ymm10); + vaddps(ymm10 | k1, ymm10, ymm_eps); + + // p + (-lr) * (mom1 / sqrt(mom2) + eps) + vdivps(ymm10 | k1, ymm12, ymm10); + vfmadd213ps(ymm10 | k1, ymm_lr, ymm11); + + // store p + vmovups(ptr[reg_param_out_ptr + reg_offset] | k1, ymm10); +} + +void AdamWJitCode::genCode() { + static constexpr int64_t main_loop_elems_size = + 8 * sizeof(float); // 8 floats in YMM + static constexpr int64_t offset_increment = main_loop_elems_size; + preCode(); + loadArgs(); + + cmp(reg_numel, main_loop_elems_size); + jl("process_tail"); + + L("main_loop"); + { + mainCode(); + add(reg_offset, offset_increment); + cmp(reg_numel_without_tail, reg_offset); + jg("main_loop"); + } + + cmp(reg_numel, reg_offset); + je("end", T_NEAR); // size between jmp and label is larger than 127 byte, + // T_NEAR 
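setTailOpmask builds an AVX-512 write mask whose low bits cover the final partial vector, so a tail of fewer than 8 floats reuses the same mainCode path. A plain-C++ sketch of the mask arithmetic and the 8-divisible split computed in loadArgs:

#include <cstdint>
#include <cstdio>

// k-mask with the low `tail_elems` bits set: 2^tail_elems - 1.
uint16_t TailOpmask(int64_t tail_elems) {
  return static_cast<uint16_t>((1u << tail_elems) - 1u);
}

int main() {
  const int64_t numel = 123;                               // as in the unit test
  const int64_t numel_without_tail = numel & ~int64_t{7};  // 8-divisible part
  const int64_t tail = numel - numel_without_tail;         // 3 leftover floats
  std::printf("main loop elements: %lld, tail mask: 0x%02x\n",
              static_cast<long long>(numel_without_tail),
              static_cast<unsigned>(TailOpmask(tail)));
  return 0;
}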
allow long jump + + L("process_tail"); + { + setTailOpmask(); + mainCode(); + } + + L("end"); + postCode(); +} + +class AdamWCreator : public JitCodeCreator { + public: + bool CanBeUsed(const int& attr) const override { + return platform::MayIUse(platform::avx512f); + } + size_t CodeSize(const int& attr) const override { return 96 + 32 * 8; } + std::unique_ptr CreateJitCode(const int& attr) const override { + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kAdamW, gen::AdamWCreator); diff --git a/paddle/fluid/operators/jit/gen/adamw.h b/paddle/fluid/operators/jit/gen/adamw.h new file mode 100644 index 0000000000000..759dcd62c8256 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/adamw.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include + +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class AdamWJitCode : public JitCode { + public: + explicit AdamWJitCode(const int& attr, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr) { + this->genCode(); + } + + DECLARE_JIT_CODE(AdamJitCode); + void genCode() override; + void loadArgs(); + void setTailOpmask(); + void mainCode(); + + private: + reg64_t reg_numel{abi_param1}; + reg64_t reg_grad_ptr{abi_param2}; + reg64_t reg_mom1_ptr{abi_param3}; + reg64_t reg_mom2_ptr{abi_param4}; + reg64_t reg_param_ptr{abi_param5}; + reg64_t reg_mom1_out_ptr{abi_param6}; + + xmm_t xmm_beta1 = xmm_t(0); + xmm_t xmm_beta2 = xmm_t(1); + xmm_t xmm_lr = xmm_t(2); + xmm_t xmm_eps = xmm_t(3); + xmm_t xmm_old_lr = xmm_t(4); + xmm_t xmm_lr_ratio = xmm_t(5); + xmm_t xmm_coeff = xmm_t(6); + xmm_t xmm_one_sub_beta1 = xmm_t(7); + xmm_t xmm_one_sub_beta2 = xmm_t(8); + xmm_t xmm_one = xmm_t(9); + + ymm_t ymm_beta1 = ymm_t(0); + ymm_t ymm_beta2 = ymm_t(1); + ymm_t ymm_lr = ymm_t(2); + ymm_t ymm_eps = ymm_t(3); + ymm_t ymm_old_lr = ymm_t(4); + ymm_t ymm_lr_ratio = ymm_t(5); + ymm_t ymm_coeff = ymm_t(6); + ymm_t ymm_one_sub_beta1 = ymm_t(7); + ymm_t ymm_one_sub_beta2 = ymm_t(8); + ymm_t ymm_one = ymm_t(9); + + reg64_t reg_mom2_out_ptr{r10}; + reg64_t reg_param_out_ptr{r11}; + reg64_t reg_numel_without_tail{r12}; + reg64_t reg_offset{rax}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 4bdb65030590f..46da6fba2e98a 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -59,6 +59,7 @@ const char* to_string(KernelType kt) { ONE_CASE(kMatMul); ONE_CASE(kHMax); ONE_CASE(kAdam); + ONE_CASE(kAdamW); ONE_CASE(kHSum); 
ONE_CASE(kStrideASum); ONE_CASE(kSoftmax); diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 40ea04d3c2791..9a48d9c3c8d6c 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -25,6 +25,7 @@ typedef enum { kNone = 0, // sort by alphabet kAdam = 1, + kAdamW, kCRFDecoding, kEmbSeqPool, kGRUH1, @@ -285,6 +286,15 @@ struct AdamTuple { const T*, T*, T*, T*); }; +template +struct AdamWTuple { + static constexpr KernelType kernel_type = kAdamW; + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(T, T, T, T, T, T, T, int64_t, const T*, const T*, + const T*, const T*, T*, T*, T*); +}; + typedef struct matmul_attr_s { int m, n, k; void* packed_weight{nullptr}; diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index e4e3263e01eba..a1ee4508f7241 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -37,5 +37,6 @@ USE_JITKERNEL_REFER(kStrideASum) USE_JITKERNEL_REFER(kSoftmax) USE_JITKERNEL_REFER(kEmbSeqPool) USE_JITKERNEL_REFER(kAdam) +USE_JITKERNEL_REFER(kAdamW) USE_JITKERNEL_REFER(kSgd) USE_JITKERNEL_REFER(kVBroadcast) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 8669bfe37232b..779d4c172b83c 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -56,6 +56,7 @@ REGISTER_REFER_KERNEL(StrideASum); REGISTER_REFER_KERNEL(Softmax); REGISTER_REFER_KERNEL(EmbSeqPool); REGISTER_REFER_KERNEL(Adam); +REGISTER_REFER_KERNEL(AdamW); REGISTER_REFER_KERNEL(Sgd); REGISTER_REFER_KERNEL(VBroadcast); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 3545b35a703f8..79b2e174efc16 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -565,6 +565,21 @@ void Adam(T beta1, T beta2, T lr, T eps, int64_t numel, const T* grad_ptr, } } +template +void AdamW(T beta1, T beta2, T lr, T eps, T old_lr, T lr_ratio, T coeff, + int64_t numel, const T* grad_ptr, const T* mom1_ptr, + const T* mom2_ptr, const T* param_ptr, T* mom1_out_ptr, + T* mom2_out_ptr, T* param_out_ptr) { + for (int i = 0; i < numel; ++i) { + auto param_tmp = param_ptr[i] - old_lr * lr_ratio * coeff * param_ptr[i]; + mom1_out_ptr[i] = beta1 * mom1_ptr[i] + (1 - beta1) * grad_ptr[i]; + mom2_out_ptr[i] = + beta2 * mom2_ptr[i] + (1 - beta2) * grad_ptr[i] * grad_ptr[i]; + param_out_ptr[i] = + param_tmp + lr * (mom1_out_ptr[i] / (sqrt(mom2_out_ptr[i]) + eps)); + } +} + #define DECLARE_REFER_KERNEL(name) \ template \ class name##Kernel : public ReferKernel> { \ @@ -617,6 +632,7 @@ DECLARE_REFER_KERNEL(MatMul); DECLARE_REFER_KERNEL(Softmax); DECLARE_REFER_KERNEL(EmbSeqPool); DECLARE_REFER_KERNEL(Adam); +DECLARE_REFER_KERNEL(AdamW); DECLARE_REFER_KERNEL(Sgd); DECLARE_REFER_KERNEL(VBroadcast); diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 675db4a72bda3..74f2d62c64da9 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -907,6 +907,73 @@ void TestKernelAdam() { param, mom1_out, mom2_out, param_out); } +template +void TestKernelAdamW() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); + const T old_lr = 0.1; + const T beta1 = 0.99; + const T beta2 = 0.95; + const T 
beta1_pow = beta1 * beta1; + const T beta2_pow = beta2 * beta2; + + const T epsilon = 0.000001; + const int64_t numel = 123; + const T lr_ratio = 0.2; + const T coeff = 0.3; + + T learning_rate = old_lr * (sqrt(1 - beta2_pow) / (1 - beta1_pow)); + T eps = epsilon * sqrt(1 - beta2_pow); + + std::vector param(numel); + std::vector grad(numel); + std::vector mom1(numel); + std::vector mom2(numel); + + std::vector param_out(param.size()); + std::vector mom1_out(mom1.size()); + std::vector mom2_out(mom2.size()); + + RandomVec(numel, param.data(), 0.5f); + RandomVec(numel, grad.data(), 0.5f); + RandomVec(numel, mom1.data(), 0.5f); + RandomVec(numel, mom2.data(), 0.5f); + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + ref(beta1, beta2, -learning_rate, eps, old_lr, lr_ratio, coeff, numel, + grad.data(), mom1.data(), mom2.data(), param.data(), mom1_out.data(), + mom2_out.data(), param_out.data()); + + auto verifier = []( + const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps, + T old_lr, T lr_ratio, T coeff, int64_t numel, const std::vector& grad, + const std::vector& mom1, const std::vector& mom2, + const std::vector& param, const std::vector& ref_mom1_out, + const std::vector& ref_mom2_out, const std::vector& ref_param_out) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(param.size(), static_cast(numel)); + EXPECT_EQ(grad.size(), static_cast(numel)); + EXPECT_EQ(mom1.size(), static_cast(numel)); + EXPECT_EQ(mom2.size(), static_cast(numel)); + + std::vector jit_mom1_out(ref_mom1_out.size()); + std::vector jit_mom2_out(ref_mom2_out.size()); + std::vector jit_param_out(ref_param_out.size()); + + tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(), + mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(), + jit_mom2_out.data(), jit_param_out.data()); + + ExpectEQ(ref_mom1_out.data(), jit_mom1_out.data(), numel); + ExpectEQ(ref_mom2_out.data(), jit_mom2_out.data(), numel); + ExpectEQ(ref_param_out.data(), jit_param_out.data(), numel); + }; + + TestAllImpls( + 1, verifier, beta1, beta2, learning_rate, eps, old_lr, lr_ratio, coeff, + numel, grad, mom1, mom2, param, mom1_out, mom2_out, param_out); +} + template void TestKernelSgd() { using T = typename KernelTuple::data_type; @@ -1046,7 +1113,7 @@ TEST(JITKernel_pool, jitcreator) { #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_EQ(jitcreators.size(), 0UL); #else - EXPECT_EQ(jitcreators.size(), 26UL); + EXPECT_EQ(jitcreators.size(), 27UL); #endif } @@ -1080,7 +1147,7 @@ TEST(JITKernel_pool, more) { TEST(JITKernel_pool, refer) { const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); - EXPECT_EQ(kers.size(), 32UL); + EXPECT_EQ(kers.size(), 33UL); } // test helper @@ -1464,6 +1531,7 @@ TEST_CPU_KERNEL(EmbSeqPool); TEST_CPU_KERNEL(MatMul); TEST_CPU_KERNEL(Softmax); TEST_CPU_KERNEL(Adam); +TEST_CPU_KERNEL(AdamW); TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(VBroadcast); diff --git a/paddle/fluid/operators/layer_norm_op_mlu.cc b/paddle/fluid/operators/layer_norm_op_mlu.cc new file mode 100644 index 0000000000000..a368af86a3da6 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op_mlu.cc @@ -0,0 +1,234 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
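The AdamW test folds the bias correction into the learning rate and epsilon before invoking the kernel (which receives the negated rate). A small standalone computation of those derived constants, using the same values as the test and treating the squared betas as beta^t for step t = 2:

#include <cmath>
#include <cstdio>

int main() {
  const double old_lr = 0.1, beta1 = 0.99, beta2 = 0.95, epsilon = 1e-6;
  const double beta1_pow = beta1 * beta1;  // beta1^t with t = 2
  const double beta2_pow = beta2 * beta2;  // beta2^t with t = 2

  // Bias-corrected values handed to the kernel (the kernel receives -lr).
  const double lr = old_lr * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
  const double eps = epsilon * std::sqrt(1 - beta2_pow);
  std::printf("corrected lr = %f, corrected eps = %g\n", lr, eps);
  return 0;
}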
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class LayerNormMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto epsilon = ctx.Attr("epsilon"); + const auto* x = ctx.Input("X"); + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); + + auto place = ctx.GetPlace(); + + y->mutable_data(place); + mean->mutable_data(place); + variance->mutable_data(place); + + const auto& x_dims = x->dims(); + std::vector scale_bias_axes; + std::vector mean_var_axes; + for (auto i = 0; i < x_dims.size(); ++i) { + if (i >= begin_norm_axis) { + scale_bias_axes.push_back(x_dims[i]); + } else { + mean_var_axes.push_back(x_dims[i]); + } + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc mean_var_desc(mean_var_axes.size(), mean_var_axes.data(), + ToCnnlDataType()); + // cnnl only support both of scale and bias is NULL or not. + if (!scale && !bias) { + MLUCnnl::LayerNormForward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), + nullptr /*scale_bias_desc*/, nullptr /*scale*/, nullptr /*bias*/, + epsilon, y_desc.get(), GetBasePtr(y), mean_var_desc.get(), + GetBasePtr(mean), GetBasePtr(variance)); + } else { + Tensor tmp_scale(x->dtype()); + if (!scale) { + tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); + } else { + tmp_scale = *scale; + } + + Tensor tmp_bias(x->dtype()); + if (!bias) { + tmp_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(0), &tmp_bias); + } else { + tmp_bias = *bias; + } + + // scale and bias should have same type with x/y + MLUCnnlTensorDesc float32_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_FLOAT); + MLUCnnlTensorDesc float16_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_HALF); + cnnlCastDataType_t cast_type = GetCastDataType(VT::FP32, VT::FP16); + + Tensor final_scale(x->dtype()); + if (final_scale.dtype() == DataType::FLOAT16 && + tmp_scale.dtype() == DataType::FLOAT32) { + final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast scale to fp16 + MLUCnnl::Cast(ctx, cast_type, float32_desc.get(), + GetBasePtr(&tmp_scale), float16_desc.get(), + GetBasePtr(&final_scale)); + } else { + final_scale = tmp_scale; + } + + Tensor final_bias(x->dtype()); + if (final_bias.dtype() == DataType::FLOAT16 && + tmp_bias.dtype() == DataType::FLOAT32) { + final_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast bias to fp16 + MLUCnnl::Cast(ctx, cast_type, float32_desc.get(), GetBasePtr(&tmp_bias), + float16_desc.get(), GetBasePtr(&final_bias)); + } else { + final_bias = tmp_bias; + } + + MLUCnnlTensorDesc 
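LayerNormMLUKernel splits the input shape at begin_norm_axis: the leading axes describe the per-instance mean/variance tensors and the trailing axes describe scale and bias. A minimal sketch of that split:

#include <utility>
#include <vector>

// Returns {mean_var_axes, scale_bias_axes} for a given shape and
// begin_norm_axis, mirroring the loop in LayerNormMLUKernel::Compute.
std::pair<std::vector<int>, std::vector<int>> SplitLayerNormAxes(
    const std::vector<int>& x_dims, int begin_norm_axis) {
  std::vector<int> mean_var_axes, scale_bias_axes;
  for (int i = 0; i < static_cast<int>(x_dims.size()); ++i) {
    if (i >= begin_norm_axis)
      scale_bias_axes.push_back(x_dims[i]);
    else
      mean_var_axes.push_back(x_dims[i]);
  }
  return {mean_var_axes, scale_bias_axes};
}

// Example: x_dims = {4, 16, 32}, begin_norm_axis = 1
//   -> mean_var_axes = {4}, scale_bias_axes = {16, 32}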
scale_bias_desc( + scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType()); + MLUCnnl::LayerNormForward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), + scale_bias_desc.get(), GetBasePtr(&final_scale), + GetBasePtr(&final_bias), epsilon, y_desc.get(), GetBasePtr(y), + mean_var_desc.get(), GetBasePtr(mean), GetBasePtr(variance)); + } + } +}; + +template +class LayerNormGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto* x = ctx.Input("X"); + const auto* mean = ctx.Input("Mean"); + const auto* variance = ctx.Input("Variance"); + const auto* scale = ctx.Input("Scale"); + const auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + + auto place = ctx.GetPlace(); + dx->mutable_data(place); + + const auto& x_dims = x->dims(); + std::vector scale_bias_axes; + std::vector mean_var_axes; + for (auto i = 0; i < x_dims.size(); ++i) { + if (i >= begin_norm_axis) { + scale_bias_axes.push_back(x_dims[i]); + } else { + mean_var_axes.push_back(x_dims[i]); + } + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc dy_desc(*dy); + MLUCnnlTensorDesc mean_var_desc(mean_var_axes.size(), mean_var_axes.data(), + ToCnnlDataType()); + MLUCnnlTensorDesc dx_desc(*dx); + + Tensor tmp_scale(x->dtype()); + if (!scale) { + tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); + } else { + tmp_scale = *scale; + } + + MLUCnnlTensorDesc float32_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_FLOAT); + MLUCnnlTensorDesc float16_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_HALF); + cnnlCastDataType_t cast_fp32_to_fp16 = GetCastDataType(VT::FP32, VT::FP16); + cnnlCastDataType_t cast_fp16_to_fp32 = GetCastDataType(VT::FP16, VT::FP32); + + Tensor final_scale(x->dtype()); + if (final_scale.dtype() == DataType::FLOAT16 && + tmp_scale.dtype() == DataType::FLOAT32) { + final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast scale to fp16 + MLUCnnl::Cast(ctx, cast_fp32_to_fp16, float32_desc.get(), + GetBasePtr(&tmp_scale), float16_desc.get(), + GetBasePtr(&final_scale)); + } else { + final_scale = tmp_scale; + } + + Tensor tmp_dscale(x->dtype()); + if (dscale && (tmp_dscale.dtype() == dscale->dtype())) { + dscale->mutable_data(place); + tmp_dscale = *dscale; + } else { + tmp_dscale.mutable_data(phi::make_ddim(scale_bias_axes), place); + } + Tensor tmp_dbias(x->dtype()); + if (dbias && (tmp_dbias.dtype() == dbias->dtype())) { + dbias->mutable_data(place); + tmp_dbias = *dbias; + } else { + tmp_dbias.mutable_data(phi::make_ddim(scale_bias_axes), place); + } + + MLUCnnlTensorDesc scale_desc(scale_bias_axes.size(), scale_bias_axes.data(), + ToCnnlDataType()); + MLUCnnl::LayerNormBackward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), dy_desc.get(), + GetBasePtr(dy), scale_desc.get(), GetBasePtr(&final_scale), + mean_var_desc.get(), GetBasePtr(mean), GetBasePtr(variance), + dx_desc.get(), GetBasePtr(dx), GetBasePtr(&tmp_dscale), + GetBasePtr(&tmp_dbias)); + + if (dscale && (tmp_dscale.dtype() == DataType::FLOAT16 && + dscale->dtype() == DataType::FLOAT32)) { + dscale->mutable_data(place); + MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), + 
GetBasePtr(&tmp_dscale), float32_desc.get(), + GetBasePtr(dscale)); + } + if (dbias && (tmp_dbias.dtype() == DataType::FLOAT16 && + dbias->dtype() == DataType::FLOAT32)) { + dbias->mutable_data(place); + MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), + GetBasePtr(&tmp_dbias), float32_desc.get(), + GetBasePtr(dbias)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(layer_norm, ops::LayerNormMLUKernel, + ops::LayerNormMLUKernel); +REGISTER_OP_MLU_KERNEL(layer_norm_grad, ops::LayerNormGradMLUKernel, + ops::LayerNormGradMLUKernel); diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 1cd59672f97fc..e9375be1706eb 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -38,8 +38,11 @@ class LinspaceOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); + if (platform::is_xpu_place(tensor.place())) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + return expected_kernel_type; } }; diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index 86327a4f2c13a..642c8bcd9ae49 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -77,7 +77,7 @@ class LoDResetKernel : public framework::OpKernel { platform::errors::InvalidArgument( "The last value of 'Target LoD''s last level LoD should be equal " "to the first dimension of Input(X). But received the 'Target LoD' " - "is %s, Input(X)'s shape is is %s.", + "is %s, Input(X)'s shape is %s.", phi::make_ddim(level0), in->dims())); for (size_t i = 0; i < level0.size() - 1; ++i) { PADDLE_ENFORCE_GE(level0[i + 1], level0[i], diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index aa5fdd86745d6..fee1f56ebdcf2 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -12,6 +12,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -28,15 +29,9 @@ class LogLossXPUKernel : public framework::OpKernel { loss->mutable_data(ctx.GetPlace()); int n = predict->numel(); auto& dev_ctx = ctx.template device_context(); - int r = - xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon, predict->data(), - labels->data(), loss->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU log_loss kernel return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + int r = xpu::log_loss(dev_ctx.x_context(), predict->data(), + labels->data(), loss->data(), n, epsilon); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "log_loss"); } }; template @@ -54,15 +49,10 @@ class LogLossGradXPUKernel : public framework::OpKernel { dpred->mutable_data(ctx.GetPlace()); int n = predict->numel(); auto& dev_ctx = ctx.template device_context(); - int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon, - predict->data(), labels->data(), - dloss->data(), dpred->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU log_loss kernel return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + int r = xpu::log_loss_grad(dev_ctx.x_context(), predict->data(), + labels->data(), dloss->data(), + dpred->data(), n, epsilon); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "log_loss_grad"); } }; diff --git a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc new file mode 100644 index 0000000000000..c8ab269c023a5 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +constexpr int64_t kNoPadding = -1; + +template +class LookupTableV2MLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *ids_t = ctx.Input("Ids"); // int tensor + auto *output_t = ctx.Output("Out"); // float tensor + auto *table_t = ctx.Input("W"); + + auto *table_var = ctx.InputVar("W"); + PADDLE_ENFORCE_EQ( + table_var->IsType(), true, + platform::errors::InvalidArgument("mlu only accept LoDTensor")); + output_t->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc ids_desc(*ids_t); + MLUCnnlTensorDesc table_desc(*table_t); + MLUCnnlTensorDesc output_desc(*output_t); + + int64_t padding_idx = ctx.Attr("padding_idx"); + if (padding_idx == kNoPadding) { + MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, + table_desc.get(), GetBasePtr(table_t), + ids_desc.get(), GetBasePtr(ids_t), + output_desc.get(), GetBasePtr(output_t)); + } else { + Tensor tmp_table_t(table_t->type()); + tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); + + Tensor index; + index.mutable_data({1, 1}, ctx.GetPlace()); + auto idx_value = static_cast(padding_idx); + MLUCnnlTensorDesc index_desc(index); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &idx_value, index_desc.get(), + GetBasePtr(&index)); + + auto update_dim = phi::make_ddim({1, table_t->dims()[1]}); + Tensor update; + update.mutable_data(update_dim, ctx.GetPlace()); + + auto update_value = static_cast(0); + MLUCnnlTensorDesc update_desc(update); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &update_value, + update_desc.get(), GetBasePtr(&update)); + + MLUCnnlTensorDesc tmp_table_desc(tmp_table_t); + MLUCnnl::ScatterNd( + ctx, CNNL_SCATTERND_UPDATE, index_desc.get(), GetBasePtr(&index), + update_desc.get(), GetBasePtr(&update), table_desc.get(), + GetBasePtr(table_t), tmp_table_desc.get(), GetBasePtr(&tmp_table_t)); + + MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, + tmp_table_desc.get(), GetBasePtr(&tmp_table_t), + ids_desc.get(), GetBasePtr(ids_t), + output_desc.get(), GetBasePtr(output_t)); + } + } +}; + +template +class LookupTableV2GradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *ids_t = ctx.Input("Ids"); + auto *output_grad_t = + ctx.Input(framework::GradVarName("Out")); + auto *table_grad_t = + ctx.Output(framework::GradVarName("W")); + table_grad_t->mutable_data(ctx.GetPlace()); + + int padding_idx = static_cast(ctx.Attr("padding_idx")); + + Tensor ids_int32(ids_t->dtype()); + if (ids_t->dtype() != DataType::INT32) { + ids_int32.mutable_data(ids_t->dims(), ctx.GetPlace()); + MLUCnnlTensorDesc ids_desc(*ids_t); + MLUCnnlTensorDesc ids_int32_desc(ids_int32); + auto cast_type = GetCastDataType(ids_t->dtype(), DataType::INT32); + MLUCnnl::Cast(ctx, cast_type, ids_desc.get(), GetBasePtr(ids_t), + ids_int32_desc.get(), GetBasePtr(&ids_int32)); + } else { + ids_int32 = *ids_t; + } + + MLUCnnlTensorDesc ids_int32_desc(ids_int32); + MLUCnnlTensorDesc output_grad_desc(*output_grad_t); + MLUCnnlTensorDesc table_grad_desc(*table_grad_t); + + MLUCnnl::EmbeddingBackward(ctx, padding_idx, false, ids_int32_desc.get(), + GetBasePtr(&ids_int32), output_grad_desc.get(), + GetBasePtr(output_grad_t), table_grad_desc.get(), + GetBasePtr(table_grad_t)); + } +}; +} // namespace operators +} // namespace 
paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(lookup_table_v2, ops::LookupTableV2MLUKernel, + ops::LookupTableV2MLUKernel, + ops::LookupTableV2MLUKernel); + +REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad, + ops::LookupTableV2GradMLUKernel, + ops::LookupTableV2GradMLUKernel, + ops::LookupTableV2GradMLUKernel); diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index e339be06d69ed..da7340e4eb0b3 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -38,7 +38,7 @@ struct TolerableValue { // NOTE(dzh): float16 value clip behave different. // 1. Our ValueClipping has a hardcore threshold 1e20 // for float number. 1e20 will resulting in overflow in float16. -// 2. float16 should expose the the real number overflow to python. +// 2. float16 should expose the real number overflow to python. // because mixed-training depends the inf/nan value to determine // if the scale value will be adjusted. // Also. In standard implementation of cross entropy, other diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 09daf0afe18bf..18a86d1531724 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -52,7 +52,7 @@ class SampleWithProb { const std::size_t num_samples, const Tensor* L, Tensor* S, Tensor* P) { // UNDERSTAND: dimension issues - const auto lbl_dim = L->dims(); + const auto& lbl_dim = L->dims(); const int batch_size = lbl_dim[0]; const int num_true = lbl_dim[1]; const int num_sampled_classes = num_true + num_samples; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f77287826ffb3..e4b033b6c5857 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -32,7 +32,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ( in1_height, input2.height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2.height())); output->set_height(in1_height); @@ -56,27 +56,27 @@ struct SelectedRowsAdd { in1_row_numel, in2_value.numel() / in2_rows.size(), platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, in2_value.numel() / in2_rows.size())); PADDLE_ENFORCE_EQ( in1_row_numel, out_value->numel() / out_rows.size(), platform::errors::InvalidArgument( "The input and oupput width must be equal." 
- "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, out_value->numel() / out_rows.size())); auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto in2_place = input2.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto out_place = context.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); @@ -98,19 +98,19 @@ struct SelectedRowsAddTensor { const phi::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); - auto in2_dims = input2.dims(); - auto out_dims = output->dims(); + const auto& in2_dims = input2.dims(); + const auto& out_dims = output->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -121,13 +121,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); phi::funcs::SetConstant functor; @@ -161,7 +161,7 @@ struct SelectedRowsAddTo { PADDLE_ENFORCE_EQ( in1_height, input2->height(), platform::errors::InvalidArgument("The two inputs height must be equal." 
- "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2->height())); @@ -178,11 +178,11 @@ struct SelectedRowsAddTo { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto in2_place = input2->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -211,7 +211,7 @@ struct SelectedRowsSumTo { PADDLE_ENFORCE_EQ(in1_height, input2->height(), platform::errors::InvalidArgument( "The two inputs height must be equal." - "But recieved first input height = [%d], second " + "But received first input height = [%d], second " "input height = [%d]", in1_height, input2->height())); } @@ -249,11 +249,11 @@ struct SelectedRowsAddToTensor { return; } auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -265,7 +265,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -289,11 +289,11 @@ struct SelectedRowsAddToTensor { return; } auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -305,7 +305,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -838,11 +838,11 @@ struct UpdateToTensor { const ScatterOps& op, const phi::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -854,7 +854,7 @@ struct UpdateToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." 
- "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 542d4c9784352..db5c66d319701 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -33,7 +33,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ( in1_height, input2.height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2.height())); output->set_height(in1_height); @@ -57,13 +57,13 @@ struct SelectedRowsAdd { in1_row_numel, in2_value.numel() / in2_rows.size(), platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, in2_value.numel() / in2_rows.size())); PADDLE_ENFORCE_EQ( in1_row_numel, out_value->numel() / out_rows.size(), platform::errors::InvalidArgument( "The input and oupput width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, out_value->numel() / out_rows.size())); auto* out_data = out_value->data(); @@ -72,15 +72,15 @@ struct SelectedRowsAdd { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto in2_place = input2.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto out_place = context.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); memory::Copy(out_place, out_data, in1_place, in1_data, in1_value.numel() * sizeof(T), context.stream()); @@ -126,13 +126,13 @@ struct SelectedRowsAddTensor { in1_height, in2_dims[0], platform::errors::InvalidArgument( "The two inputs height must be equal." - "But recieved first input height = [%d], first input height = [%d]", + "But received first input height = [%d], first input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -143,13 +143,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." 
- "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -186,13 +186,13 @@ struct SelectedRowsAddTensor { in1_height, in2_dims[0], platform::errors::InvalidArgument( "The two inputs height must be equal." - "But recieved first input height = [%d], first input height = [%d]", + "But received first input height = [%d], first input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -203,13 +203,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -254,7 +254,7 @@ struct SelectedRowsAddTo { PADDLE_ENFORCE_EQ( in1_height, input2->height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2->height())); @@ -273,11 +273,11 @@ struct SelectedRowsAddTo { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto in2_place = input2->place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -322,7 +322,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -334,7 +334,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." 
- "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -359,7 +359,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -371,7 +371,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -675,7 +675,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -687,7 +687,7 @@ struct UpdateToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.template data(); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9833b4447ec45..69642c8194221 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -231,7 +231,7 @@ class SoftmaxFunctor> { public: void operator()(const DeviceContext& context, const int axis_dim, const framework::Tensor* X, framework::Tensor* Y) { - auto in_dims = X->dims(); + const auto& in_dims = X->dims(); constexpr int kBatchDim = 0; constexpr int kClassDim = 1; diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index c07582c84acb9..cd1fa13001ce2 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -53,7 +53,7 @@ std::vector Tree2ColUtil::construct_patch( void Tree2ColUtil::construct_tree(const framework::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { - auto edge_set_dims = EdgeSet.dims(); + const auto &edge_set_dims = EdgeSet.dims(); PADDLE_ENFORCE_EQ(edge_set_dims[1], 2, platform::errors::InvalidArgument( "The second dimension of the EdgeSet shall be 2, but " @@ -89,7 +89,7 @@ class Tree2ColFunctor { const framework::Tensor &node_features, framework::Tensor *patch, int max_depth) { std::vector> tr; - auto feature_dims = node_features.dims(); + const auto &feature_dims = node_features.dims(); auto cpu_place = context.GetPlace(); phi::funcs::SetConstant constant; int64_t feature_size = feature_dims[1]; @@ -142,7 +142,7 @@ class Col2TreeFunctor { const framework::Tensor &out_grad, framework::Tensor *in_grad, int max_depth) { std::vector> tr; - auto output_dims = out_grad.dims(); + const auto &output_dims = out_grad.dims(); auto cpu_place = context.GetPlace(); phi::funcs::SetConstant constant; int64_t output_size = output_dims[1]; diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 
0811407466ddc..9d381e1f22b5f 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -378,20 +378,6 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, } } - // if "-1" is present then one of reshape dims must be infered - auto it_negative = std::find(shape.begin(), shape.end(), -1); - if (it_negative != shape.end()) { - int64_t dim_product = 1; - for (int i = 0; i < dim.size(); i++) { - dim_product *= dim.at(i); - } - - int64_t shape_product = std::accumulate(shape.begin(), shape.end(), -1, - std::multiplies()); - int index = std::distance(shape.begin(), it_negative); - shape[index] = dim_product / shape_product; - } - dim = dim.reshape(shape).transpose(axis); } return dim; @@ -585,6 +571,19 @@ class MatMulOp : public framework::OperatorWithKernel { auto dim_x = GetDimForInput(*context, "X"); auto dim_y = GetDimForInput(*context, "Y"); + +#ifdef PADDLE_WITH_MKLDNN + // (jczaja): For NHWC execution output shape needs + // to be computed like instead x*y we are to do y*x + bool channelwise_onednn = + context->IsRunMKLDNNKernel() && + (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC); + if (channelwise_onednn) { + std::swap(dim_x, dim_y); + } +#endif + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( RowMatrixFromVector(dim_x), 0, context->Attrs().Get("transpose_X")); @@ -673,76 +672,11 @@ class MatMulOp : public framework::OperatorWithKernel { context->Attrs().Get>("fused_transpose_Out"); if (!reshape_out.empty() && !transpose_out.empty()) { - auto reshape_out_size = reshape_out.size(); - auto transpose_out_size = transpose_out.size(); - PADDLE_ENFORCE_EQ(transpose_out_size, 4, - platform::errors::InvalidArgument( - "transpose_out supported rank is 4, " - "received %d", - transpose_out_size)); - const std::vector supported_axis{0, 2, 1, 3}; - const bool supported_transpose_axis = std::equal( - transpose_out.begin(), transpose_out.end(), supported_axis.begin()); - PADDLE_ENFORCE_EQ( - supported_transpose_axis, true, - platform::errors::InvalidArgument( - "supported transpose axis for the fuse are {0, 2, 1, 3}")); - PADDLE_ENFORCE_EQ( - reshape_out_size, 3, - platform::errors::InvalidArgument("reshape_out supported rank is 3, " - "received %d", - reshape_out_size)); - - // int num_negative = std::count(reshape_out.begin(), reshape_out.end(), - // -1); - // PADDLE_ENFORCE_LE(num_negative, 1, - // platform::errors::InvalidArgument( - // "The max number of -1 in fused_reshape_Out is 1 " - // "but received %d.", - // num_negative)); - - // auto it_zero = std::find(reshape_out.begin(), reshape_out.end(), 0); - // if (it_zero != reshape_out.end()) { - // for (uint64_t i = 0; i < reshape_out.size(); i++) { - // if (reshape_out[i] == 0) { - // PADDLE_ENFORCE_LT( - // i, ddim_out.size(), - // platform::errors::InvalidArgument( - // "The index of 0 in fused_reshape_Out ", - // "should be less than output dim size, ", - // "but the index is %d and output dim size is %d", i, - // ddim_out.size())); - // reshape_out[i] = ddim_out.at(i); - // } - // } - // } - - // if "-1" is present then one of reshape dims must be infered - auto it = std::find(reshape_out.begin(), reshape_out.end(), -1); - if (it != reshape_out.end()) { - int index = std::distance(reshape_out.begin(), it); - - auto ddim_out_vec = phi::vectorize(ddim_out); - - int ddim_out_product = - std::accumulate(ddim_out_vec.begin(), ddim_out_vec.end(), 1, - std::multiplies()); - int reshape_out_product = std::accumulate( - 
reshape_out.begin(), reshape_out.end(), -1, std::multiplies()); - - reshape_out[index] = ddim_out_product / reshape_out_product; - } - - framework::DDim shape_out = - ddim_out.transpose(transpose_out).reshape(reshape_out); - context->SetOutputDim("Out", shape_out); - } else { - context->SetOutputDim("Out", ddim_out); + ddim_out = ddim_out.transpose(transpose_out).reshape(reshape_out); } -#else - context->SetOutputDim("Out", ddim_out); #endif - context->ShareLoD("X", /*->*/ "Out"); + context->SetOutputDim("Out", ddim_out); + context->ShareLoD("X", "Out"); } framework::OpKernelType GetExpectedKernelType( @@ -770,6 +704,21 @@ class MatMulOp : public framework::OperatorWithKernel { framework::TransToProtoVarType(tensor.dtype()), tensor.place(), tensor.layout()); } else { +#ifdef PADDLE_WITH_MKLDNN + // When matmul is first oneDNN op in a chain (there was some non oneDNN op + // previously) + // then we also need to rotate shape NHWC -> NCWH + if ((expected_kernel_type.data_layout_ == + framework::DataLayout::kMKLDNN) && + (tensor.layout() != framework::DataLayout::kMKLDNN) && + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 01fa01e3c6ed0..162ebdafec1cb 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -79,20 +79,6 @@ static framework::DDim GetDimForInput(const framework::InferShapeContext& ctx, } } - // if "-1" is present then one of reshape dims must be infered - auto it_negative = std::find(shape.begin(), shape.end(), -1); - if (it_negative != shape.end()) { - int64_t dim_product = 1; - for (int i = 0; i < dim.size(); i++) { - dim_product *= dim.at(i); - } - - int64_t shape_product = std::accumulate(shape.begin(), shape.end(), -1, - std::multiplies()); - int index = std::distance(shape.begin(), it_negative); - shape[index] = dim_product / shape_product; - } - dim = dim.reshape(shape).transpose(axis); } return dim; @@ -176,77 +162,12 @@ class MatMulV2Op : public framework::OperatorWithKernel { ctx->Attrs().Get>("fused_transpose_Out"); if (!reshape_out.empty() && !transpose_out.empty()) { - auto reshape_out_size = reshape_out.size(); - auto transpose_out_size = transpose_out.size(); - PADDLE_ENFORCE_EQ(transpose_out_size, 4, - platform::errors::InvalidArgument( - "transpose_out supported rank is 4, " - "received %d", - transpose_out_size)); - const std::vector supported_axis{0, 2, 1, 3}; - const bool supported_transpose_axis = std::equal( - transpose_out.begin(), transpose_out.end(), supported_axis.begin()); - PADDLE_ENFORCE_EQ( - supported_transpose_axis, true, - platform::errors::InvalidArgument( - "supported transpose axis for the fuse are {0, 2, 1, 3}")); - PADDLE_ENFORCE_EQ( - reshape_out_size, 3, - platform::errors::InvalidArgument("reshape_out supported rank is 3, " - "received %d", - reshape_out_size)); - - // int num_negative = std::count(reshape_out.begin(), reshape_out.end(), - // -1); - // PADDLE_ENFORCE_LE(num_negative, 1, - // platform::errors::InvalidArgument( - // "The max number of -1 in fused_reshape_Out is 1 " - // "but received %d.", - // num_negative)); - - // auto it_zero = std::find(reshape_out.begin(), reshape_out.end(), 0); - // if 
(it_zero != reshape_out.end()) { - // for (uint64_t i = 0; i < reshape_out.size(); i++) { - // if (reshape_out[i] == 0) { - // PADDLE_ENFORCE_LT( - // i, ddim_out.size(), - // platform::errors::InvalidArgument( - // "The index of 0 in fused_reshape_Out ", - // "should be less than output dim size, ", - // "but the index is %d and output dim size is %d", i, - // ddim_out.size())); - // reshape_out[i] = ddim_out.at(i); - // } - // } - // } - - // if "-1" is present then one of reshape dims must be infered - auto it = std::find(reshape_out.begin(), reshape_out.end(), -1); - if (it != reshape_out.end()) { - int index = std::distance(reshape_out.begin(), it); - - auto ddim_out_vec = phi::vectorize(ddim_out); - - int ddim_out_product = - std::accumulate(ddim_out_vec.begin(), ddim_out_vec.end(), 1, - std::multiplies()); - int reshape_out_product = std::accumulate( - reshape_out.begin(), reshape_out.end(), -1, std::multiplies()); - - reshape_out[index] = ddim_out_product / reshape_out_product; - } - - framework::DDim shape_out = - ddim_out.transpose(transpose_out).reshape(reshape_out); - ctx->SetOutputDim("Out", shape_out); - } else { - ctx->SetOutputDim("Out", ddim_out); + ddim_out = ddim_out.transpose(transpose_out).reshape(reshape_out); } -#else - ctx->SetOutputDim("Out", ddim_out); #endif - ctx->ShareLoD("X", /* --> */ "Out"); + ctx->SetOutputDim("Out", ddim_out); + ctx->ShareLoD("X", "Out"); } protected: @@ -274,6 +195,22 @@ class MatMulV2Op : public framework::OperatorWithKernel { framework::TransToProtoVarType(tensor.dtype()), tensor.place(), tensor.layout()); } else { +#ifdef PADDLE_WITH_MKLDNN + // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN + // op + // previously) + // then we also need to rotate shape NHWC -> NCWH + if ((expected_kernel_type.data_layout_ == + framework::DataLayout::kMKLDNN) && + (tensor.layout() != framework::DataLayout::kMKLDNN) && + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 32ef052119883..ed58c90e17022 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -36,7 +36,7 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { // TODO(typhoonzero): support both inference value and indices. AddInput("Out", "The network output of topk (inferences)"); - AddInput("Indices", "The the network output of topk (indices)"); + AddInput("Indices", "The network output of topk (indices)"); AddInput("Label", "Label of the training data"); // TODO(typhoonzero): AddInput("Weight", ... AddOutput("Accuracy", "The accuracy of current batch"); diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 3cc1be4de8a82..82e4b90468a38 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
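// Editor's sketch (not part of the patch): the blocks deleted above inferred
// the "-1" placeholder of a fused reshape by hand before calling
// DDim::reshape. A small standalone version of that arithmetic, using plain
// std::vector instead of DDim; starting the fold at -1 cancels the sign of
// the placeholder entry, leaving the product of the known entries.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

std::vector<int64_t> InferNegativeOne(const std::vector<int64_t>& dim,
                                      std::vector<int64_t> shape) {
  auto it_negative = std::find(shape.begin(), shape.end(), -1);
  if (it_negative != shape.end()) {
    // total number of elements of the original dims
    int64_t dim_product = std::accumulate(dim.begin(), dim.end(), int64_t{1},
                                          std::multiplies<int64_t>());
    // (-1) * known... * (-1) == product of the known entries
    int64_t shape_product = std::accumulate(
        shape.begin(), shape.end(), int64_t{-1}, std::multiplies<int64_t>());
    shape[std::distance(shape.begin(), it_negative)] =
        dim_product / shape_product;
  }
  return shape;
}

int main() {
  // e.g. a [2, 3, 4] tensor reshaped with {6, -1}: 24 / 6, so -1 becomes 4
  assert((InferNegativeOne({2, 3, 4}, {6, -1}) == std::vector<int64_t>{6, 4}));
  return 0;
}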
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -42,68 +42,26 @@ class AccuracyXPUKernel : public framework::OpKernel { if (num_samples == 0) { return; } - size_t indices_int32_size = num_samples * class_dim * sizeof(int); - size_t indices_int64_size = num_samples * class_dim * sizeof(int64_t); - size_t label_int32_size = num_samples * sizeof(int); - size_t label_int64_size = num_samples * sizeof(int64_t); auto& dev_ctx = ctx.template device_context(); - int* indices_int32_device = NULL; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&indices_int32_device), - indices_int32_size), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot allocate %s memory" - " on XPU. \n\nPlease check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(indices_int32_size))); - int* label_int32_device = NULL; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&label_int32_device), - label_int32_size), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot allocate %s memory" - " on XPU. \n\nPlease check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(label_int32_size))); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int size = num_samples * class_dim; + int* indices_int32_ptr = RAII_GUARD.alloc_l3_or_gm(size); + PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int32_ptr); + int* label_int32_ptr = RAII_GUARD.alloc_l3_or_gm(size); + PADDLE_ENFORCE_XDNN_NOT_NULL(label_int32_ptr); - int* indices_int32_host = - reinterpret_cast(std::malloc(indices_int32_size)); - int64_t* indices_int64_host = - reinterpret_cast(std::malloc(indices_int64_size)); - int* label_int32_host = - reinterpret_cast(std::malloc(label_int32_size)); - int64_t* label_int64_host = - reinterpret_cast(std::malloc(label_int64_size)); - dev_ctx.Wait(); - memory::Copy(platform::CPUPlace(), indices_int64_host, ctx.GetPlace(), - indices_data, indices_int64_size); - memory::Copy(platform::CPUPlace(), label_int64_host, ctx.GetPlace(), - label_data, label_int64_size); - for (size_t i = 0; i < num_samples; ++i) { - label_int32_host[i] = label_int64_host[i]; - for (size_t j = 0; j < class_dim; ++j) { - indices_int32_host[i * class_dim + j] = - indices_int64_host[i * class_dim + j]; - } - } - memory::Copy(ctx.GetPlace(), indices_int32_device, platform::CPUPlace(), - indices_int32_host, indices_int32_size); - memory::Copy(ctx.GetPlace(), label_int32_device, platform::CPUPlace(), - label_int32_host, label_int32_size); - int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, - label_int32_device, num_samples, class_dim, - correct_data, total_data, accuracy_data); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU accuracy kernel error!")); - dev_ctx.Wait(); - xpu_free(indices_int32_device); - xpu_free(label_int32_device); - std::free(indices_int32_host); - std::free(indices_int64_host); - std::free(label_int32_host); - std::free(label_int64_host); + int r = xpu::cast_v2(dev_ctx.x_context(), indices_data, + indices_int32_ptr, size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + + r = xpu::cast_v2(dev_ctx.x_context(), label_data, + label_int32_ptr, size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + + r = xpu::accuracy(dev_ctx.x_context(), indices_int32_ptr, label_int32_ptr, + 
num_samples, class_dim, correct_data, total_data, + accuracy_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); } }; diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index ecee094de346e..393247644c2e8 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -44,14 +44,6 @@ class MKLDNNActivationKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor")); - Functor functor; functor(ctx); } @@ -62,14 +54,6 @@ class MKLDNNActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto *diff_y = ctx.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Input OutGrad tensor")); - PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Input OutGrad tensor")); - Functor functor; functor(ctx); } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 67d1aaa4baf52..fba17d303f282 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -223,9 +223,17 @@ class ConvMKLDNNHandlerT float sum_scale = 1.0f; float activation_scale = 1.0f; std::vector output_shift_scale; - if (platform::is_int8()) - std::tie(sum_scale, output_shift_scale, activation_scale) = - get_int8_scales(ctx); + if (platform::is_int8()) { + if (ctx.HasAttr("Sum_scale")) { + sum_scale = ctx.Attr("Sum_scale"); + activation_scale = ctx.Attr("Activation_scale"); + output_shift_scale = + ctx.Attr>("Output_shift_scale"); + } else { + std::tie(sum_scale, output_shift_scale, activation_scale) = + get_int8_scales(ctx); + } + } const dnnl::primitive_attr conv_attr = CreatePostOps( fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, @@ -872,8 +880,18 @@ class ConvMKLDNNOpKernel : public framework::OpKernel { {DNNL_ARG_DST, *dst_memory_p}}; if (bias) { - auto p_scales_tuple = handler.get_int8_bias_scales(ctx); - + std::vector bias_scales; + auto p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), bias_scales)); + if (ctx.HasAttr("Bias_scales")) { + bias_scales = ctx.Attr>("Bias_scales"); + p_scales_tuple = + std::make_shared>>( + std::make_tuple(static_cast(mask_reorder), bias_scales)); + } else { + p_scales_tuple = handler.get_int8_bias_scales(ctx); + } auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( bias, true, std::get<1>(*p_scales_tuple), std::get<0>(*p_scales_tuple)); diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index b10572edf6f27..747e4603d7fe7 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -36,100 +36,58 @@ template class DeQuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = 
ctx.Input("Input"); - auto scale_data = ctx.Attr("Scale"); - auto scale_shift = ctx.Attr("Shift"); - bool with_shift = scale_shift != 0.0f; - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_NE(scale_data, 0.0f, - platform::errors::InvalidArgument( - "Dequantization scale cannot be 0.0")); - PADDLE_ENFORCE_GE(scale_shift, 0, - platform::errors::Unimplemented( - "Dequantization shift must be nonnegative.")); - PADDLE_ENFORCE_LE( - scale_shift, 255, - platform::errors::Unimplemented( - "Dequantization shift must be less than or equal to 255.")); + auto* x = ctx.Input("Input"); + const auto quantization_scale = ctx.Attr("Scale"); + const auto quantization_shift = ctx.Attr("Shift"); + const bool with_shift = quantization_shift != 0.0f; + auto* out = ctx.Output("Output"); + + PADDLE_ENFORCE(quantization_scale != 0.0f, + platform::errors::InvalidArgument( + "Dequantization scale must be different than 0.0f")); + + PADDLE_ENFORCE( + quantization_shift <= 255 && quantization_shift >= 0, + platform::errors::InvalidArgument( + "Dequantization shift must be lower or equal to ", + "255 and greater or equal to 0, but got %f", quantization_shift)); auto& dev_ctx = ctx.template device_context(); - const auto& engine = dev_ctx.GetEngine(); - - const T* input_data = input->data(); - float* output_data = output->mutable_data(ctx.GetPlace()); - - float reorder_shift = -scale_shift / scale_data; - - auto src_tz = phi::vectorize(input->dims()); - auto dst_tz = phi::vectorize(output->dims()); - dnnl::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())); - MKLDNNMemoryFormat src_fmt = input->format(); - - std::string key = - platform::CreateKey(dev_ctx, src_dt, src_tz, ctx.OutputName("Output")); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_prim = key + "@r"; - const std::string key_src_mem = key + "@s"; - const std::string key_dst_mem = key + "@d"; - - std::shared_ptr src_memory; - std::shared_ptr dst_memory; - std::shared_ptr reorder_p; - reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); - - if (reorder_p == nullptr) { - dnnl::primitive_attr attri; - int mask = 0; - float reorder_scale = 1. 
/ scale_data; - attri.set_output_scales(mask, {reorder_scale}); - - if (with_shift) { - dnnl::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - std::fill(output_data, output_data + output->numel(), reorder_shift); - } - - auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt); - src_memory = std::make_shared(src_md, engine, - to_void_cast(input_data)); - - auto dst_md = - platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32, - platform::MKLDNNFormatForSize( - dst_tz.size(), MKLDNNMemoryFormat::nchw)); - - dst_memory = std::make_shared( - dst_md, engine, to_void_cast(output_data)); - - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(*src_memory, *dst_memory, attri)); - reorder_p = std::shared_ptr(new reorder(*reorder_pd)); - dev_ctx.SetBlob(key_prim, reorder_p); - dev_ctx.SetBlob(key_src_mem, src_memory); - dev_ctx.SetBlob(key_dst_mem, dst_memory); - } else { - src_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - src_memory->set_data_handle(to_void_cast(input_data)); - - dst_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_dst_mem)); - if (with_shift) - std::fill(output_data, output_data + output->numel(), reorder_shift); - dst_memory->set_data_handle(output->mutable_data(ctx.GetPlace())); + + auto x_tz = phi::vectorize(x->dims()); + auto x_paddle_dtype = framework::TransToProtoVarType(x->dtype()); + auto out_paddle_dtype = framework::TransToProtoVarType(out->dtype()); + + dnnl::primitive_attr attrs; + static constexpr int32_t mask = 0; // same shift and scale for whole tensor + + const float reorder_scale = 1. / quantization_scale; + attrs.set_output_scales(mask, {reorder_scale}); + + if (with_shift) { + attrs.set_zero_points(DNNL_ARG_SRC, mask, + {static_cast(quantization_shift)}); } + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_dtype, framework::ToMKLDNNDataType(x_paddle_dtype), + out_paddle_dtype, framework::ToMKLDNNDataType(out_paddle_dtype), + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->mem_desc(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, x->mem_desc(), dev_ctx.GetPlace()); + + auto reorder_p = reorder_handler.AcquireReorder( + reorder_dst_memory_p, reorder_src_memory_p, attrs); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *src_memory, *dst_memory); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; diff --git a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc new file mode 100644 index 0000000000000..73e783068379d --- /dev/null +++ b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +class FillConstantMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + FillConstantMKLDNNHandler(Tensor* out, dnnl::engine engine, + platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + const auto src0_md = dnnl::memory::desc( + {out->numel(), sizeof(T)}, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attrs; + attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f}); + + this->AcquireForwardPrimitiveDescriptor(attrs, dnnl::algorithm::binary_add, + src0_md, src1_md, src0_md); + } + + static const dnnl::memory::desc src1_md; +}; + +template +const dnnl::memory::desc FillConstantMKLDNNHandler::src1_md( + {1, sizeof(T)}, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::ab); + +template +class FillConstantMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& dnnl_engine = dev_ctx.GetEngine(); + + auto* out = ctx.Output("Out"); + T fill_value = CalculateFillValue(ctx); + + auto shape = GetShape(ctx); + out->Resize(shape); + + FillConstantMKLDNNHandler handler(out, dnnl_engine, ctx.GetPlace()); + + dnnl::memory constant_value_memory = + dnnl::memory(FillConstantMKLDNNHandler::src1_md, dnnl_engine, + reinterpret_cast(&fill_value)); + + auto src0_memory_p = handler.AcquireDstMemory(out); + auto fill_constant_p = handler.AcquireForwardPrimitive(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + fill_constant_p->execute(astream, {{DNNL_ARG_SRC_0, *src0_memory_p}, + {DNNL_ARG_SRC_1, constant_value_memory}, + {DNNL_ARG_DST, *src0_memory_p}}); + astream.wait(); + + // src0_memory_p's md was just to allow the usage of a binary + // primitive as a memset, and now we need to create a real one + out->set_mem_desc({phi::vectorize(shape), platform::MKLDNNGetDataType(), + platform::GetPlainMKLDNNFormat(shape.size())}); + } + + T CalculateFillValue(const framework::ExecutionContext& ctx) const { + const auto str_value = ctx.Attr("str_value"); + const auto float_value = ctx.Attr("value"); + + T value; + + if (str_value.empty()) { + value = static_cast(float_value); + } else { + // handle NaN/Inf first, which cannot be read from stream + if (str_value == "inf") { + value = static_cast(std::numeric_limits::infinity()); + } else if (str_value == "-inf") { + value = static_cast(-std::numeric_limits::infinity()); + } else if (str_value == "nan") { + value = static_cast(std::numeric_limits::quiet_NaN()); + } else { + std::stringstream convert_stream(str_value); + double tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } + } + + if (ctx.HasInput("ValueTensor")) { + const auto* value_tensor = ctx.Input("ValueTensor"); + PADDLE_ENFORCE_EQ( + value_tensor->numel(), 1, + platform::errors::InvalidArgument( + "When use Tensor as value to set Tensor value in fill_constant, " + "value input(ValueTensor) size must be 1, but got %d", + value_tensor->numel())); + value = value_tensor->data()[0]; + } + + return value; + } +}; +} // namespace operators +} // 
namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(fill_constant, MKLDNN, paddle::platform::CPUPlace, + ops::FillConstantMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index d3a36555c389a..245ae2196ca38 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -124,7 +124,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (!workspace_memory->get_desc().is_zero()) { - mid->set_format(platform::GetMKLDNNFormat(*workspace_memory)); + mid->set_mem_desc(workspace_memory->get_desc()); lrn_p->execute(astream, {{DNNL_ARG_SRC, *src_memory}, {DNNL_ARG_DST, *dst_memory}, {DNNL_ARG_WORKSPACE, *workspace_memory}}); @@ -134,8 +134,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { } astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(dst_memory->get_desc()); } }; @@ -177,8 +176,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { {DNNL_ARG_WORKSPACE, *workspace}}); astream.wait(); - in_x_grad->set_layout(framework::DataLayout::kMKLDNN); - in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); + in_x_grad->set_mem_desc(diff_src_memory->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index f4137733e300e..e9abe84e67980 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -257,20 +257,6 @@ class MatMulMKLDNNHandler } } - // if "-1" is present then one of reshape dims must be infered - auto it_negative = std::find(shape.begin(), shape.end(), -1); - if (it_negative != shape.end()) { - int64_t dim_product = 1; - for (int i = 0; i < input_dims.size(); i++) { - dim_product *= input_dims.at(i); - } - - int64_t shape_product = std::accumulate(shape.begin(), shape.end(), -1, - std::multiplies()); - int index = std::distance(shape.begin(), it_negative); - shape[index] = dim_product / shape_product; - } - return input_dims.reshape(shape).transpose(axis); } return input_dims; @@ -299,20 +285,6 @@ class MatMulMKLDNNHandler } } - // if "-1" is present then one of reshape dims must be infered - auto it_negative = std::find(shape.begin(), shape.end(), -1); - if (it_negative != shape.end()) { - int64_t dim_product = 1; - for (int i = 0; i < input_dims.size(); i++) { - dim_product *= input_dims.at(i); - } - - int64_t shape_product = std::accumulate(shape.begin(), shape.end(), -1, - std::multiplies()); - int index = std::distance(shape.begin(), it_negative); - shape[index] = dim_product / shape_product; - } - new_dims = input_dims.reshape(shape).transpose(axis); } diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake index c471ba62f609b..8bad3e86b2934 100644 --- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -1,2 +1 @@ -cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op activation_op pooling transpose_op scope device_context enforce executor) - +cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op crop_op activation_op pooling transpose_op scope device_context 
enforce executor) diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 4cae3f0c73711..8cbe46bee481a 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "dnnl.hpp" +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/quantize_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -34,83 +35,73 @@ template class QuantOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto scale_data = ctx.Attr("Scale"); - auto scale_shift = ctx.Attr("Shift"); - bool with_shift = scale_shift != 0.0f; - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_NE( - scale_data, 0.0f, - platform::errors::InvalidArgument("Quantization scale cannot be 0.0")); - PADDLE_ENFORCE_GE(scale_shift, 0, - platform::errors::Unimplemented( - "Quantization shift must be nonnegative.")); - PADDLE_ENFORCE_LE( - scale_shift, 255, - platform::errors::Unimplemented( - "Quantization shift must be less than or equal to 255.")); + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Output"); + + const auto quantization_scale = ctx.Attr("Scale"); + const auto quantization_shift = ctx.Attr("Shift"); + const bool with_scale = quantization_scale != 1.0f; + const bool with_shift = quantization_shift != 0.0f; + + PADDLE_ENFORCE_NE(quantization_scale, 0.0f, + platform::errors::InvalidArgument( + "Quantization scale must be different than 0.0f")); + PADDLE_ENFORCE( + quantization_shift <= 255 && quantization_shift >= 0, + platform::errors::InvalidArgument( + "Quantization shift must be lower or equal to ", + "255 and greater or equal to 0, but got %f", quantization_shift)); auto& dev_ctx = ctx.template device_context(); - const auto& engine = dev_ctx.GetEngine(); - std::vector pipeline; - auto src_tz = phi::vectorize(input->dims()); - auto dst_tz = phi::vectorize(output->dims()); + auto x_tz = phi::vectorize(x->dims()); - const T* input_data = input->data(); + const bool is_negative_input = ctx.Attr("is_negative_input"); + const bool bfloat16 = ctx.Attr("bfloat16"); - bool is_negative_input = ctx.Attr("is_negative_input"); - bool bfloat16 = ctx.Attr("bfloat16"); + dnnl::primitive_attr attrs; + static constexpr int32_t mask = 0; - // TODO(jczaja): Refactor with Acquire API - std::shared_ptr src_memory; - std::shared_ptr dst_memory; - std::shared_ptr reorder_p; - - std::string out_layout = ctx.Attr("output_format"); - MKLDNNMemoryFormat out_format = - platform::data_format_to_memory_format(out_layout); - dnnl::primitive_attr attri; - int mask = 0; - attri.set_output_scales(mask, {scale_data}); + if (with_scale) { + attrs.set_output_scales(mask, {quantization_scale}); + } if (with_shift) { - dnnl::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - // memset casts scale_shift to unsigned char (uint8_t) internally - std::memset(output_data, scale_shift, output->numel()); + attrs.set_zero_points(DNNL_ARG_DST, mask, + {static_cast(quantization_shift)}); } - auto src_md = platform::MKLDNNMemDesc({src_tz}, memory::data_type::f32, - input->format()); - src_memory = 
std::make_shared(src_md, engine, - to_void_cast(input_data)); + framework::proto::VarType::Type x_paddle_dtype = + framework::TransToProtoVarType(x->dtype()); + framework::proto::VarType::Type out_paddle_dtype; - std::shared_ptr dst_md; if (bfloat16) { - platform::SetDstMemoryQuantized( - ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::BF16; } else if (is_negative_input && !with_shift) { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::INT8; } else { - platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, - dst_md, dst_memory, out_format); + out_paddle_dtype = framework::proto::VarType::UINT8; } - auto reorder_pd = std::shared_ptr( - new reorder::primitive_desc(*src_memory, *dst_memory, attri)); - reorder_p = std::shared_ptr(new reorder(*reorder_pd)); + + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_dtype, framework::ToMKLDNNDataType(x_paddle_dtype), + out_paddle_dtype, framework::ToMKLDNNDataType(out_paddle_dtype), + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->mem_desc(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, x->mem_desc(), dev_ctx.GetPlace()); + + auto reorder_p = reorder_handler.AcquireReorder( + reorder_dst_memory_p, reorder_src_memory_p, attrs); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *src_memory, *dst_memory); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index f04c73ec0b249..517f782e18758 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -32,6 +32,16 @@ class ShapeMKLDNNKernel : public framework::OpKernel { in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); + // Output of shape op is often fed as input to fill_constant ops + // and we need to rotate a shape otherwise Tensors of wrong shape may be + // allocated + if (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC && + in_dims.size() >= 3) { + auto rdims = phi::vectorize(in_dims); + std::rotate(rdims.begin() + 1, rdims.begin() + 2, rdims.end()); + in_dims = phi::make_ddim(rdims); + } } auto* out_t = ctx.Output("Out"); out_t->Resize({in_dims.size()}); diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index 2a8627b803a6e..2df9e5c20fda8 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -175,19 +175,17 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { dnnl::memory::data_type dout_type = framework::ToMKLDNNDataType( framework::TransToProtoVarType(dout->dtype())); - dnnl::memory::desc md(dout_vec_dims, platform::MKLDNNGetDataType(), - dout->format()); - dnnl::memory::format_tag reorder_format_tag = - platform::GetMKLDNNFormat(md.reshape(slice_dims)); platform::ReorderMKLDNNHandler reorder_handler( slice_dims, framework::TransToProtoVarType(dout->dtype()), 
dout_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - reorder_format_tag, platform::to_void_cast(dout->data())); + dout->mem_desc().reshape(slice_dims), + platform::to_void_cast(dout->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dx, dx_vec_dims, reorder_format_tag, ctx.GetPlace()); + dx, dx_vec_dims, platform::GetPlainMKLDNNFormat(dx_vec_dims.size()), + ctx.GetPlace()); memset(dx->data(), 0, reorder_dst_memory_p->get_desc().get_size()); auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets, @@ -199,8 +197,7 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p); astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(reorder_format_tag); + dx->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc index 36be1681b05e7..28a00be5fa47e 100644 --- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc @@ -59,7 +59,7 @@ class StackMKLDNNHandler // wrong output format deduction and suboptimal performance as a result if (stack_axis != ndims) { for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format())); + srcs_md.push_back(inputs[i]->mem_desc()); } input_dims[stack_axis] *= inputs.size(); @@ -69,8 +69,7 @@ class StackMKLDNNHandler extended_input_dims[stack_axis] = 1; for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format()) - .reshape(extended_input_dims)); + srcs_md.push_back(inputs[i]->mem_desc().reshape(extended_input_dims)); } // concat primitive choses suboptimal format tag because it cannot @@ -130,9 +129,8 @@ class StackMKLDNNOpKernel : public paddle::framework::OpKernel { concat_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat( - dst_mem->get_desc().reshape(phi::vectorize(output->dims())))); + output->set_mem_desc( + dst_mem->get_desc().reshape(phi::vectorize(output->dims()))); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 99f957f573a17..de21c2687bd44 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -60,17 +60,16 @@ class SumMKLDNNHandler auto src_tz = dst_tz; std::vector srcs_md; + srcs_md.reserve(in_vars.size()); for (size_t i = 0; i < in_vars.size(); i++) { auto& input_it = in_vars[i]->Get(); if (input_it.numel() == 0) { continue; } - MKLDNNMemoryFormat input_format = input_it.format(); - srcs_md.push_back(dnnl::memory::desc( - src_tz, platform::MKLDNNGetDataType(), input_format)); + srcs_md.push_back(input_it.mem_desc()); ++num_inputs_; } - std::vector scales(num_inputs_, 1.0); + std::vector scales(num_inputs_, 1.0f); auto dst_md = dnnl::memory::desc(dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); @@ -139,47 +138,27 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { ++input_index; } - std::shared_ptr dst_mem = nullptr; + std::unordered_map args; + std::shared_ptr dst_mem; + + for (size_t i = 0; i < srcs_mem.size(); ++i) { + args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs_mem[i])}); + } + if (in_place) { - dst_mem = 
handler.AcquireDstMemory(); - output->mutable_data(ctx.GetPlace()); + dst_mem = srcs_mem[0]; } else { dst_mem = handler.AcquireDstMemory(output); } + args.insert({DNNL_ARG_DST, *dst_mem}); auto sum_p = handler.AcquireForwardPrimitive(); - std::unordered_map args; - for (size_t i = 0; i < srcs_mem.size(); ++i) { - args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs_mem[i])}); - } - args.insert({DNNL_ARG_DST, *dst_mem}); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); sum_p->execute(astream, args); astream.wait(); - // For in-place execution which sum does not have we need to fake it - // so from oneDNN dst memory we reorder data into input - if (in_place) { - auto& in_out = in_vars[0]->Get(); - auto output_tz = phi::vectorize(output->dims()); - platform::ReorderMKLDNNHandler reorder_handler( - output_tz, framework::TransToProtoVarType(output->dtype()), - framework::ToMKLDNNDataType( - framework::TransToProtoVarType(in_out.dtype())), - dev_ctx.GetEngine()); - - auto target_mem = reorder_handler.AcquireDstMemory( - output, in_out.format(), ctx.GetPlace()); - - auto reorder_p = reorder_handler.AcquireReorder(target_mem, dst_mem); - - reorder_p->execute(astream, *dst_mem, *target_mem); - astream.wait(); - } - output->set_layout(framework::DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat(*dst_mem)); + output->set_mem_desc(dst_mem->get_desc()); } }; diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 0e988557df626..b9866ba8c3647 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -32,9 +32,14 @@ USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +USE_OP_ITSELF(shape); +USE_OP_DEVICE_KERNEL(shape, MKLDNN); +USE_OP_ITSELF(crop); +USE_OP_DEVICE_KERNEL(crop, CPU); PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(shape, CPU, ALL_LAYOUT); namespace paddle { namespace operators { @@ -154,5 +159,122 @@ TEST(test_pool2d_relu_relu_nhwc, cpu_place) { platform::errors::InvalidArgument( "Computed shape does not match expected shape")); } + +TEST(test_pool2d_shape_nhwc, cpu_place) { + framework::DDim dims({1, 4, 8, 512}); // NHWC shape + std::vector expected_dims{1, 3, 7, 512}; // NHWC expected shape + platform::CPUPlace p; + framework::Scope scope; + + InputVars input_name = {"x", + scope.Var("x")->GetMutable()}; + // Initialize input data + std::uniform_real_distribution dist(static_cast(10.0), + static_cast(20.0)); + std::mt19937 engine; + size_t numel = static_cast(phi::product(dims)); + input_name.tensor->Resize(dims); + auto data_ptr = input_name.tensor->mutable_data(p); + for (size_t i = 0; i < numel; ++i) { + data_ptr[i] = dist(engine); + } + + scope.Var("y")->GetMutable(); + auto *z = scope.Var("z")->GetMutable(); + + auto &pool = platform::DeviceContextPool::Instance(); + + // Make pool2d followed by shape. 
shape for NHWC should return + // as output tensor not-rotated shape of Pool ( + + auto ksize = std::vector(2, 2); + auto op_pool = framework::OpRegistry::CreateOp( + "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}}, + {{"pooling_type", {std::string("max")}}, + {"ksize", {ksize}}, + {"data_format", {std::string("NHWC")}}, + {"use_mkldnn", {true}}}); + + auto op_shape = framework::OpRegistry::CreateOp( + "shape", {{"Input", {"y"}}}, {{"Out", {"z"}}}, {{"use_mkldnn", {true}}}); + + op_pool->Run(scope, p); + op_shape->Run(scope, p); + + pool.Get(p)->Wait(); + + // repack tensor data into vector for easy comparison + auto *zdata = z->data(); + std::vector vzdata(zdata, zdata + z->numel()); + + // Verify shape of output + PADDLE_ENFORCE_EQ(vzdata, expected_dims, + platform::errors::InvalidArgument( + "Computed shape does not match expected shape")); +} + +TEST(test_pool2d_crop_nhwc, cpu_place) { + framework::DDim dims({1, 4, 8, 512}); // NHWC shape + framework::DDim expected_dims({1, 3, 7, 512}); // NCHW expected shape + platform::CPUPlace p; + framework::Scope scope; + + InputVars input_name = {"x", + scope.Var("x")->GetMutable()}; + InputVars second_crop_input_name = { + "v", scope.Var("v")->GetMutable()}; + // Initialize input data + std::uniform_real_distribution dist(10.0f, 20.0f); + std::mt19937 engine; + size_t numel = static_cast(phi::product(dims)); + input_name.tensor->Resize(dims); + auto data_ptr = input_name.tensor->mutable_data(p); + for (size_t i = 0; i < numel; ++i) { + data_ptr[i] = dist(engine); + } + // Second input (Y) to crop is having no buffer + // but as it is MKLDNN then its shape order should be NCHW + auto expected_dims_nchw = phi::vectorize(expected_dims); + std::rotate(expected_dims_nchw.begin() + 1, expected_dims_nchw.end() - 1, + expected_dims_nchw.end()); + second_crop_input_name.tensor->Resize(phi::make_ddim(expected_dims_nchw)); + const auto second_crop_input_md = + dnnl::memory::desc(expected_dims_nchw, dnnl::memory::data_type::f32, + dnnl::memory::format_tag::nhwc); + second_crop_input_name.tensor->set_mem_desc(second_crop_input_md); + + scope.Var("y")->GetMutable(); + auto *z = scope.Var("z")->GetMutable(); + + auto &pool = platform::DeviceContextPool::Instance(); + + // Make pool2d followed by crop. 
crop may have Y input as + // non buffered so the path to be executed is handling oneDNN kernel + // that is followed by CPU kernel with non-buffered Input + + auto ksize = std::vector(2, 2); + auto op_pool = framework::OpRegistry::CreateOp( + "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}}, + {{"pooling_type", {std::string("max")}}, + {"ksize", {ksize}}, + {"data_format", {std::string("NHWC")}}, + {"use_mkldnn", {true}}}); + + std::vector offsets{0, 0, 0, 0}; + auto op_crop = framework::OpRegistry::CreateOp( + "crop", {{"X", {"y"}}, {"Y", {"v"}}}, {{"Out", {"z"}}}, + {{"offsets", {offsets}}}); + + op_pool->Run(scope, p); + op_crop->Run(scope, p); + + pool.Get(p)->Wait(); + + // Verify shape of output + PADDLE_ENFORCE_EQ(z->dims(), expected_dims, + platform::errors::InvalidArgument( + "Output shape does not match expected output shape")); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index eacab46800580..9d3b8e2407fbf 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -34,6 +34,12 @@ cnnlCastDataType_t GetCastDataType(const VT::Type& src_type, return cast_type; } +cnnlCastDataType_t GetCastDataType(const DataType& src_type, + const DataType& dst_type) { + return GetCastDataType(framework::TransToProtoVarType(src_type), + framework::TransToProtoVarType(dst_type)); +} + bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type) { for (auto it = MLU_SUPPORTED_CAST_TYPE.begin(); it != MLU_SUPPORTED_CAST_TYPE.end(); ++it) { @@ -688,8 +694,9 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { const cnnlTensorDescriptor_t diff_y_desc, void* back_out) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits( - handle, mode, x_desc, input, label_desc, label, y_desc, output, + const cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits_v2( + handle, prefer, mode, x_desc, input, label_desc, label, y_desc, output, diff_y_desc, back_out)); } @@ -697,14 +704,14 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { const bool exclusive, const bool reverse, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t ouput_desc, + const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); // NAN propagation mode: Only support CNNL_NOT_PROPAGATE_NAN now. 
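// Editor's sketch (not part of the patch): the shape oneDNN kernel and the
// NHWC tests above convert between NCHW and NHWC orderings by rotating the
// entries after the batch dimension with std::rotate. A standalone
// illustration of the two rotations, using plain vectors and the sample
// extents from the pool2d test ({1, 3, 7, 512} in NHWC).
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // NCHW -> NHWC: move the channel entry (index 1) to the back,
  // as done in the shape MKLDNN kernel above.
  std::vector<int64_t> dims{1, 512, 3, 7};  // NCHW
  std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end());
  assert((dims == std::vector<int64_t>{1, 3, 7, 512}));  // NHWC

  // NHWC -> NCHW: move the last entry (channels) back to index 1,
  // as done when preparing the crop test's second input.
  std::rotate(dims.begin() + 1, dims.end() - 1, dims.end());
  assert((dims == std::vector<int64_t>{1, 512, 3, 7}));  // NCHW

  return 0;
}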
cnnlNanPropagation_t mode = CNNL_NOT_PROPAGATE_NAN; PADDLE_ENFORCE_MLU_SUCCESS(cnnlCumsum(handle, input_desc, input, axis, - exclusive, reverse, mode, ouput_desc, + exclusive, reverse, mode, output_desc, output)); } @@ -805,17 +812,17 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { } /* static */ void MLUCnnl::ApplyAdam( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, - const void* grad, const void* lr, const void* beta1, const void* beta2, - const void* beta1_power, const void* beta2_power, const void* epsilon, - const bool use_nesterov, const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t m_desc, void* m, - const cnnlTensorDescriptor_t v_desc, void* v) { + const ExecutionContext& ctx, const cnnlTensorDescriptor_t var_desc, + void* var, const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, + const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, + const void* beta1, const void* beta2, const void* beta1_power, + const void* beta2_power, const void* epsilon, const bool use_nesterov) { cnnlHandle_t handle = GetHandleFromCTX(ctx); PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdam( - handle, grad_desc, var, grad_desc, m, grad_desc, v, grad_desc, grad, lr, - beta1, beta2, beta1_power, beta2_power, epsilon, use_nesterov)); + handle, var_desc, var, m_desc, m, v_desc, v, grad_desc, grad, lr, beta1, + beta2, beta1_power, beta2_power, epsilon, use_nesterov)); } /* static */ void MLUCnnl::ApplyAdaMax( @@ -2077,6 +2084,45 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { } } +/* static */ void MLUCnnl::LayerNormForward( + const ExecutionContext& ctx, int axis, const cnnlTensorDescriptor_t x_desc, + const void* x, const cnnlTensorDescriptor_t weight_bias_desc, + const void* weight, const void* bias, float eps, + const cnnlTensorDescriptor_t y_desc, void* y, + const cnnlTensorDescriptor_t mean_rstd_desc, void* saved_mean, + void* saved_rstd) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetLayerNormOpWorkspaceSize(handle, axis, x_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlLayerNormForward(handle, x_desc, x, axis, weight_bias_desc, weight, + bias, eps, workspace_ptr, workspace_size, y_desc, y, + mean_rstd_desc, saved_mean, saved_rstd)); +} + +/* static */ void MLUCnnl::LayerNormBackward( + const ExecutionContext& ctx, int axis, const cnnlTensorDescriptor_t x_desc, + const void* x, const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z, + const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, + const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean, + const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x, void* diff_weight, void* diff_bias) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlLayerNormBackward( + handle, x_desc, x, axis, diff_z_desc, diff_z, weight_bias_desc, weight, + mean_rstd_desc, saved_mean, saved_rstd, diff_x_desc, diff_x, diff_weight, + diff_bias)); +} + /* static */ void MLUCnnl::QuantizeParam( const ExecutionContext& ctx, const cnnlQuantizeMode_t mode, const int bitwidth, const cnnlTensorDescriptor_t input_desc, @@ -2673,17 +2719,16 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } -/* static */ void 
MLUCnnl::ScatterNd(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t indices_desc, - const void* indices, - const cnnlTensorDescriptor_t updates_desc, - const void* updates, - const cnnlTensorDescriptor_t output_desc, - void* output) { +/* static */ void MLUCnnl::ScatterNd( + const ExecutionContext& ctx, cnnlScatterNdMode_t mode, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t updates_desc, const void* updates, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterNd(handle, indices_desc, indices, - updates_desc, updates, output_desc, - output)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlScatterNd_v2(handle, mode, indices_desc, indices, updates_desc, + updates, input_desc, input, output_desc, output)); } /* static */ void MLUCnnl::BitWise( @@ -2737,5 +2782,26 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { cnnlReciprocal(handle, input_desc, input, output_desc, output)); } +/* static */ void MLUCnnl::EmbeddingBackward( + const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t diff_desc, const void* diff, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetEmbeddingBackwardWorkspaceSize( + handle, diff_desc, output_desc, scale_grad_by_freq, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlEmbeddingBackward( + handle, padding_idx, scale_grad_by_freq, indices_desc, indices, diff_desc, + diff, workspace_ptr, workspace_size, output_desc, output)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 572b7aa2bbd01..f048ac7c5c3be 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -146,10 +146,8 @@ const std::map, cnnlCastDataType_t> {{VT::FP16, /*cast to*/ VT::BOOL}, CNNL_CAST_HALF_TO_BOOL}, {{VT::INT32, /*cast to*/ VT::FP32}, CNNL_CAST_INT32_TO_FLOAT}, {{VT::INT32, /*cast to*/ VT::FP16}, CNNL_CAST_INT32_TO_HALF}, - {{VT::INT32, /*cast to*/ VT::INT64}, CNNL_CAST_INT32_TO_INT64}, - {{VT::INT32, /*cast to*/ VT::INT16}, CNNL_CAST_INT32_TO_INT16}, {{VT::INT32, /*cast to*/ VT::INT8}, CNNL_CAST_INT32_TO_INT8}, - {{VT::INT32, /*cast to*/ VT::BOOL}, CNNL_CAST_INT32_TO_BOOL}, + {{VT::INT32, /*cast to*/ VT::INT16}, CNNL_CAST_INT32_TO_INT16}, {{VT::INT16, /*cast to*/ VT::FP32}, CNNL_CAST_INT16_TO_FLOAT}, {{VT::INT16, /*cast to*/ VT::FP16}, CNNL_CAST_INT16_TO_HALF}, {{VT::INT16, /*cast to*/ VT::INT32}, CNNL_CAST_INT16_TO_INT32}, @@ -158,16 +156,29 @@ const std::map, cnnlCastDataType_t> {{VT::INT8, /*cast to*/ VT::INT32}, CNNL_CAST_INT8_TO_INT32}, {{VT::UINT8, /*cast to*/ VT::FP32}, CNNL_CAST_UINT8_TO_FLOAT}, {{VT::UINT8, /*cast to*/ VT::FP16}, CNNL_CAST_UINT8_TO_HALF}, - {{VT::UINT8, /*cast to*/ VT::INT64}, CNNL_CAST_UINT8_TO_INT64}, - {{VT::UINT8, /*cast to*/ VT::INT32}, CNNL_CAST_UINT8_TO_INT32}, {{VT::BOOL, /*cast to*/ VT::FP32}, CNNL_CAST_BOOL_TO_FLOAT}, {{VT::BOOL, /*cast to*/ VT::FP16}, CNNL_CAST_BOOL_TO_HALF}, {{VT::BOOL, /*cast to*/ 
VT::INT32}, CNNL_CAST_BOOL_TO_INT32}, + {{VT::UINT8, /*cast to*/ VT::INT32}, CNNL_CAST_UINT8_TO_INT32}, + {{VT::INT32, /*cast to*/ VT::INT64}, CNNL_CAST_INT32_TO_INT64}, {{VT::INT64, /*cast to*/ VT::INT32}, CNNL_CAST_INT64_TO_INT32}, + {{VT::INT32, /*cast to*/ VT::BOOL}, CNNL_CAST_INT32_TO_BOOL}, + {{VT::UINT8, /*cast to*/ VT::INT64}, CNNL_CAST_UINT8_TO_INT64}, + {{VT::INT8, /*cast to*/ VT::INT16}, CNNL_CAST_INT8_TO_INT16}, + {{VT::FP32, /*cast to*/ VT::FP64}, CNNL_CAST_FLOAT_TO_DOUBLE}, + {{VT::FP64, /*cast to*/ VT::FP32}, CNNL_CAST_DOUBLE_TO_FLOAT}, + {{VT::INT64, /*cast to*/ VT::FP32}, CNNL_CAST_INT64_TO_FLOAT}, + {{VT::INT64, /*cast to*/ VT::FP16}, CNNL_CAST_INT64_TO_HALF}, + {{VT::FP32, /*cast to*/ VT::INT64}, CNNL_CAST_FLOAT_TO_INT64}, + {{VT::FP16, /*cast to*/ VT::INT64}, CNNL_CAST_HALF_TO_INT64}, }; cnnlCastDataType_t GetCastDataType(const VT::Type& src_type, const VT::Type& dst_type); + +cnnlCastDataType_t GetCastDataType(const DataType& src_type, + const DataType& dst_type); + bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type); cnnlDeviceType_t GetCnnlDev(int dev_ordinal); @@ -496,14 +507,14 @@ class MLUCnnl { const cnnlTensorDescriptor_t mom_desc, void* mom); static void ApplyAdam(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, const void* beta1, const void* beta2, const void* beta1_power, const void* beta2_power, const void* epsilon, - const bool use_nesterov, - const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t m_desc, void* m, - const cnnlTensorDescriptor_t v_desc, void* v); + const bool use_nesterov); static void ApplyAdaMax(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, @@ -1103,6 +1114,24 @@ class MLUCnnl { const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop, void* scale_backprop, void* offset_backprop); + static void LayerNormForward(const ExecutionContext& ctx, int axis, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t weight_bias_desc, + const void* weight, const void* bias, float eps, + const cnnlTensorDescriptor_t y_desc, void* y, + const cnnlTensorDescriptor_t mean_rstd_desc, + void* saved_mean, void* saved_rstd); + + static void LayerNormBackward( + const ExecutionContext& ctx, int axis, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z, + const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, + const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean, + const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x, void* diff_weight, void* diff_bias); + static void Transpose(const ExecutionContext& ctx, const std::vector perm, const int input_dim, const cnnlTensorDescriptor_t input_desc, @@ -1177,11 +1206,13 @@ class MLUCnnl { const void* k, const int k_int, const cnnlTensorDescriptor_t output_desc, void* output); - static void ScatterNd(const ExecutionContext& ctx, + static void ScatterNd(const ExecutionContext& ctx, cnnlScatterNdMode_t mode, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlTensorDescriptor_t updates_desc, const void* updates, + const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output); static void BitWise(const ExecutionContext& ctx, @@ 
-1202,6 +1233,12 @@ class MLUCnnl { const void* input, const cnnlTensorDescriptor_t output_desc, void* output); + + static void EmbeddingBackward( + const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t diff_desc, const void* diff, + const cnnlTensorDescriptor_t output_desc, void* output); }; template @@ -1230,5 +1267,13 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, GetBasePtr(transformed_output)); } +template +inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx, T value, + Tensor* out) { + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), + GetBasePtr(out)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc new file mode 100644 index 0000000000000..316554e98f01e --- /dev/null +++ b/paddle/fluid/operators/multinomial_op_npu.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in +// cmake/operators.cmake when Paddle supports +#if (CANN_VERSION_CODE >= 504000) + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/npu/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class NPUMultinomialKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + const int64_t num_samples = ctx.Attr("num_samples"); + const bool replacement = ctx.Attr("replacement"); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + out->mutable_data(place); + + const auto& runner = NpuOpRunner( + "MultinomialWithReplacementD", {*x}, {*out}, + {{"num_samples", num_samples}, {"replacement", replacement}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + multinomial, + ops::NPUMultinomialKernel, + ops::NPUMultinomialKernel) +#endif diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc new file mode 100644 index 0000000000000..9d335021234eb --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -0,0 +1,285 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class AdamMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + auto* param = ctx.Input("Param"); + auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Grad(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(param_var->Type()))); + auto* grad = ctx.Input("Grad"); + auto* mom1 = ctx.Input("Moment1"); + auto* mom2 = ctx.Input("Moment2"); + auto* lr = ctx.Input("LearningRate"); + + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); + + auto* param_out = ctx.Output("ParamOut"); + auto* mom1_out = ctx.Output("Moment1Out"); + auto* mom2_out = ctx.Output("Moment2Out"); + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); + skip_update = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update) { + VLOG(4) << "Adam skip update"; + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); + framework::TensorCopy( + *beta1_pow, beta1_pow->place(), + ctx.template device_context(), + beta1_pow_out); + framework::TensorCopy( + *beta2_pow, beta2_pow->place(), + ctx.template device_context(), + beta2_pow_out); + return; + } + + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + param_out->ShareDataWith(*param); + mom1_out->ShareDataWith(*mom1); + mom2_out->ShareDataWith(*mom2); + + LoDTensor beta1_pow_tmp; + LoDTensor beta2_pow_tmp; + if (beta1_pow->place() == platform::CPUPlace()) { + T beta1 = *beta1_pow->data(); + beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp); + MLUCnnl::Fill(ctx, 
CNNL_POINTER_MODE_HOST, &beta1, + beta1_pow_tmp_desc.get(), GetBasePtr(&beta1_pow_tmp)); + beta1_pow = &beta1_pow_tmp; + } + if (beta2_pow->place() == platform::CPUPlace()) { + T beta2 = *beta2_pow->data(); + beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, + beta2_pow_tmp_desc.get(), GetBasePtr(&beta2_pow_tmp)); + beta2_pow = &beta2_pow_tmp; + } + + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() + << "beta2_pow.numel() : " << beta2_pow->numel(); + VLOG(3) << "param.numel(): " << param->numel(); + + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const Tensor* beta1_tensor = nullptr; + const Tensor* beta2_tensor = nullptr; + const Tensor* epsilon_tensor = nullptr; + + Tensor beta1_tmp(experimental::DataType::FLOAT32); + Tensor beta2_tmp(experimental::DataType::FLOAT32); + Tensor epsilon_tmp(experimental::DataType::FLOAT32); + + if (ctx.HasInput("Beta1Tensor")) { + beta1_tensor = ctx.Input("Beta1Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta1Tensor) size must be 1, but get %d", + beta1_tensor->numel())); + } else { + T beta1 = static_cast(ctx.Attr("beta1")); + beta1_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1, beta1_tmp_desc.get(), + GetBasePtr(&beta1_tmp)); + beta1_tensor = &beta1_tmp; + } + + if (ctx.HasInput("Beta2Tensor")) { + beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta2Tensor) size must be 1, but get %d", + beta2_tensor->numel())); + } else { + T beta2 = static_cast(ctx.Attr("beta2")); + beta2_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, beta2_tmp_desc.get(), + GetBasePtr(&beta2_tmp)); + beta2_tensor = &beta2_tmp; + } + + if (ctx.HasInput("EpsilonTensor")) { + epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + } else { + T epsilon = static_cast(ctx.Attr("epsilon")); + epsilon_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &epsilon, + epsilon_tmp_desc.get(), GetBasePtr(&epsilon_tmp)); + epsilon_tensor = &epsilon_tmp; + } + + MLUCnnlTensorDesc param_desc(*param); + MLUCnnlTensorDesc mom1_desc(*mom1); + MLUCnnlTensorDesc mom2_desc(*mom2); + MLUCnnlTensorDesc grad_desc(*grad); + MLUCnnl::ApplyAdam(ctx, param_desc.get(), GetBasePtr(param_out), + mom1_desc.get(), GetBasePtr(mom1_out), mom2_desc.get(), + GetBasePtr(mom2_out), grad_desc.get(), GetBasePtr(grad), + GetBasePtr(lr), GetBasePtr(beta1_tensor), + GetBasePtr(beta2_tensor), GetBasePtr(beta1_pow), + GetBasePtr(beta2_pow), GetBasePtr(epsilon_tensor), + /*use_nesterov*/ false); + + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc 
beta1_desc(*beta1_tensor); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(), + GetBasePtr(beta1_pow), beta1_desc.get(), + GetBasePtr(beta1_tensor), beta1_desc.get(), + GetBasePtr(beta1_pow_out), ToCnnlDataType()); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(), + GetBasePtr(beta2_pow), beta1_desc.get(), + GetBasePtr(beta2_tensor), beta1_desc.get(), + GetBasePtr(beta2_pow_out), ToCnnlDataType()); + } + } +}; + +template +class AdamWMLUKernel : public AdamMLUKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + VLOG(3) << "MLU AdamW Kernel"; + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + VLOG(3) << "Has SkipUpdate"; + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); + skip_update = skip_update_vec[0]; + } + VLOG(3) << "Skip update" << skip_update; + bool with_decay = ctx.Attr("with_decay"); + if (!skip_update && with_decay) { + if (ctx.HasInput("MasterParam")) { + PADDLE_THROW(platform::errors::Unimplemented( + "Master Param is not supported on MLU")); + } else { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + auto* param = ctx.Input("Param"); + auto* lr = ctx.Input("LearningRate"); + float coeff = ctx.Attr("coeff"); + + // update param with decay coeff: mul(-1 * lr, coeff * param) + param + MLUCnnlTensorDesc lr_desc(*lr); + MLUCnnlTensorDesc param_desc(*param); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), lr_desc.get(), GetBasePtr(lr), + param_desc.get(), GetBasePtr(param), param_desc.get(), + const_cast(GetBasePtr(param)), + ToCnnlDataType(), + /*alpha1*/ -1.f, /*alpha2*/ coeff, /*beta*/ 1.f); + } + } + AdamMLUKernel::Compute(ctx); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(adam, ops::AdamMLUKernel, + ops::AdamMLUKernel); + +REGISTER_OP_MLU_KERNEL(adamw, ops::AdamWMLUKernel, + ops::AdamWMLUKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc index e7cbe4aa8dd4b..7aa5783a01bfd 100644 --- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/optimizers/lamb_op.h" #include "gflags/gflags.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -70,44 +71,18 @@ class LambOpXPUKernel : public framework::OpKernel { if (grad_var->IsType()) { auto& grad = *ctx.Input("Grad"); - int r = xpu::lamb(dev_ctx.x_context(), grad.template data(), - mom1.template data(), mom2.template data(), - param.template data(), beta1_pow.template data(), - beta2_pow.template data(), beta1, beta2, epsilon, - weight_decay, lr.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), - beta1_pow_out.template mutable_data(ctx.GetPlace()), - beta2_pow_out.template mutable_data(ctx.GetPlace()), - param.numel()); + int r = xpu::lamb( + dev_ctx.x_context(), grad.template data(), mom1.template data(), + mom2.template data(), param.template data(), + beta1_pow.template data(), beta2_pow.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), + beta1_pow_out.template mutable_data(ctx.GetPlace()), + beta2_pow_out.template mutable_data(ctx.GetPlace()), beta1, beta2, + epsilon, weight_decay, lr.template data(), param.numel()); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of LambOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of LambOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of LambOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } else { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of LambOp, error " - "message: OTHER " - "XPU API returns error code: %d.", - r)); - } + PADDLE_ENFORCE_XDNN_SUCCESS(r, "lamb"); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Variable type not supported by lamb_op. Expect LoDTensor, " diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index 85c2d42c841f0..b53d51686cfd7 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -105,40 +106,15 @@ class RmspropOpXPUKernel : public framework::OpKernel { /// const float* ms, const float* g, const float* mom, /// float epsilon, float rho, float momentum, float lr, /// float *ms_out, float *mom_out, float *p_out, int n) - int r = xpu::rmsprop(dev_ctx.x_context(), param.template data(), - meanSquare.template data(), grad.template data(), - mom.template data(), epsilon, decay, momentum, lr, + int r = xpu::rmsprop(dev_ctx.x_context(), grad.template data(), + param.template data(), + meanSquare.template data(), mom.template data(), + param_out.template mutable_data(ctx.GetPlace()), mom_sqrt_out.template mutable_data(ctx.GetPlace()), mom_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), - param.numel()); - - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of RmspropOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of RmspropOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of RmspropOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } else { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of RmspropOp, error " - "message: OTHER " - "XPU API returns error code: %d.", - r)); - } + epsilon, decay, momentum, lr, param.numel()); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "rmsprop"); } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc index 9dabca1b66a77..e7c03be95cae1 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc @@ -14,11 +14,15 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/optimizers/sgd_op.h" #include +#include "paddle/fluid/platform/device/device_wrapper.h" + namespace paddle { namespace operators { template class SGDOpXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *learning_rate = ctx.Input("LearningRate"); @@ -48,40 +52,31 @@ class SGDOpXPUKernel : public framework::OpKernel { "numel = [%s], ParamOut's numel = [%s]", grad->numel(), sz)); - const T *lr = learning_rate->data(); + const T *lr_t = learning_rate->data(); + auto &dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + const float *lr = nullptr; + if (std::is_same::value) { + float *lr_float = + RAII_GUARD.alloc_l3_or_gm(learning_rate->numel()); + int r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(lr_t), + lr_float, learning_rate->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); + lr = lr_float; + } else { + lr = reinterpret_cast(lr_t); + } + const T *param_data = param->data(); const T *grad_data = grad->data(); T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); - int r = xpu::sgd(dev_ctx.x_context(), sz, grad_data, param_data, lr, - out_data); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of SgdOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of SgdOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of SgdOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } - } else { - PADDLE_ENFORCE_EQ(false, true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Param & Grad in " - "SgdOp-XPU. Excepted " - "LodTensor, But received [%s] and [%s]", - paddle::framework::ToTypeName(param_var->Type()))); + int r = xpu::sgd(dev_ctx.x_context(), + reinterpret_cast(grad_data), + reinterpret_cast(param_data), lr, + reinterpret_cast(out_data), sz); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sgd"); } } }; @@ -90,6 +85,8 @@ class SGDOpXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - sgd, ops::SGDOpXPUKernel); + sgd, ops::SGDOpXPUKernel, + ops::SGDOpXPUKernel); #endif diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index a9646b2e8acb5..cbe58644f5381 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -123,7 +123,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { column, depth, platform::errors::OutOfRange( "Attr(column) should be less than depth(the second " - "dimension of Input(Score)). Recieved Attr(column): %d, while " + "dimension of Input(Score)). 
Received Attr(column): %d, while " "depth is %d.", column, depth)); PADDLE_ENFORCE_GE( @@ -131,7 +131,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { platform::errors::OutOfRange( "Attr(column) should be greater than equal to negative " "depth, i.e. the second dimension of Input(Score). " - "Recieved Attr(column): %d, while negative depth is %d.", + "Received Attr(column): %d, while negative depth is %d.", column, -depth)); } diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 7228bdbf3805a..6a2ed6592e7fe 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -98,7 +98,7 @@ static void VisitDataType(paddle::experimental::DataType type, visitor.template apply(); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The recieved values gate_id type %s can not meet input requirements. " + "The received values gate_id type %s can not meet input requirements. " "Because the given gate_id data type of operators must be " "int64. Please input appropriate gate_id again! ", "framework::DataTypeToString(type)")); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 2df0d7526a3d3..457e37744d316 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -63,7 +63,7 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { PADDLE_ENFORCE_EQ(pieces.size(), 2, platform::errors::PreconditionNotMet( "Invalid format of message_and_id argument. " - "Expected \"message:block_id\". Recieved %s", + "Expected \"message:block_id\". Received %s", grad_and_id.c_str())); PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0, platform::errors::AlreadyExists( @@ -82,7 +82,7 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { PADDLE_ENFORCE_GE(num_blocks, 1, platform::errors::PreconditionNotMet( "Invalid number of blocks in server program. Expected " - "equal or greater than 1. Recieved %zu", + "equal or greater than 1. Received %zu", num_blocks)); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index f721608cffb08..abfdb62ec34ac 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -26,6 +26,7 @@ template static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { auto inputs = ctx.MultiInput("Ids"); auto outputs = ctx.MultiOutput("Out"); + auto embedding_size_vec = ctx.Attr>("size"); const auto slot_size = inputs.size(); std::vector all_keys(slot_size); // GpuPSPS only supports float now @@ -44,7 +45,7 @@ static void PullGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { #ifdef PADDLE_WITH_HETERPS auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance(); gpu_ps_ptr->PullSparse(ctx.GetPlace(), 0, all_keys, all_values, slot_lengths, - 0); + embedding_size_vec, 0); #endif } diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 4b6759ea165ed..db0f5758d2f53 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -85,10 +86,27 @@ BufferedReader::BufferedReader( stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::XPUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::XpuEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::XpuStreamResourcePool::Instance().New(dev_idx); + } +#endif + cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); mlu_buffer_.resize(buffer_size); + xpu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -322,6 +340,57 @@ void BufferedReader::ReadAsync(size_t i) { platform::MLUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place_)) { + TensorVec &xpu = xpu_buffer_[i]; + if (xpu.empty()) { + xpu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + xpu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on XPU and CPU devices are not matched. " + "The number on XPU is %d, on CPU is %d", + xpu.size(), cpu.size())); + } + + std::vector xpu_ptrs; + xpu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + xpu[i].Resize(cpu[i].dims()); + xpu[i].set_layout(cpu[i].layout()); + xpu_ptrs.emplace_back(xpu[i].mutable_data(place_, cpu[i].type())); + } + + platform::XPUDeviceGuard gurad(place_.device); + int r = xpu_event_record(events_[i].get(), compute_stream_); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_event_record"); + r = xpu_stream_wait_event(stream_.get(), events_[i].get()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_stream_wait_event"); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto xpu_ptr = xpu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + // TODO(zhanghuan) for now hardware not support xpu_memcpy_async, maybe + // KL3 + if ((platform::is_xpu_place(cpu_place))) { + memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size); + platform::XPUStreamSync(stream_.get()); + } else { + memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size); + } + xpu[i].set_lod(cpu[i].lod()); + } + platform::XPUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -359,6 +428,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(npu_buffer_[i]); } else if (platform::is_mlu_place(place_)) { *out = std::move(mlu_buffer_[i]); + } else if (platform::is_xpu_place(place_)) { + *out = std::move(xpu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index f0f3b6b7f9fdf..52d3d8d6999a0 100644 --- 
a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -33,6 +33,10 @@ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" +#endif namespace paddle { namespace operators { @@ -76,6 +80,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cuda_buffer_; std::vector npu_buffer_; std::vector mlu_buffer_; + std::vector xpu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -94,6 +99,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_XPU + xpuStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index f99b72faba4ae..04660fb501142 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -105,6 +105,68 @@ class ReduceMaxNPUKernel : public framework::OpKernel { } }; +template +class ReduceMaxGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + int in_dtype = context.Attr("in_dtype"); + + PADDLE_ENFORCE_EQ( + in_dtype == -1, true, + platform::errors::InvalidArgument( + "NPU only support in_dtype == -1 in reduce_max_grad op.")); + + auto* x_grad = context.Output(framework::GradVarName("X")); + x_grad->mutable_data(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + auto place = context.GetPlace(); + auto stream = dev_ctx.stream(); + + // broadcast + auto x_dims_vec = phi::vectorize(x->dims()); + Tensor transformed_out(x->type()); + transformed_out.Resize(phi::make_ddim(x_dims_vec)); + transformed_out.mutable_data(place); + NpuOpRunner r_brd_out; + r_brd_out.SetType("BroadcastTo") + .AddInput(*out) + .AddInput(std::move(x_dims_vec)) + .AddOutput(transformed_out) + .Run(stream); + Tensor transformed_out_grad(x->type()); + transformed_out_grad.Resize(phi::make_ddim(x_dims_vec)); + transformed_out_grad.mutable_data(place); + NpuOpRunner r_brd_out_grad; + r_brd_out_grad.SetType("BroadcastTo") + .AddInput(*out_grad) + .AddInput(std::move(x_dims_vec)) + .AddOutput(transformed_out_grad) + .Run(stream); + + // compare + Tensor equal_cond; + equal_cond.mutable_data(x_grad->dims(), place); + const auto& r_equal = + NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {}); + r_equal.Run(stream); + + // select + Tensor t_zero; + t_zero.mutable_data(x_grad->dims(), place); + FillNpuTensorWithConstant(&t_zero, static_cast(0)); + t_zero.Resize(x_grad->dims()); + + const auto& r_sel = NpuOpRunner( + "SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {}); + r_sel.Run(stream); + } +}; 
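+// Gradient rule implemented above: Out and dOut are first broadcast back to
+// the shape of X; positions where X equals the broadcast maximum receive the
+// broadcast upstream gradient and all other positions receive zero. When
+// several elements tie for the maximum, every tied position gets the full
+// upstream gradient.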
+ } // namespace operators } // namespace paddle @@ -115,3 +177,8 @@ REGISTER_OP_NPU_KERNEL( ops::ReduceMaxNPUKernel, ops::ReduceMaxNPUKernel, ops::ReduceMaxNPUKernel); +REGISTER_OP_NPU_KERNEL( + reduce_max_grad, ops::ReduceMaxGradNPUKernel, + ops::ReduceMaxGradNPUKernel, + ops::ReduceMaxGradNPUKernel, + ops::ReduceMaxGradNPUKernel); diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index 220d91bf4faab..941e463f63cdc 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -65,7 +65,7 @@ class RnnXPUKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); auto* dropout_mask = ctx.Output("DropoutState"); auto* reserve_data = ctx.Output("Reserve"); - // Attrbutes + // Attributes const int& num_layers = ctx.Attr("num_layers"); const bool& is_bidirec = ctx.Attr("is_bidirec"); const int& hidden_size = ctx.Attr("hidden_size"); diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 13490d6fcde3a..7be1c19012099 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -37,7 +37,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto sampling_ratio = ctx.Attr("sampling_ratio"); auto aligned = ctx.Attr("aligned"); - auto in_dims = in->dims(); + const auto& in_dims = in->dims(); int batch_size = in_dims[0]; int channels = in_dims[1]; int height = in_dims[2]; diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index 420c4c5f257ca..e02c7ade9a11a 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -58,7 +58,7 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput( "Probabilities", "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." 
- "The probabilites of sampled positive and negtive labels.") + "The probabilities of sampled positive and negative labels.") .AsIntermediate(); AddOutput("LogitsDim", "Store dim information of Logits for gradient op") .AsIntermediate(); diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index c22e07583df3b..daa033f9dc66d 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -178,7 +178,7 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) -#if (CANN_VERSION_CODE >= 504001) +#if (CANN_VERSION_CODE >= 504000) .AddAttrs({{"use_locking", false}}) #endif .Run(stream); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index f186f95a2b961..ed173bb3ebfa9 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -22,7 +22,7 @@ using Tensor = framework::Tensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { - // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits + // This check is added because Ascend SigmoidCrossEntropyWithLogits // and SigmoidCrossEntropyWithLogitsGrad do not support // attr normalize and ignore_index bool normalize = ctx.Attr("normalize"); diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc new file mode 100644 index 0000000000000..43322e4b2e75b --- /dev/null +++ b/paddle/fluid/operators/slice_op_mlu.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/slice_op.h" + +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/slice_utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SliceMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* out = ctx.Output("Out"); + + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto infer_flags = ctx.Attr>("infer_flags"); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); + } + + PADDLE_ENFORCE_EQ( + starts.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of starts must be equal to the size of axes.")); + PADDLE_ENFORCE_EQ( + ends.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of ends must be equal to the size of axes.")); + + const auto& in_dims = input->dims(); + auto slice_dims = out->dims(); + bool reset_slice_dims = false; + if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") || + starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) { + // Infer output dims + for (size_t i = 0; i < axes.size(); ++i) { + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; + } + } + } + + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + reset_slice_dims = true; + auto out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis); + + out->Resize(out_dims); + } + if (slice_dims.size() != in_dims.size() && !reset_slice_dims) { + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + } + + int in_dim_size = input->dims().size(); + if (static_cast(axes.size()) != in_dim_size) { + std::vector tmp_starts(in_dim_size, 0); + const auto& in_dims_vec = phi::vectorize(input->dims()); + std::vector tmp_ends(in_dims_vec.begin(), in_dims_vec.end()); + for (size_t i = 0; i < axes.size(); ++i) { + tmp_starts[axes[i]] = starts[i]; + tmp_ends[axes[i]] = ends[i]; + } + starts.swap(tmp_starts); + ends.swap(tmp_ends); + } + std::vector strides(in_dim_size, 1); + + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc out_desc(slice_dims.size(), + phi::vectorize(slice_dims).data(), + ToCnnlDataType()); + MLUCnnl::StridedSlice(ctx, starts.data(), ends.data(), strides.data(), + input_desc.get(), GetBasePtr(input), out_desc.get(), + GetBasePtr(out)); + } +}; + +template +class SliceGradMLUKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = ctx.Output(framework::GradVarName("Input")); + + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); + } + + const auto& in_dims = input->dims(); + auto slice_dims = dout->dims(); + if (slice_dims.size() != in_dims.size()) { + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + } + + int in_dim_size = input->dims().size(); + if (static_cast(axes.size()) != in_dim_size) { + std::vector tmp_starts(in_dim_size, 0); + const auto& in_dims_vec = phi::vectorize(input->dims()); + std::vector tmp_ends(in_dims_vec.begin(), in_dims_vec.end()); + for (size_t i = 0; i < axes.size(); ++i) { + tmp_starts[axes[i]] = starts[i]; + tmp_ends[axes[i]] = ends[i]; + } + starts.swap(tmp_starts); + ends.swap(tmp_ends); + } + std::vector strides(in_dim_size, 1); + + dinput->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc dout_desc(slice_dims.size(), + phi::vectorize(slice_dims).data(), + ToCnnlDataType()); + MLUCnnlTensorDesc dinput_desc(*dinput); + MLUCnnl::StridedSliceGrad(ctx, starts.data(), ends.data(), strides.data(), + dout_desc.get(), GetBasePtr(dout), + dinput_desc.get(), GetBasePtr(dinput)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(slice, ops::SliceMLUKernel, + ops::SliceMLUKernel, ops::SliceMLUKernel, + ops::SliceMLUKernel, + ops::SliceMLUKernel, + ops::SliceMLUKernel); + +REGISTER_OP_MLU_KERNEL(slice_grad, ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 33590c1d7cca0..8c6c083cde880 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -156,7 +156,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } } - // compute select rows seperately. + // compute select rows separately. if (!selectrow_index.empty()) { std::vector sr_in_out_data; size_t rows = 0; @@ -241,7 +241,7 @@ class SumKernel LodTensorArrayCompute(context); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Ouput(out) must be Tensor, SelectedRows or " + "Expected type of Output(out) must be Tensor, SelectedRows or " "LodTensorArray. But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); diff --git a/paddle/fluid/operators/take_along_axis_op_npu.cc b/paddle/fluid/operators/take_along_axis_op_npu.cc new file mode 100644 index 0000000000000..1d8ef0675c19e --- /dev/null +++ b/paddle/fluid/operators/take_along_axis_op_npu.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in +// cmake/operators.cmake when Paddle supports +#if (CANN_VERSION_CODE >= 504000) + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/npu/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class NPUTakeAlongAxisKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("Input"); + auto axis = ctx.Attr("Axis"); + auto index = ctx.Input("Index"); + auto result = ctx.Output("Result"); + result->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + const auto& runner = NpuOpRunner("GatherElements", {*input, *index}, + {*result}, {{"dim", axis}}); + runner.Run(stream); + } +}; + +template +class NPUTakeAlongAxisGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto axis = ctx.Attr("Axis"); + auto index = ctx.Input("Index"); + auto result_grad = ctx.Input(framework::GradVarName("Result")); + + auto input_grad = ctx.Output(framework::GradVarName("Input")); + input_grad->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + const auto& runner = + NpuOpRunner("ScatterAddWithAxis", {*input_grad, *index, *result_grad}, + {*input_grad}, {{"axis", axis}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + take_along_axis, + ops::NPUTakeAlongAxisKernel, + ops::NPUTakeAlongAxisKernel, + ops::NPUTakeAlongAxisKernel, + ops::NPUTakeAlongAxisKernel) +REGISTER_OP_NPU_KERNEL( + take_along_axis_grad, + ops::NPUTakeAlongAxisGradKernel, + ops::NPUTakeAlongAxisGradKernel, + ops::NPUTakeAlongAxisGradKernel, + ops::NPUTakeAlongAxisGradKernel) + +#endif diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index 963dfd3bf7720..e437975320cc5 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -149,7 +149,7 @@ class TDMChildKernel : public framework::OpKernel { output_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(out_type_match, true, platform::errors::InvalidArgument( - "Ouput(Child) & Output(LeafMask) holds the wrong " + "Output(Child) & Output(LeafMask) holds the wrong " "type, it holds %s, but " "desires to be %s or %s", paddle::framework::DataTypeToString(output_type), diff --git a/paddle/fluid/operators/tril_indices_op.cc b/paddle/fluid/operators/tril_indices_op.cc new file mode 100644 index 0000000000000..be42f53dd2344 --- /dev/null +++ b/paddle/fluid/operators/tril_indices_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/nullary.h" + +namespace paddle { +namespace operators { + +class TrilIndicesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); + } +}; + +class TrilIndicesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("out", + "Tensor, the output tensor, with the shape (2, x), x bounded by " + "[0, rows*cols]."); + AddAttr("rows", + "int number, the input of tril_indices op, " + "which describes the number of rows of the matrix") + .SetDefault(0); + AddAttr("cols", + "int number, the input of tril_indices op, " + "which describes the number of columns of the matrix") + .SetDefault(0); + AddAttr( + "offset", + "int number, the input of tril_indices op bounded by [1-rows, cols-1], " + "which describes the diagonal index of the lower triangular part of " + "the matrix") + .SetDefault(0); + AddAttr("dtype", "data type, the input of tril_indices op") + .SetDefault(framework::proto::VarType::INT64); + + AddComment(R"DOC( + TrilIndices Operator. + + The tril_indices operator returns the indices of the lower triangular part of the matrix + whose rows and cols are known. It is a 2-by-x tensor, where the first row contains row coordinates + of all indices and the second row contains column coordinates. Indices are ordered based on + rows and then columns. The lower triangular part of the matrix is defined as the elements on + and below the diagonal. + + The argument offset controls which diagonal to consider, default value is 0.
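+ For example, with rows=3, cols=3 and offset=0 the returned indices are
+ [[0, 1, 1, 2, 2, 2],
+  [0, 0, 1, 0, 1, 2]],
+ which correspond to the positions (0,0), (1,0), (1,1), (2,0), (2,1), (2,2).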
+  A positive value includes just as many diagonals above the main diagonal,
+  and similarly a negative value excludes just as many diagonals below the main diagonal.
+  )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(tril_indices, TrilIndicesInferShapeFunctor,
+                            PD_INFER_META(phi::TrilIndicesInferMeta));
+
+REGISTER_OPERATOR(
+    tril_indices, ops::TrilIndicesOp, ops::TrilIndicesOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    TrilIndicesInferShapeFunctor);
diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h
index ae846f4cae6fb..3e27402c86947 100644
--- a/paddle/fluid/operators/uniform_random_op.h
+++ b/paddle/fluid/operators/uniform_random_op.h
@@ -151,12 +151,6 @@ void UniformRandom(const framework::ExecutionContext& context,
   T* data = tensor->mutable_data(dev_cxt.GetPlace());
   if (size <= 0) return;
   unsigned int seed = static_cast(context.Attr("seed"));
-  bool seed_flag = false;
-  if (seed == 0) {
-    std::random_device rd;
-    seed = rd();
-    seed_flag = true;
-  }
 
   T min = static_cast(context.Attr("min"));
   T max = static_cast(context.Attr("max"));
@@ -165,14 +159,15 @@ void UniformRandom(const framework::ExecutionContext& context,
   unsigned int diag_step = static_cast(context.Attr("diag_step"));
   T diag_val = static_cast(context.Attr("diag_val"));
-  int device_id = context.GetPlace().GetDeviceId();
-  auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
-  if (gen_cuda->GetIsInitPy() && seed_flag) {
+
+  if (seed == 0) {
+    // Use global Generator seed
     using MT = typename details::MPTypeTrait::Type;
     phi::funcs::uniform_distribution dist;
     phi::funcs::uniform_real_transform trans(min, max);
     phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans);
   } else {
+    // Use OP seed
     auto func =
         UniformGenerator(min, max, seed, diag_num, diag_step, diag_val);
     phi::IndexKernel>(dev_cxt, tensor, func);
diff --git a/paddle/fluid/operators/unstack_op_mlu.cc b/paddle/fluid/operators/unstack_op_mlu.cc
new file mode 100644
index 0000000000000..9c4dd256a94ef
--- /dev/null
+++ b/paddle/fluid/operators/unstack_op_mlu.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
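To make the semantics documented in the TrilIndices comment above concrete, here is a minimal standalone sketch (not part of the patch; the helper name TrilIndices is only illustrative) that enumerates the indices the operator is documented to return for given rows, cols and offset.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Enumerate the (row, col) pairs on and below the `offset` diagonal, ordered
// by rows and then columns, matching the behaviour described in the op doc.
std::vector<std::pair<int64_t, int64_t>> TrilIndices(int64_t rows, int64_t cols,
                                                     int64_t offset) {
  std::vector<std::pair<int64_t, int64_t>> idx;
  for (int64_t i = 0; i < rows; ++i) {
    for (int64_t j = 0; j < cols; ++j) {
      if (j - i <= offset) idx.emplace_back(i, j);
    }
  }
  return idx;  // first members are row coordinates, second members are columns
}

int main() {
  for (const auto& p : TrilIndices(3, 3, 0)) {
    std::cout << "(" << p.first << "," << p.second << ") ";
  }
  // prints (0,0) (1,0) (1,1) (2,0) (2,1) (2,2)
}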
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class UnStackMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto out = ctx.MultiOutput("Y"); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += x->dims().size(); + int num = x->dims()[axis]; + + std::vector out_descs; + std::vector out_raw_descs; + std::vector out_ptrs; + std::vector new_dims = phi::vectorize(x->dims()); + new_dims[axis] = 1; + for (int i = 0; i < num; i++) { + out[i]->mutable_data(ctx.GetPlace()); + out_descs.emplace_back(MLUCnnlTensorDesc(new_dims.size(), new_dims.data(), + ToCnnlDataType())); + out_raw_descs.push_back(out_descs.back().get()); + out_ptrs.push_back(GetBasePtr(out[i])); + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnl::Split(ctx, num, axis, x_desc.get(), GetBasePtr(x), + out_raw_descs.data(), out_ptrs.data()); + } +}; + +template +class UnStackGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.MultiInput(framework::GradVarName("Y")); + auto *y = ctx.Output(framework::GradVarName("X")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += (x[0]->dims().size() + 1); + int num = static_cast(x.size()); + + std::vector x_descs; + std::vector x_raw_descs; + std::vector x_ptrs; + for (int i = 0; i < num; i++) { + if (x[i]->dims().size() != 0) { + std::vector in_dims = phi::vectorize(x[i]->dims()); + in_dims.insert(in_dims.begin() + axis, 1); + x_descs.emplace_back(MLUCnnlTensorDesc(in_dims.size(), in_dims.data(), + ToCnnlDataType())); + } else { + int input_dims = 1; + x_descs.emplace_back( + MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType())); + } + x_raw_descs.push_back(x_descs.back().get()); + x_ptrs.push_back(GetBasePtr(x[i])); + } + y->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc y_desc(*y); + MLUCnnl::Concat(ctx, num, axis, x_raw_descs.data(), x_ptrs.data(), + y_desc.get(), GetBasePtr(y)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(unstack, ops::UnStackMLUKernel, + ops::UnStackMLUKernel); + +REGISTER_OP_MLU_KERNEL(unstack_grad, ops::UnStackGradMLUKernel, + ops::UnStackGradMLUKernel); diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index 5cd9feee82895..1583e5d84b233 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -95,7 +95,7 @@ An operator integrating the open-source https://arxiv.org/pdf/1512.02595v1.pdf), to compute Connectionist Temporal Classification (CTC) loss. It can be aliased as softmax with ctc, since a native softmax activation is -interated to the warp-ctc library, to to normalize values for each row of the +interated to the warp-ctc library, to normalize values for each row of the input tensor. 
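As a quick illustration of what the MLU unstack kernel above computes, independent of the cnnl Split/Concat calls, this standalone sketch shows how the output shapes relate to the input shape; the helper name UnstackShapes is hypothetical and only mirrors the documented unstack semantics.

#include <cstdint>
#include <vector>

// Splitting a tensor of shape `dims` along `axis` yields dims[axis] outputs,
// each with that dimension removed (the kernel describes the slices with a
// unit-sized axis, which the framework-level output shape drops).
std::vector<std::vector<int64_t>> UnstackShapes(std::vector<int64_t> dims,
                                                int axis) {
  if (axis < 0) axis += static_cast<int>(dims.size());
  const int64_t num = dims[axis];
  std::vector<int64_t> out_dims = dims;
  out_dims.erase(out_dims.begin() + axis);
  return std::vector<std::vector<int64_t>>(num, out_dims);
}

// e.g. UnstackShapes({2, 3, 4}, 1) returns three output shapes, each {2, 4}.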
More detail of CTC loss can be found by referring to diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index f29546c5210d9..24d39c25cf335 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -116,7 +116,7 @@ endif() cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) -# seperate init from device_context to avoid cycle dependencies +# separate init from device_context to avoid cycle dependencies cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool) # memcpy depends on device_context, here add deps individually for @@ -125,7 +125,7 @@ cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc x place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator) if(WITH_XPU) - target_link_libraries(device_context xpu_context) + target_link_libraries(device_context xpu_context xpu_resource_pool) endif() cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index eb82389702ca4..5410638ceb39a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -50,11 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); -#ifdef PADDLE_WITH_TESTING PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, "Whether to print the message of gpu memory usage " "at exit, mainly used for UT and CI."); -#endif +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true, + "Whether to print the message of gpu memory usage " + "MB as a unit of measurement."); constexpr static float fraction_reserve_gpu_memory = 0.05f; @@ -145,25 +146,35 @@ class RecordedGpuMallocHelper { mtx_.reset(new std::mutex()); } -#ifdef PADDLE_WITH_TESTING if (FLAGS_enable_gpu_memory_usage_log) { // A fake UPDATE to trigger the construction of memory stat instances, // make sure that they are destructed after RecordedGpuMallocHelper. 
- MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0); } -#endif } DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); public: ~RecordedGpuMallocHelper() { -#ifdef PADDLE_WITH_TESTING if (FLAGS_enable_gpu_memory_usage_log) { - std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl; + if (FLAGS_enable_gpu_memory_usage_log_mb) { + std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / + 1048576.0 + << ", Allocated = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / + 1048576.0 + << std::endl; + } else { + std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) + << ", Allocated = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) + << std::endl; + } } -#endif } static RecordedGpuMallocHelper *Instance(int dev_id) { @@ -222,7 +233,7 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); @@ -261,7 +272,7 @@ class RecordedGpuMallocHelper { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 4301ef4bcf126..61ea0fd3cd293 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -50,6 +50,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclInt64; } else if (type == framework::proto::VarType::FP16) { return ncclFloat16; + } else if (type == framework::proto::VarType::INT8) { + return ncclInt8; } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 0871624a5d749..9e960a99123c0 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -74,11 +74,7 @@ void IpuBackend::WeightsToHost() { executor_->WeightsToHost(); } void IpuBackend::Detach() { executor_->Detach(); } -void IpuBackend::Reset() { - executor_->Detach(); - compiler_.reset(); - executor_.reset(); -} +void IpuBackend::Reset() { executor_->Reset(); } void IpuBackend::SetScope(const framework::Scope& scope) { scope_ = &scope; diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 96c2b4f9a9ded..d490334ee33f5 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -88,11 +88,7 @@ class PdIArray final : public popart::IArray { } // namespace -Executor::~Executor() { - Detach(); - session_.reset(); - executor_resources_.reset(); -} +Executor::~Executor() { Reset(); } void Executor::Prepare(const std::string &proto) { VLOG(10) << "enter Executor::Prepare"; @@ 
-197,7 +193,9 @@ void Executor::Run(const std::vector &inputs, } VLOG(10) << "Prepared inputs/anchors"; - if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched) { + if (ipu_strategy_->is_training && compiler_resources_->with_lr_sched && + !(ipu_strategy_->popart_options.createImplicitPipeliningFwdOnlyProgram && + ipu_strategy_->runtime_options.enable_eval)) { popart::Optimizer *optimizer; if (ipu_strategy_->runtime_options.enable_eval) { VLOG(10) << "Switch optimizer to eval mode"; @@ -215,7 +213,12 @@ void Executor::Run(const std::vector &inputs, popart::StepIO stepio(popart_inputs, popart_anchors); VLOG(10) << "Running..."; - session_->run(stepio); + if (ipu_strategy_->popart_options.createImplicitPipeliningFwdOnlyProgram && + ipu_strategy_->runtime_options.enable_eval) { + session_->run("implicitPipeliningFwdOnly", stepio); + } else { + session_->run(stepio); + } VLOG(10) << "Running...done"; } @@ -292,6 +295,12 @@ void Executor::Detach() { } } +void Executor::Reset() { + Detach(); + session_.reset(); + executor_resources_.reset(); +} + void Executor::SetWeightsIO() { auto opt_type = compiler_resources_->optimizer_type; VLOG(10) << "SetWeightsIO for " << opt_type; diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index 70c9477e69bab..1a46ebc69b197 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -63,6 +63,9 @@ class Executor { // Detach IPU void Detach(); + // Reset session + void Reset(); + // Scope void SetScope(const Scope *scope) { scope_ = scope; } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index aff5498243000..714f44c69b0d9 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -32,6 +32,20 @@ void RegisterGetter( options_type[name] = type_str; } +struct DefaultCompilationProgressLogger { + void operator()(int progress, int total) { + if (progress != progress_ && progress % log_interval_ == 0) { + progress_ = progress; + VLOG(1) << "Graph compile progress: " << progress << "%"; + } + } + + int log_interval_ = 10; + int progress_ = 0; + // default total progress + int total_ = 100; +}; + } // namespace namespace paddle { @@ -271,6 +285,8 @@ IpuStrategy::IpuStrategy() { ADD_POPART_BOOL_OPTION_ALIAS( schedule_non_weight_update_gradient_consumers_early, scheduleNonWeightUpdateGradientConsumersEarly); + ADD_POPART_BOOL_OPTION_ALIAS(create_implicit_pipelining_fwd_only_program, + createImplicitPipeliningFwdOnlyProgram); ADD_POPART_DOUBLE_OPTION_ALIAS(outline_sequence_break_cost, outlineSequenceBreakCost); @@ -327,21 +343,26 @@ IpuStrategy::IpuStrategy() { return std::to_string(popart_options.partialsTypeMatMuls == "half"); }); - RegisterSetter( - container_options, "dot_checks", - [&](const std::pair& p) { - std::uint64_t value = std::stoul(p.first); - popart_options.dotChecks.insert(static_cast(value)); - }); + RegisterSetter(container_options, "dot_checks", + [&](const std::pair& p) { + std::vector valid_dot{"Fwd0", "Fwd1", "Bwd0", + "PreAlias", "Final"}; + if (std::find(valid_dot.begin(), valid_dot.end(), p.first) == + valid_dot.end()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown dot check: %s", p.first)); + } + popart_options.dotChecks.insert(p.first); + }); - RegisterGetter( - vector_options_getter, options_type, "dot_checks", "vector", [&]() { - std::vector res; - for (auto x : 
popart_options.dotChecks) { - res.push_back(std::to_string(static_cast(x))); - } - return res; - }); + RegisterGetter(vector_options_getter, options_type, "dot_checks", "vector", + [&]() { + std::vector res; + for (auto x : popart_options.dotChecks) { + res.push_back(x); + } + return res; + }); RegisterSetter(container_options, "hardware_instrumentations", [&](const std::pair& p) { @@ -417,11 +438,7 @@ IpuStrategy::IpuStrategy() { // Default options // Can also be set as a custom logger in python, like using tqdm - popart_options.compilationProgressLogger = [](int progress, int total) { - if (progress % 10 == 0) { - VLOG(1) << "compile progress: " << progress << "%"; - } - }; + popart_options.compilationProgressLogger = DefaultCompilationProgressLogger(); } void IpuStrategy::AddBoolOption(const std::string& option, bool value) { @@ -506,6 +523,21 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } } +void IpuStrategy::SetReplicatedCollectivesSettings(const std::string& opt, + bool value) { + VLOG(10) << "Set Replica Setting " << opt << " to " << value; + if (opt == "prepare_schedule_for_merging_collectives") { + popart_options.replicatedCollectivesSettings + .prepareScheduleForMergingCollectives = value; + } else if (opt == "merge_all_reduce_collectives") { + popart_options.replicatedCollectivesSettings.mergeAllReduceCollectives = + value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown option ' %s' for replicated collectives settings", opt)); + } +} + void IpuStrategy::SetAccumulateOuterFragmentSettings( const std::uint64_t& schedule, const std::vector& values) { VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index fa57dcd676d81..da08c76fb90d1 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -118,6 +118,7 @@ class IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void SetReplicatedCollectivesSettings(const std::string &opt, bool value); void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule, const std::vector &values); void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index 2409c14b760fd..739a3ef41e422 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -70,7 +70,7 @@ class NpuOpRunner { NpuOpRunner &AddInput(const Tensor &tensor); // NOTE(zhiqiu): CANN-5.0.2 support input tensors on host. - // Specifically, the tensor of shape, tensor of dims, etc, which are are small + // Specifically, the tensor of shape, tensor of dims, etc, which are small // vector/list. 
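A short usage sketch for the SetReplicatedCollectivesSettings API introduced in this patch; it assumes a Paddle build with IPU (popart) support and the paddle::platform::ipu namespace, so treat it as illustrative rather than tested.

#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"

// Only the two option strings handled by the new setter are accepted; any
// other name is rejected with an InvalidArgument error.
void EnableMergedCollectives(paddle::platform::ipu::IpuStrategy* strategy) {
  strategy->SetReplicatedCollectivesSettings(
      "prepare_schedule_for_merging_collectives", true);
  strategy->SetReplicatedCollectivesSettings("merge_all_reduce_collectives",
                                             true);
}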
NpuOpRunner &AddInput(const Tensor &tensor, aclMemType mem_type); diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index b6a26f2554a13..3399fff087f8d 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -7,5 +7,6 @@ set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type) +cc_library(xpu_resource_pool SRCS xpu_resource_pool.cc DEPS xpu_info) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 77019a0192312..99f8e5ace9c00 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -307,11 +307,13 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"reshape2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), @@ -326,6 +328,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace())})}, {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits", diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index 2e960c1c0dd9c..cdd7ee7f806e9 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -79,6 +79,10 @@ void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place, *dev_ctx); } +void XPUStreamSync(xpuStream stream) { + PADDLE_ENFORCE_XDNN_SUCCESS(xpu_wait(stream), "xpu_wait"); +} + /**************************** Others **************************/ phi::backends::xpu::XPUVersion get_xpu_version(int dev_id) { diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 33385f8e45937..38b4defadc6c3 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -14,8 +14,13 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "xpu/runtime.h" namespace paddle { + +using xpuStream = XPUStream; +using xpuEventHandle = XPUEvent; + namespace platform { /***** Version Management *****/ @@ -51,6 +56,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place, const void *src, const platform::XPUPlace &src_place, size_t count); +//! Blocks until stream has completed all operations. +void XPUStreamSync(xpuStream stream); + using XPUDeviceGuard = phi::backends::xpu::XPUDeviceGuard; phi::backends::xpu::XPUVersion get_xpu_version(int dev_id); diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index ab68ebf3a5448..778c18146d64d 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -109,6 +109,8 @@ XPUOpMap& get_kp_ops() { {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, {"pull_box_sparse", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"push_box_sparse", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; diff --git a/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc b/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc new file mode 100644 index 0000000000000..af0d47c716717 --- /dev/null +++ b/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" + +namespace paddle { +namespace platform { + +XpuStreamResourcePool::XpuStreamResourcePool() { + int dev_cnt = platform::GetXPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::XPUDeviceGuard gurad(dev_idx); + xpuStream stream; + xpu_stream_create(&stream); + return stream; + }; + + auto deleter = [dev_idx](xpuStream stream) { + platform::XPUDeviceGuard gurad(dev_idx); + xpu_stream_destroy(stream); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +XpuStreamResourcePool& XpuStreamResourcePool::Instance() { + static XpuStreamResourcePool pool; + return pool; +} + +std::shared_ptr XpuStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +XpuEventResourcePool::XpuEventResourcePool() { + int dev_cnt = platform::GetXPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::XPUDeviceGuard gurad(dev_idx); + xpuEventHandle event; + xpu_event_create(&event); + return event; + }; + + auto deleter = [dev_idx](xpuEventHandle event) { + platform::XPUDeviceGuard gurad(dev_idx); + xpu_event_destroy(event); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +XpuEventResourcePool& XpuEventResourcePool::Instance() { + static XpuEventResourcePool pool; + return pool; +} + +std::shared_ptr XpuEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu_resource_pool.h b/paddle/fluid/platform/device/xpu/xpu_resource_pool.h new file mode 100644 index 0000000000000..5c6ade8f6f88f --- /dev/null +++ b/paddle/fluid/platform/device/xpu/xpu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
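To show how the new xpu_resource_pool pieces are meant to be consumed together with the XPUStreamSync helper added earlier, here is a hedged usage sketch (assumes a PADDLE_WITH_XPU build; the function name is made up and it is not part of the patch).

#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h"

void RunOnPooledXpuStream(int dev_idx) {
  // Borrow a stream for `dev_idx`; releasing the shared_ptr is meant to hand
  // the stream back to the per-device pool rather than destroy it outright.
  auto stream_holder =
      paddle::platform::XpuStreamResourcePool::Instance().New(dev_idx);
  paddle::xpuStream stream = stream_holder.get();

  // ... enqueue XPU work on `stream` here ...

  // Block until everything queued on the stream has finished.
  paddle::platform::XPUStreamSync(stream);
}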
+ +#pragma once + +#if defined(PADDLE_WITH_XPU) +#include +#include +#include + +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using XpuStreamObject = std::remove_pointer::type; +using XpuEventObject = std::remove_pointer::type; + +class XpuStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static XpuStreamResourcePool &Instance(); + + private: + XpuStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(XpuStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class XpuEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static XpuEventResourcePool &Instance(); + + private: + XpuEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(XpuEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 904e4854ba6b4..09a29c3429cba 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -169,7 +169,7 @@ inline void EmplaceDeviceContext( cuda_ctx->PartialInitWithAllocator(); dev_ctx->SetGenerator( - framework::GetDefaultCUDAGenerator(p.GetDeviceId()).get()); + framework::DefaultCUDAGenerator(p.GetDeviceId()).get()); #endif } else { dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() @@ -750,7 +750,7 @@ dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { VLOG(4) << tls().get_curr_exec() << " " << ptr; std::lock_guard lock(*p_mutex_); - if (!block_next_cache_clearing_) { + if (block_next_cache_clearing_ == 0) { VLOG(3) << "Clearing DNNL cache."; // If no specific executor pointer then clear // everything. For executor pointer then clear only @@ -768,9 +768,20 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { s.second->erase(ptr); } } + // Reset paddle layout to NCHW + VLOG(3) << "Resetting Paddle data layout to NCHW."; + platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( + paddle::framework::DataLayout::kNCHW); } else { - VLOG(3) << "Prevented Clearing DNNL cache."; - block_next_cache_clearing_ = false; + --block_next_cache_clearing_; + VLOG(3) << "Prevented Clearing DNNL cache. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; + PADDLE_ENFORCE_GE(block_next_cache_clearing_, 0, + platform::errors::InvalidArgument( + "Cache clearing mark should be non-negative " + ". But received %d.", + block_next_cache_clearing_)); } } @@ -796,8 +807,10 @@ void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, void MKLDNNDeviceContext::BlockNextCacheClearing() { std::lock_guard lock(*p_mutex_); - VLOG(3) << "Next DNNL cache clearing has been blocked."; - block_next_cache_clearing_ = true; + ++block_next_cache_clearing_; + VLOG(3) << "Next DNNL cache clearing has been blocked. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; } size_t MKLDNNDeviceContext::GetShapeBlobSize() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2c5f24d28c6d6..a63d41405f1b2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Copyright (c) 2022 NVIDIA Corporation. All rights reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); @@ -188,6 +188,7 @@ class XPUDeviceContext : public phi::XPUContext { explicit XPUDeviceContext(XPUPlace place); virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } + xpuStream stream() const { return XPUContext::x_context()->xpu_stream; } }; template <> @@ -849,7 +850,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext { // to erase std::shared_ptr p_exec_items_; std::shared_ptr p_mutex_; - bool block_next_cache_clearing_ = false; + // 0 - clearing is allowed. x > 0 do not clear. + unsigned int block_next_cache_clearing_ = 0; }; #endif diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc index 8261c866d073d..e8a6051c19f2d 100644 --- a/paddle/fluid/platform/device_memory_aligment.cc +++ b/paddle/fluid/platform/device_memory_aligment.cc @@ -31,9 +31,11 @@ size_t Alignment(size_t size, const platform::Place &place, int align_size) { alignment = alignment; #elif defined(PADDLE_WITH_ASCEND_CL) alignment = NPUMinChunkSize(); +#elif defined(PADDLE_WITH_MLU) + alignment = MLUMinChunkSize(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Fluid is not compiled with CUDA/XPU/NPU.")); + "Fluid is not compiled with CUDA/XPU/NPU/MLU.")); #endif } } diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index a3f88592b7649..ee37b93807eaa 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -21,6 +21,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/npu/npu_info.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 054a804e6b38e..2fcc573456d42 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -88,6 +88,21 @@ PADDLE_DEFINE_EXPORTED_bool( "input and output must be half precision) and recurrent neural networks " "(RNNs)."); +/** + * CUDA related related FLAG + * Name: FLAGS_gemm_use_half_precision_compute_type + * Since Version: 2.4 + * Value Range: bool, default=true + * Example: + * Note: whether to use fp16 compute type when the input and output is fp16, + * faster but it may loss precision. + */ +PADDLE_DEFINE_EXPORTED_bool( + gemm_use_half_precision_compute_type, true, + "Whether to use fp16 compute type when the input and output is fp16, " + "faster but it may loss precision in most case. If true, the compute " + "type will be set to fp32. Default is true."); + /** * CUDA related FLAG * Name: FLAGS_selected_gpus @@ -833,3 +848,16 @@ PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); * Example: */ PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); + +/** + * Preformance related FLAG + * Name: einsum_opt + * Since Version: 2.3.0 + * Value Range: bool, default=false + * Example: + * Note: If True, EinsumOp will be optimimzed by innercache reuse, which + * uses more gpu memory. 
+ */ +PADDLE_DEFINE_EXPORTED_bool( + einsum_opt, false, + "EinsumOp backward will be speedup at the expense of more gpu memory."); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 17736a87409af..5e77046962931 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -78,13 +78,6 @@ tf_pd MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p, inline void MatchShapeToLayout(framework::Tensor* tensor_in, framework::DataLayout from, framework::DataLayout to) { - // In these data layouts, channel dimension is either on 2nd position: nChw or - // at last nhwC, so for dim==2 these layouts are the same and nothing should - // be done. Similarly for dim==1 when you have just one possible combination. - if (tensor_in->dims().size() < 3) { - return; - } - auto print_dims = [](const std::vector& dims) { std::ostringstream oss; @@ -101,6 +94,15 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in, return oss.str(); }; + // In these data layouts, channel dimension is either on 2nd position: nChw or + // at last nhwC, so for dim==2 these layouts are the same and nothing should + // be done. Similarly for dim==1 when you have just one possible combination. + if (tensor_in->dims().size() < 3) { + VLOG(3) << "Keeping kMKLDNN/kNHWC/kNDHWC output_shape" + << print_dims(phi::vectorize(tensor_in->dims())); + return; + } + switch (from) { case framework::DataLayout::kMKLDNN: if ((to == framework::DataLayout::kNHWC) || @@ -146,8 +148,6 @@ inline void ClearMKLDNNCache(const platform::Place& place, platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(place); dev_ctx->ResetBlobMap(ptr); - platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( - paddle::framework::DataLayout::kNCHW); } } @@ -571,6 +571,12 @@ inline void RegisterModelLayout( std::vector>& ops, const platform::Place& place) { if (platform::is_cpu_place(place)) { + // If there is already registered NHWC then quit this call + // not to overwrite setting with analysis of internal "while" op block + if (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC) + return; + VLOG(4) << "RegisterModelLayout for mkldnn"; auto check_attrib = [](std::unique_ptr& op, const std::string& attrib_name) -> bool { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 12fa933701ef4..13b5005a30fa0 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1057,6 +1057,14 @@ class ReorderMKLDNNHandler { return std::make_shared(*(src_memory_p), *(dst_memory_p)); } + std::shared_ptr AcquireReorder( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p, + const dnnl::primitive_attr& attrs) { + return std::make_shared(*(src_memory_p), *(dst_memory_p), + attrs); + } + private: std::vector dims_; framework::proto::VarType::Type vtype_, vtype_dst_; diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index 6c8be1811d715..b909fb5f25aa7 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -51,7 +51,7 @@ void NodeTrees::BuildTrees( const std::vector& host_event_nodes, std::vector& runtime_event_nodes, const std::vector& device_event_nodes) { - // seperate Host Event Nodes into different threads + // separate Host Event Nodes into different threads std::map> thread2host_event_nodes; // used to store 
HostTraceEventNodes per thread std::map> diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 24c515f5b4956..f64e05504aa3f 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -168,8 +168,10 @@ void PrintMemProfiler( if (num_gpus > 0) { std::cout << "GPU Memory Usage (MB):\n"; for (int dev_id = 0; dev_id < num_gpus; ++dev_id) { - int64_t allocated = memory::StatGetCurrentValue("Allocated", dev_id); - int64_t reserved = memory::StatGetCurrentValue("Reserved", dev_id); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); size_t available = 0, total = 0, actual_available = 0, actual_total = 0; RecordedGpuMemGetInfo(&available, &total, &actual_available, &actual_total, dev_id); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 2491cd90a83ef..90a86aaf31f26 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -41,6 +41,7 @@ if (WITH_ASCEND_CL) endif() if (WITH_CNCL) + set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} cncl_context) endif() @@ -133,9 +134,9 @@ if (WITH_PSLIB) endif(WITH_PSLIB) if (WITH_PSCORE) if (WITH_ARM_BRPC) - set(DISTRIBUTE_COMPILE_FLAGS "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") + set(DISTRIBUTE_COMPILE_FLAGS "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") else() - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") endif() set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) list(APPEND PYBIND_DEPS fleet communicator index_wrapper index_sampler) diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index 1a6a395545a96..aef02d65b4dbd 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -58,13 +58,16 @@ void BindTCPStore(py::module *m) { py::class_>(*m, "TCPStore", Store) .def(py::init([](std::string hostname, uint16_t port, bool is_master, - size_t world_size, std::chrono::seconds timeout) { + size_t world_size, std::chrono::seconds timeout, + int stop_check_timeout) { return std::make_shared(hostname, port, is_master, - world_size, timeout); + world_size, timeout, + stop_check_timeout); }), py::arg("hostname"), py::arg("port"), py::arg("is_master"), py::arg("world_size"), py::arg("timeout") = 
distributed::tcputils::kNoTimeout, + py::arg("stop_check_timeout") = 900, py::call_guard()); } diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index b94a3b0edcabc..c1b26ee0b792d 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/pybind/eager_op_function_impl.h" #include "paddle/fluid/pybind/tensor_py.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/string_tensor.h" namespace paddle { @@ -84,7 +83,7 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, } else { // TODO(dev): we need enhance check for ddims. dense_tensor = std::make_shared( - phi::make_intrusive(place), + std::make_shared(), phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype), ddims)); } diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index c509ab5674930..99ec4212918de 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -65,7 +65,7 @@ static PyObject *eager_api_final_state_linear(PyObject *self, PyObject *args, if (bias.initialized()) { auto mm_out = matmul_final_state_dygraph_function(x, weight, false, false); - auto out = add_final_state_dygraph_function(bias, mm_out); + auto out = add_final_state_dygraph_function(mm_out, bias); PyEval_RestoreThread(tstate); tstate = nullptr; return ToPyObject(out); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index ac33eb2359c8c..628e808ef99ac 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -45,7 +45,6 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -207,7 +206,8 @@ static void ConstructFwdAndBwdMap( auto grad_attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); std::vector> res(5); - in_out_map.insert({op_type, res}); + + in_out_map.insert({op_type, {res}}); // Prepare pos map for grad_outputs VLOG(7) << "Prepare pos map for grad_outputs"; PADDLE_ENFORCE_LE( @@ -227,7 +227,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " inputs: " << inputs_names[j] << " related to No." << i << " grad_outputs: " << grad_outputs_names[i]; - in_out_map[op_type][0][j] = i; + in_out_map[op_type][0][0][j] = i; } } } @@ -240,7 +240,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " outputs: " << outputs_names[j] << " related to No." << i << " grad_inputs's grad: " << grad_inputs_names[i]; - in_out_map[op_type][1][j] = i; + in_out_map[op_type][0][1][j] = i; } } } else { @@ -252,7 +252,7 @@ static void ConstructFwdAndBwdMap( << " outputs: " << outputs_names[j] << " related to No." << i << " grad_inputs fwd outputs: " << grad_inputs_names[i]; - in_out_map[op_type][2][j] = i; + in_out_map[op_type][0][2][j] = i; } } } else { @@ -262,7 +262,7 @@ static void ConstructFwdAndBwdMap( << " inputs: " << inputs_names[j] << " related to No." 
<< i << " grad_inputs fwd inputs: " << grad_inputs_names[i]; - in_out_map[op_type][3][j] = i; + in_out_map[op_type][0][3][j] = i; } } } @@ -284,7 +284,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " attrs: " << attrs_names[j] << " related to No." << i << " grad_attrs: " << grad_attrs_names[i]; - in_out_map[op_type][4][j] = i; + in_out_map[op_type][0][4][j] = i; } } } @@ -402,8 +402,8 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, ctx.InputsBetween(ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second); - if (slot_map[0].find(i) != slot_map[0].end()) { - grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); + if (slot_map[0][0].find(i) != slot_map[0][0].end()) { + grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]); } else { grad_node->SetGradOutMeta(in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); @@ -423,7 +423,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with fwd outputs - for (auto it = slot_map[2].begin(); it != slot_map[2].end(); it++) { + for (auto it = slot_map[0][2].begin(); it != slot_map[0][2].end(); it++) { VLOG(7) << "Prepare fwd_outs: " << it->first << " to grad_inputs: " << it->second; grad_node->fwd_outs[it->second] = @@ -433,7 +433,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with fwd inputs - for (auto it = slot_map[3].begin(); it != slot_map[3].end(); it++) { + for (auto it = slot_map[0][3].begin(); it != slot_map[0][3].end(); it++) { VLOG(7) << "Prepare fwd_ins: " << it->first << " to grad_inputs: " << it->second; grad_node->fwd_ins[it->second] = @@ -446,7 +446,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, meta_info_map.at(op_type)[1]); std::vector attrs(attrs_names.size()); // Prepare attrs for Grad node - for (auto it = slot_map[4].begin(); it != slot_map[4].end(); it++) { + for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) { VLOG(7) << "Prepare fwd attrs: " << it->first << " to grad_attrs: " << it->second; attrs[it->second] = res_attrs[it->first]; diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index d3393b7cb57ac..1a0838d7f47c6 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -18,6 +18,7 @@ typedef SSIZE_T ssize_t; #include #include +#include #include #include "pybind11/numpy.h" @@ -76,7 +77,9 @@ class PyTensorHook : public egr::TensorHook { PyObject* res = nullptr; try { - res = PyObject_CallFunctionObjArgs(py_func_, ToPyObject(var), nullptr); + PyObject* p_tmp_var = ToPyObject(var); + res = PyObject_CallFunctionObjArgs(py_func_, p_tmp_var, nullptr); + Py_DECREF(p_tmp_var); } catch (platform::EnforceNotMet& e) { throw std::move(e); } catch (std::exception& e) { @@ -93,7 +96,9 @@ class PyTensorHook : public egr::TensorHook { if (res == Py_None) { return var; } - return reinterpret_cast(res)->tensor; + auto res_tensor = reinterpret_cast(res)->tensor; + Py_DECREF(res); + return res_tensor; } private: @@ -361,12 +366,33 @@ static PyObject* tensor_method__is_dense_tensor_hold_allocation( EAGER_CATCH_AND_THROW_RETURN_NULL } +static void IncreaseTensorReferenceCountUntilCopyComplete( + const paddle::experimental::Tensor& tensor, const platform::Place& place) { + auto place_ = platform::is_gpu_place(place) ? 
place : tensor.place(); + + auto tracer = egr::Controller::Instance().GetCurrentTracer(); + auto gc = tracer->MutableGarbageCollectorIfNotExists(place_); + + // Note(dev): This is an empty callback, the only way is to "reference" + // inner memory Holder, so it will not be destructed until the kernels + // launched at current stream of given place is finished, such as + // CUDAPinned Mem -> CUDA by cudamemcpyAsync. + auto callback = [tensor, place_]() { + VLOG(3) << "Run callback of Tensor:" << tensor.name() << " at place " + << place_; + }; + gc->DirectClearCallback(callback); +} + static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = self->tensor.copy_to(place, blocking); + if (!blocking) { + IncreaseTensorReferenceCountUntilCopyComplete(self->tensor, place); + } egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); egr::EagerUtils::autograd_meta(&cp_tensor) ->SetPersistable( @@ -654,7 +680,9 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, PyObject* kwargs) { EAGER_TRY if (!self->tensor.defined()) { - RETURN_PY_NONE + // The original `get_tensor` method of Variable will create a empty tensor + phi::DenseTensor empty_tensor; + return ToPyObject(&empty_tensor); } if (self->tensor.is_dense_tensor()) { auto* tensor = @@ -1254,6 +1282,47 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } + +static PyObject* tensor_method_set_vocab(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + using Vocab = std::unordered_map; + auto vocab = CastPyArg2Vocab(PyTuple_GET_ITEM(args, 0), 0); + auto var_tensor = std::make_shared(); + *var_tensor->GetMutable() = vocab; + self->tensor.set_impl(var_tensor); + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_set_string_list(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + using Strings = std::vector; + auto strings = CastPyArg2Strings(PyTuple_GET_ITEM(args, 0), 0); + auto var_tensor = std::make_shared(); + *var_tensor->GetMutable() = strings; + self->tensor.set_impl(var_tensor); + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_map_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE_EQ( + egr::IsVariableCompatTensor(self->tensor), true, + paddle::platform::errors::Fatal( + "this method is only effective for VariableCompatTensor")); + using Vocab = std::unordered_map; + auto* var_tensor = + static_cast(self->tensor.impl().get()); + return ToPyObject(var_tensor->Get()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -1634,6 +1703,15 @@ PyMethodDef variable_methods[] = { {"_copy_gradient_from", (PyCFunction)(void (*)(void))tensor__copy_gradient_from, METH_VARARGS | METH_KEYWORDS, NULL}, + /** the methods to adapt old dygraph, will be removed in the future **/ + {"set_string_list", + (PyCFunction)(void (*)(void))tensor_method_set_string_list, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"set_vocab", (PyCFunction)(void (*)(void))tensor_method_set_vocab, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"get_map_tensor", + (PyCFunction)(void (*)(void))tensor_method_get_map_tensor, + METH_VARARGS | 
METH_KEYWORDS, NULL}, /***the method of sparse tensor****/ {"indices", (PyCFunction)(void (*)(void))tensor_method_get_non_zero_indices, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 7af221b9ac82e..590ecfbad4be5 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -37,6 +37,13 @@ extern PyTypeObject* p_tensor_type; PyObject* tensor_properties_get_name(TensorObject* self, void* closure) { EAGER_TRY + // NOTE(dev): [why not use egr::Controller::Instance::GernerateUniqueName()?] + // Beacause Controller must holder a tracer, but 'tensor.name' maybe called + // everywhere such as static mode in @to_static, which means tracer is None. + static egr::UniqueNameGenerator name_generator; + if (self->tensor.name().empty()) { + self->tensor.set_name(name_generator.Generate()); + } return ToPyObject(self->tensor.name()); EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -51,6 +58,10 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR); } else if (self->tensor.is_selected_rows()) { return ToPyObject(paddle::framework::proto::VarType::SELECTED_ROWS); + } else if (egr::IsVariableCompatTensor(self->tensor)) { + return ToPyObject(static_cast( + static_cast(self->tensor.impl().get()) + ->Type())); } else { RETURN_PY_NONE } @@ -145,11 +156,27 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { if (!self->tensor.defined()) { return ToPyObject(value); } - auto ddim = self->tensor.shape(); - size_t rank = static_cast(ddim.size()); - value.resize(rank); - for (size_t i = 0; i < rank; i++) { - value[i] = ddim[i]; + if (egr::IsVariableCompatTensor(self->tensor)) { + auto* var_tensor = static_cast( + self->tensor.impl().get()); + if (var_tensor->IsType()) { + value.emplace_back(static_cast( + var_tensor->Get().size())); + } else if (var_tensor->IsType()) { + value.emplace_back(static_cast( + var_tensor->Get().size())); + } else { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor only support get shape from Vocab or " + "Strings.")); + } + } else { + auto ddim = self->tensor.shape(); + size_t rank = static_cast(ddim.size()); + value.resize(rank); + for (size_t i = 0; i < rank; i++) { + value[i] = ddim[i]; + } } return ToPyObject(value); @@ -176,8 +203,22 @@ PyObject* tensor_properties_get_dtype(TensorObject* self, void* closure) { // be same to old dygraph return ToPyObject(framework::proto::VarType::FP32); } - return ToPyObject( - paddle::framework::TransToProtoVarType(self->tensor.type())); + if (egr::IsVariableCompatTensor(self->tensor)) { + auto* var_tensor = static_cast( + self->tensor.impl().get()); + if (var_tensor->IsType()) { + return ToPyObject(framework::proto::VarType::RAW); + } else if (var_tensor->IsType()) { + return ToPyObject(framework::proto::VarType::STRING); + } else { + PADDLE_THROW(paddle::platform::errors::Unavailable( + "VariableCompatTensor only support get shape from Vocab or " + "Strings.")); + } + } else { + return ToPyObject( + paddle::framework::TransToProtoVarType(self->tensor.type())); + } EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 90d7024f7a746..4707f757d8bfb 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -472,6 +472,28 @@ paddle::framework::proto::VarType::Type 
CastPyArg2ProtoType(PyObject* obj, return dtype; } +std::unordered_map CastPyArg2Vocab(PyObject* obj, + ssize_t arg_pos) { + if (PyDict_Check(obj)) { + return ::pybind11::handle(obj) + .cast>(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be dict, but got %s", arg_pos + 1, + reinterpret_cast(obj->ob_type)->tp_name)); + } +} + +std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos) { + if (PyList_Check(obj)) { + return ::pybind11::handle(obj).cast>(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be list, but got %s", arg_pos + 1, + reinterpret_cast(obj->ob_type)->tp_name)); + } +} + paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, ssize_t arg_pos) { if (PyObject_IsInstance( @@ -719,6 +741,28 @@ PyObject* ToPyObject( return dict; } +PyObject* ToPyObject(const std::unordered_map& value) { + PyObject* dict = PyDict_New(); + for (const auto map_iter : value) { + // Convert Key + PyObject* key_string = + PyUnicode_FromWideChar(map_iter.first.c_str(), map_iter.first.size()); + if (!key_string) { + PADDLE_THROW(platform::errors::Fatal( + "Unable to convert std::wstring to PyObject")); + } + + // Convert Val + PyObject* py_int = PyLong_FromLong(map_iter.second); + + if (PyDict_SetItem(dict, key_string, py_int) != 0) { + PADDLE_THROW( + platform::errors::Fatal("Unable to set key:value for py_dict")); + } + } + return dict; +} + // For Final State Dygraph, // We directly use paddle::optional(Tensor) as dispensable Tensor paddle::optional GetOptionalTensorFromArgs( @@ -1045,7 +1089,7 @@ paddle::experimental::Scalar CastNumpy2Scalar(PyObject* obj, } else if (type_name == "numpy.int64") { int64_t value = CastPyArg2Long(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); - } else if (type_name == "numpy.int32") { + } else if (type_name == "numpy.int32" || type_name == "numpy.intc") { int value = CastPyArg2Int(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); } else { diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 5273433208d11..c8e1cd4ad0b75 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -65,6 +65,9 @@ std::vector> CastPyArg2VectorOfVectorOfSize_t( PyObject* obj, size_t arg_pos); framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); +std::unordered_map CastPyArg2Vocab(PyObject* obj, + ssize_t arg_pos); +std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos); PyObject* ToPyObject(int value); PyObject* ToPyObject(uint32_t value); @@ -96,6 +99,7 @@ PyObject* ToPyObject(const paddle::framework::proto::VarType& type); PyObject* ToPyObject(const void* value); PyObject* ToPyObject( const std::unordered_map>& value); +PyObject* ToPyObject(const std::unordered_map& value); template struct TupleTensorResult { diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index bcf55e46edb76..4ffb513671c56 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -327,16 +327,14 @@ void BindNeighborSampleResult(py::module* m) { .def("initialize", &NeighborSampleResult::initialize) .def("get_len", &NeighborSampleResult::get_len) .def("get_val", &NeighborSampleResult::get_actual_val) + .def("get_sampled_graph", &NeighborSampleResult::get_sampled_graph) .def("display", &NeighborSampleResult::display); } void BindGraphGpuWrapper(py::module* m) { - py::class_(*m, "GraphGpuWrapper") - // nit<>()) 
- //.def("test", &GraphGpuWrapper::test) - //.def(py::init([]() { return framework::GraphGpuWrapper::GetInstance(); - //})) - .def(py::init<>()) + py::class_>( + *m, "GraphGpuWrapper") + .def(py::init([]() { return GraphGpuWrapper::GetInstance(); })) .def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3) .def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample) .def("set_device", &GraphGpuWrapper::set_device) @@ -347,6 +345,8 @@ void BindGraphGpuWrapper(py::module* m) { .def("load_edge_file", &GraphGpuWrapper::load_edge_file) .def("upload_batch", &GraphGpuWrapper::upload_batch) .def("get_all_id", &GraphGpuWrapper::get_all_id) + .def("init_sample_status", &GraphGpuWrapper::init_sample_status) + .def("free_sample_status", &GraphGpuWrapper::free_sample_status) .def("load_next_partition", &GraphGpuWrapper::load_next_partition) .def("make_partitions", &GraphGpuWrapper::make_partitions) .def("make_complementary_graph", @@ -355,6 +355,8 @@ void BindGraphGpuWrapper(py::module* m) { .def("init_search_level", &GraphGpuWrapper::init_search_level) .def("get_partition_num", &GraphGpuWrapper::get_partition_num) .def("get_partition", &GraphGpuWrapper::get_partition) + .def("load_node_weight", &GraphGpuWrapper::load_node_weight) + .def("export_partition_files", &GraphGpuWrapper::export_partition_files) .def("load_node_file", &GraphGpuWrapper::load_node_file); } #endif diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 53379373d2518..6bb85da8c466f 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -55,13 +55,9 @@ void BindGenerator(py::module* m_ptr) { }) .def("seed", &framework::Generator::Seed) .def("initial_seed", &framework::Generator::GetCurrentSeed) - .def("random", &framework::Generator::Random64) - // .def("get_cpu_engine", &framework::Generator::GetCPUEngine) - // .def("set_cpu_engine", &framework::Generator::SetCPUEngine) - .def_property("_is_init_py", &framework::Generator::GetIsInitPy, - &framework::Generator::SetIsInitPy); + .def("random", &framework::Generator::Random64); m.def("default_cpu_generator", &framework::DefaultCPUGenerator); - m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator); + m.def("default_cuda_generator", &framework::DefaultCUDAGenerator); m.def("set_random_seed_generator", &framework::SetRandomSeedGenerator); m.def("get_random_seed_generator", &framework::GetRandomSeedGenerator); } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 1da0831fc6323..d24c0355c2493 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -59,7 +59,6 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/uva_utils.h" #include "paddle/phi/core/compat/arg_map_context.h" -#include "paddle/phi/core/compat/type_defs.h" #include "paddle/phi/core/type_defs.h" namespace paddle { @@ -2225,8 +2224,9 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ + defined(PADDLE_WITH_CNCL) py::class_>(m, "ParallelContext"); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 1bbe6808b2846..944781484076b 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -601,6 +601,14 @@ void BindAnalysisConfig(py::module *m) { .def("set_xpu_device_id", &AnalysisConfig::SetXpuDeviceId, py::arg("device_id") = 0) .def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0) + .def("enable_ipu", &AnalysisConfig::EnableIpu, + py::arg("ipu_device_num") = 1, py::arg("ipu_micro_batch_size") = 1, + py::arg("ipu_enable_pipelining") = false, + py::arg("ipu_batches_per_step") = 1) + .def("set_ipu_config", &AnalysisConfig::SetIpuConfig, + py::arg("ipu_enable_fp16") = false, py::arg("ipu_replica_num") = 1, + py::arg("ipu_available_memory_proportion") = 1.0, + py::arg("ipu_enable_half_partial") = false) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("enable_onnxruntime", &AnalysisConfig::EnableONNXRuntime) .def("disable_onnxruntime", &AnalysisConfig::DisableONNXRuntime) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 602a0345b04fe..0e1271c1fe07f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -166,6 +166,10 @@ limitations under the License. */ #include "paddle/fluid/pybind/fleet_py.h" #endif +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/pybind/eager_utils.h" @@ -1930,16 +1934,18 @@ All parameter, weight, gradient are variables in Paddle. which contains the id pair of pruned block and corresponding origin block. )DOC"); - m.def("get_readable_comile_key", [](const OpDesc &op_desc) { - auto compilation_key = - BOOST_GET_CONST(std::string, op_desc.GetAttr("compilation_key")); - VLOG(4) << std::hash{}(compilation_key) << " " - << compilation_key.size(); - proto::ProgramDesc desc; - desc.ParseFromString(compilation_key); - auto s = desc.DebugString(); + m.def("get_serialize_comile_key", [](int64_t compilation_key) { +#ifdef PADDLE_WITH_CINN + auto compiler = framework::paddle2cinn::CinnCompiler::GetInstance(); + auto s = compiler->SerializeKey(compilation_key); VLOG(4) << s; return s; +#else + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot get compilation key in non-CINN version, " + "Please recompile or reinstall Paddle with CINN support.")); +#endif }); m.def("empty_var_name", []() { return std::string(framework::kEmptyVarName); }); @@ -2999,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle. 
} return stats_map; }); - m.def("memory_stat_get_current", memory::StatGetCurrentValue); - m.def("memory_stat_get_peak", memory::StatGetPeakValue); + m.def("device_memory_stat_current_value", + memory::DeviceMemoryStatCurrentValue); + m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def("run_cmd", [](const std::string &cmd, int time_out = -1, int sleep_inter = -1) -> const std::string { @@ -4394,6 +4401,12 @@ All parameter, weight, gradient are variables in Paddle. option_name, option.first.cast(), option.second.cast()); } + } else if (option_name == "replicated_collectives_settings") { + for (auto option : element.second.cast()) { + self.SetReplicatedCollectivesSettings( + option.first.cast(), + option.second.cast()); + } } else if (option_name == "accumulate_outer_fragment") { for (auto option : element.second.cast()) { std::vector values; diff --git a/paddle/infrt/common/object.h b/paddle/infrt/common/object.h index ab2d00cce985c..797595cc7c58b 100644 --- a/paddle/infrt/common/object.h +++ b/paddle/infrt/common/object.h @@ -25,7 +25,7 @@ template class Shared; /** * Object is the basic element in the INFRT, with `Shared` wrapper, the object - * can be shared accross the system. + * can be shared across the system. */ struct Object { //! Get the type representation of this object. diff --git a/paddle/infrt/tensor/phi/tensor_map.cc b/paddle/infrt/tensor/phi/tensor_map.cc index 7690322aed4a3..afac7175caf4f 100644 --- a/paddle/infrt/tensor/phi/tensor_map.cc +++ b/paddle/infrt/tensor/phi/tensor_map.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/infrt/tensor/phi/tensor_map.h" + +#include "glog/logging.h" #include "llvm/Support/ErrorHandling.h" namespace infrt { diff --git a/paddle/infrt/tensor/phi/tensor_map.h b/paddle/infrt/tensor/phi/tensor_map.h index 1b9fbdd9defc7..5a754f42fb63c 100644 --- a/paddle/infrt/tensor/phi/tensor_map.h +++ b/paddle/infrt/tensor/phi/tensor_map.h @@ -14,6 +14,8 @@ #pragma once +#include + #include "paddle/phi/core/dense_tensor.h" namespace infrt { diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 0595ea4d8bddf..58ad42ddd1ff8 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -23,7 +23,7 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar api_int_array) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index b195ed1aefadc..004ed8de520d9 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -13,6 +13,7 @@ set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) # forward api file set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) +set(new_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_api.yaml) set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/api.h) set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/api.cc) 
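One more note on the pybind.cc edits earlier in this section: get_serialize_comile_key is still registered unconditionally, but its body calls into CinnCompiler only when PADDLE_WITH_CINN is defined and otherwise raises a descriptive error at call time, so non-CINN builds keep the same Python surface. A hedged sketch of that shape, using a hypothetical DEMO_WITH_FEATURE flag and std::runtime_error in place of PADDLE_THROW:

// Sketch only: feature-gated binding that degrades to a readable error.
#include <pybind11/pybind11.h>
#include <cstdint>
#include <stdexcept>
#include <string>

namespace py = pybind11;

PYBIND11_MODULE(feature_demo, m) {
  m.def("serialize_key", [](int64_t key) -> std::string {
#ifdef DEMO_WITH_FEATURE
    // Real work would go here when the optional component is compiled in.
    return "serialized-" + std::to_string(key);
#else
    (void)key;
    throw std::runtime_error(
        "serialize_key is unavailable: rebuild with DEMO_WITH_FEATURE=ON.");
#endif
  });
}

Keeping the symbol and failing at call time leaves import paths stable across build variants.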
set(api_header_file_tmp ${api_header_file}.tmp) @@ -21,6 +22,7 @@ set(api_source_file_tmp ${api_source_file}.tmp) # backward api file set(bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py) set(bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml) +set(new_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/new_backward.yaml) set(bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/backward_api.h) set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc) set(bw_api_header_file_tmp ${bw_api_header_file}.tmp) @@ -59,7 +61,6 @@ set(strings_api_source_file_tmp ${strings_api_source_file}.tmp) # wrapped infermeta file set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) -set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) @@ -67,12 +68,107 @@ if (NOT PYTHON_EXECUTABLE) find_package(PythonInterp REQUIRED) endif() +# install extra dependencies +execute_process( + COMMAND ${PYTHON_EXECUTABLE} -m pip install -U pyyaml jinja2 +) + +# parse apis +set(parsed_api_dir ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/parsed_apis) +set(generated_op_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc) +set(generated_argument_mapping_path ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc) +message("parse api yamls: +- ${api_yaml_file} +- ${new_api_yaml_file} +- ${bw_api_yaml_file} +- ${new_bw_api_yaml_file}") +execute_process( + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_api_dir} + COMMAND ${PYTHON_EXECUTABLE} parse_api.py + --api_yaml_path ./api.yaml + --output_path ./parsed_apis/api.parsed.yaml + COMMAND ${PYTHON_EXECUTABLE} parse_api.py + --api_yaml_path ./new_api.yaml + --output_path ./parsed_apis/new_api.parsed.yaml + COMMAND ${PYTHON_EXECUTABLE} parse_api.py + --api_yaml_path ./backward.yaml + --output_path ./parsed_apis/backward_api.parsed.yaml + --backward + COMMAND ${PYTHON_EXECUTABLE} parse_api.py + --api_yaml_path ./new_backward.yaml + --output_path ./parsed_apis/new_backward_api.parsed.yaml + --backward + RESULTS_VARIABLE _results +) +foreach(_result in ${_results}) + if (${_result}) + message(FATAL_ERROR "api yaml parsing failed, exiting.") + endif() +endforeach() + +# validation of api yamls +message("validate api yaml: +- ${parsed_api_dir}/new_api.parsed.yaml +- ${parsed_api_dir}/new_backward_api.parsed.yaml") +execute_process( + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + COMMAND ${PYTHON_EXECUTABLE} cross_validate.py + --forward_yaml_paths ./parsed_apis/api.parsed.yaml ./parsed_apis/new_api.parsed.yaml + --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml ./parsed_apis/new_backward_api.parsed.yaml + RESULT_VARIABLE _result +) +if (${_result}) + message(FATAL_ERROR "api validation failed, exiting." 
) +endif() + +# code generation for op, op makers, and argument mapping functions +message("create or remove auto-geneated operators: ${generated_op_path}.tmp +create or remove auto-geneated argument mappings: ${generated_argument_mapping_path}.tmp") +execute_process( + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + COMMAND ${PYTHON_EXECUTABLE} generate_op.py + --api_yaml_path ./parsed_apis/new_api.parsed.yaml + --backward_api_yaml_path ./parsed_apis/new_backward_api.parsed.yaml + --output_op_path "${generated_op_path}.tmp" + --output_arg_map_path "${generated_argument_mapping_path}.tmp" + RESULT_VARIABLE _result +) +if (${_result}) + message(FATAL_ERROR "operator codegen failed, exiting." ) +endif() + + +if(EXISTS "${generated_op_path}.tmp" AND EXISTS "${generated_op_path}") + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${generated_op_path}.tmp" "${generated_op_path}") + message("copy if different ${generated_op_path}.tmp ${generated_op_path}") +elseif(EXISTS "${generated_op_path}.tmp") + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp" "${generated_op_path}") + message("copy ${generated_op_path}.tmp ${generated_op_path}") +else() + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_op_path}") + message("remove ${generated_op_path}") +endif() + + +if(EXISTS "${generated_argument_mapping_path}.tmp" AND EXISTS "${generated_argument_mapping_path}") + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${generated_argument_mapping_path}.tmp" "${generated_argument_mapping_path}") + message("copy if different ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}") +elseif(EXISTS "${generated_argument_mapping_path}.tmp") + execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp" "${generated_argument_mapping_path}") + message("copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}") +else() + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_argument_mapping_path}") + message("remove ${generated_argument_mapping_path}") +endif() + # generate forward api add_custom_command( OUTPUT ${api_header_file} ${api_source_file} COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} - --api_yaml_path ${api_yaml_file} + --api_yaml_path ${api_yaml_file} ${new_api_yaml_file} + --api_header_path ${api_header_file_tmp} --api_header_path ${api_header_file_tmp} --api_source_path ${api_source_file_tmp} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} ${api_header_file} @@ -85,7 +181,7 @@ add_custom_command( add_custom_command( OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp} ${bw_api_source_file_tmp} COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file} - --backward_yaml_path ${bw_api_yaml_file} + --backward_yaml_path ${bw_api_yaml_file} ${new_bw_api_yaml_file} --backward_header_path ${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp} ${bw_api_header_file} @@ -137,7 +233,7 @@ add_custom_command( add_custom_command( OUTPUT ${dygraph_api_header_file} ${dygraph_api_source_file} COMMAND ${PYTHON_EXECUTABLE} ${im_api_gen_file} - --api_yaml_path ${api_yaml_file} + --api_yaml_path ${api_yaml_file} ${new_api_yaml_file} --sparse_api_yaml_path ${sparse_api_yaml_file} --dygraph_api_header_path ${dygraph_api_header_file_tmp} --dygraph_api_source_path 
${dygraph_api_source_file_tmp} @@ -150,7 +246,7 @@ add_custom_command( add_custom_command( OUTPUT ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} COMMAND ${PYTHON_EXECUTABLE} ${wrapped_infermeta_gen_file} - --api_yaml_path ${api_yaml_file} + --api_yaml_path ${api_yaml_file} ${new_api_yaml_file} --wrapped_infermeta_header_path ${wrapped_infermeta_header_file} --wrapped_infermeta_source_path ${wrapped_infermeta_source_file} DEPENDS ${api_yaml_file} ${wrapped_infermeta_gen_file} ${api_gen_base} @@ -175,3 +271,4 @@ cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi k cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api) cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy) +cc_library(api_int_array SRCS int_array.cc DEPS tensor_copy) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 38a60ab978900..8a845c331cc60 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/tensor_copy.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" @@ -592,19 +591,20 @@ Tensor conv2d_impl(const Tensor& input, return api_output; } -std::vector> conv2d_grad_impl( - const Tensor& input, - const Tensor& filter, - const Tensor& out_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search) { +void conv2d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad) { Backend kernel_backend = Backend::UNDEFINED; DataLayout kernel_layout = DataLayout::UNDEFINED; DataType kernel_data_type = DataType::UNDEFINED; @@ -646,18 +646,15 @@ std::vector> conv2d_grad_impl( auto input_filter = PrepareData(filter, args1, {}); auto input_out_grad = PrepareData(out_grad, args2, {}); - std::vector> api_output(2); - api_output[0].emplace_back(); - auto kernel_out_0 = SetKernelOutput(kernel_backend, &api_output[0][0]); - api_output[1].emplace_back(); - auto kernel_out_1 = SetKernelOutput(kernel_backend, &api_output[1][0]); + auto kernel_out_0 = SetKernelOutput(kernel_backend, input_grad); + auto kernel_out_1 = SetKernelOutput(kernel_backend, filter_grad); phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input), MakeMetaTensor(*input_filter), - &meta_out_0, - &meta_out_1); + kernel_out_0 ? &meta_out_0 : nullptr, + kernel_out_1 ? 
&meta_out_1 : nullptr); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, @@ -693,8 +690,6 @@ std::vector> conv2d_grad_impl( kernel_out_0, kernel_out_1); } - - return api_output; } Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { @@ -1080,8 +1075,9 @@ std::tuple sgd_impl( // but if we use this impl, it will not support. We need to be able to reuse // the autograd API here, which is not yet implemented // TODO(chenweihang): we should support call generated api in custom api impl -std::vector add_n_grad_impl(const std::vector& x, - const Tensor& out_grad) { +void add_n_grad_impl(const std::vector& x, + const Tensor& out_grad, + std::vector x_grad) { auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); @@ -1099,9 +1095,7 @@ std::vector add_n_grad_impl(const std::vector& x, auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {}); - size_t out_number = x.size(); - std::vector x_grad; - auto dense_x_grad = SetKernelOutput(out_number, kernel_backend, &x_grad); + auto dense_x_grad = SetKernelOutput(&x_grad); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, @@ -1117,8 +1111,6 @@ std::vector add_n_grad_impl(const std::vector& x, (*kernel_fn)( *dev_ctx, *dense_out_grad, phi::Scalar(1.0), 0.0, true, dense_x_grad_t); } - - return x_grad; } std::tuple batch_norm_impl( @@ -1250,7 +1242,7 @@ std::tuple batch_norm_impl( return api_output; } -Tensor imag_grad_impl(const Tensor& out_grad) { +void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), phi::dtype::ToComplex(out_grad.dtype())}; @@ -1264,8 +1256,7 @@ Tensor imag_grad_impl(const Tensor& out_grad) { auto dense_out_grad = TensorToDenseTensor(out_grad); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), x_grad); phi::MetaTensor meta_out(kernel_out); phi::RealAndImagGradInferMeta(*dense_out_grad, &meta_out); @@ -1274,11 +1265,9 @@ Tensor imag_grad_impl(const Tensor& out_grad) { auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); - - return out; } -Tensor real_grad_impl(const Tensor& out_grad) { +void real_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), phi::dtype::ToComplex(out_grad.dtype())}; @@ -1292,8 +1281,7 @@ Tensor real_grad_impl(const Tensor& out_grad) { auto dense_out_grad = TensorToDenseTensor(out_grad); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), x_grad); phi::MetaTensor meta_out(kernel_out); phi::RealAndImagGradInferMeta(*dense_out_grad, &meta_out); @@ -1302,8 +1290,6 @@ Tensor real_grad_impl(const Tensor& out_grad) { auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); - - return out; } } // namespace experimental diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 46abcd90de32a..d88a134654caf 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -96,20 +96,6 @@ Tensor conv2d_impl(const Tensor& input, int workspace_size_MB, bool exhaustive_search); -std::vector> conv2d_grad_impl( - const Tensor& input, - const Tensor& filter, - const Tensor& out_grad, - const 
std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search); - Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); std::vector split_impl(const Tensor& x, @@ -138,12 +124,28 @@ std::tuple sgd_impl( ////////////////// Backward(grad) api impls ////////////////////// -std::vector add_n_grad_impl(const std::vector& x, - const Tensor& out_grad); - -Tensor imag_grad_impl(const Tensor& x); - -Tensor real_grad_impl(const Tensor& x); +void add_n_grad_impl(const std::vector& x, + const Tensor& out_grad, + std::vector x_grad); + +void conv2d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad); + +void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad); + +void real_grad_impl(const Tensor& out_grad, Tensor* x_grad); } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index fb205212ff371..2111829b8d60b 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -113,10 +113,13 @@ phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) { /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { - if (out->impl() == nullptr) { - out->set_impl(std::make_shared()); + if (out) { + if (out->impl() == nullptr) { + out->set_impl(std::make_shared()); + } + return static_cast(out->impl().get()); } - return static_cast(out->impl().get()); + return nullptr; } std::vector SetKernelOutput(size_t out_size, @@ -133,6 +136,18 @@ std::vector SetKernelOutput(size_t out_size, return results; } +std::vector SetKernelOutput(std::vector* out) { + std::vector results(out->size(), nullptr); + for (size_t i = 0; i < out->size(); ++i) { + if (out->at(i)) { + auto tensor_ptr = std::make_shared(); + results[i] = tensor_ptr.get(); + (*out)[i]->set_impl(tensor_ptr); + } + } + return results; +} + phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto select_rows = std::make_shared(); diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 47b80bb3fc290..097178ae0d928 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" @@ -74,6 +73,9 @@ std::vector SetKernelOutput(size_t out_size, Backend backend, std::vector* out); +// For backward api +std::vector SetKernelOutput(std::vector* out); + phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out); phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type); diff --git a/paddle/phi/api/lib/int_array.cc b/paddle/phi/api/lib/int_array.cc new file mode 100644 index 0000000000000..503fc8184abf6 --- /dev/null +++ b/paddle/phi/api/lib/int_array.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/common/int_array.h" + +#include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/common/place.h" + +namespace paddle { +namespace experimental { + +template <> +IntArrayBase::IntArrayBase(const Tensor& tensor) { // NOLINT + is_from_tensor_ = true; + if (tensor.place().GetType() == phi::AllocationType::CPU) { + AssignDataFromTensor(tensor); + } else { + Tensor tensor_tmp; + copy(tensor, phi::CPUPlace(), true, &tensor_tmp); + AssignDataFromTensor(tensor_tmp); + } +} + +template <> +IntArrayBase::IntArrayBase(const std::vector& tensor_list) { + is_from_tensor_ = true; + + for (size_t i = 0; i < tensor_list.size(); ++i) { + DataType data_type = tensor_list[i].dtype(); + switch (data_type) { + case DataType::INT32: + if (tensor_list[i].place().GetType() == AllocationType::CPU) { + array_.push_back(*tensor_list[i].template data()); + } else { + Tensor tensor_tmp; + copy(tensor_list[i], phi::CPUPlace(), true, &tensor_tmp); + array_.push_back(*tensor_tmp.template data()); + } + break; + case DataType::INT64: + if (tensor_list[i].place().GetType() == AllocationType::CPU) { + array_.push_back(*tensor_list[i].template data()); + } else { + Tensor tensor_tmp; + copy(tensor_list[i], phi::CPUPlace(), true, &tensor_tmp); + array_.push_back(*tensor_tmp.template data()); + } + break; + default: + PD_THROW( + "Data type error. 
Currently, The data type of IntArrayBase " + "only supports Tensor with int32 and int64, " + "but now received `", + data_type, + "`."); + } + } +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 9f2ad6c62c7cf..29254a0486d00 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -109,7 +109,12 @@ struct KernelKeyParser : ArgsIterator { } } - void operator()(const Tensor& x) { AssignKernelKeySet(*x.impl()); } + void operator()(const Tensor& x) { + const auto* tensor = x.impl().get(); + if (tensor) { + AssignKernelKeySet(*tensor); + } + } void operator()(const std::vector& x) { const phi::TensorBase& tensor = *x.at(0).impl(); diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index 048e4f2b428f2..8d64246bdb69f 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include +#include "glog/logging.h" #include "paddle/fluid/framework/custom_operator.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc index 01e2ee14f4301..71ba8eaae2d36 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" namespace paddle { @@ -65,14 +64,10 @@ Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim) { // 5. Prepare outputs // create empty SparseCooTensor - phi::DenseTensor non_zero_indices( - phi::make_intrusive( - phi::TransToPhiPlace(kernel_key.backend())), - std::move(indices_meta)); - phi::DenseTensor non_zero_elements( - phi::make_intrusive( - phi::TransToPhiPlace(kernel_key.backend())), - std::move(elements_meta)); + phi::DenseTensor non_zero_indices(std::make_shared(), + std::move(indices_meta)); + phi::DenseTensor non_zero_elements(std::make_shared(), + std::move(elements_meta)); auto coo = std::make_shared( non_zero_indices, non_zero_elements, x.dims()); @@ -127,18 +122,12 @@ Tensor to_sparse_csr_impl(const Tensor& x) { // 5. Prepare outputs // create empty SparseCooTensor - phi::DenseTensor non_zero_crows( - phi::make_intrusive( - phi::TransToPhiPlace(kernel_key.backend())), - std::move(crows_meta)); - phi::DenseTensor non_zero_cols( - phi::make_intrusive( - phi::TransToPhiPlace(kernel_key.backend())), - std::move(cols_meta)); - phi::DenseTensor non_zero_elements( - phi::make_intrusive( - phi::TransToPhiPlace(kernel_key.backend())), - std::move(elements_meta)); + phi::DenseTensor non_zero_crows(std::make_shared(), + std::move(crows_meta)); + phi::DenseTensor non_zero_cols(std::make_shared(), + std::move(cols_meta)); + phi::DenseTensor non_zero_elements(std::make_shared(), + std::move(elements_meta)); auto csr = std::make_shared( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); @@ -192,9 +181,7 @@ Tensor to_dense_impl(const Tensor& x) { // 5. 
Prepare outputs // create empty SparseCooTensor auto dense_out = std::make_shared( - phi::make_intrusive( - phi::TransToPhiPlace(kernel_key.backend())), - std::move(dense_meta)); + std::make_shared(), std::move(dense_meta)); kernel_context.EmplaceBackOutput(dense_out.get()); Tensor out; diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index a7b89d7a4dca9..a340c0fed10d8 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -394,8 +394,8 @@ uint32_t Tensor::current_inplace_version() { static_cast(impl_.get())->InplaceVersionCounter(); return inplace_version_counter.CurrentVersion(); } else { - PADDLE_THROW(phi::errors::Unimplemented( - "current_inplace_version is only supported on DenseTensor now.")); + LOG_FIRST_N(WARNING, 1) + << "current_inplace_version is only supported on DenseTensor now."; } return 0; } diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc index 57e3c28d8cb1f..85de3601fd96a 100644 --- a/paddle/phi/api/lib/tensor_copy.cc +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/tensor_copy.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index de97e7516f619..0e1cd0cb83fd4 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor scalar) +cc_library(phi_api_utils SRCS tensor_utils.cc DEPS +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor int_array scalar) diff --git a/paddle/phi/api/lib/utils/allocator.h b/paddle/phi/api/lib/utils/allocator.h index 84a089e5899ec..96f1294102ae1 100644 --- a/paddle/phi/api/lib/utils/allocator.h +++ b/paddle/phi/api/lib/utils/allocator.h @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/utils/storage.cc b/paddle/phi/api/lib/utils/storage.cc deleted file mode 100644 index 09ff18d10e312..0000000000000 --- a/paddle/phi/api/lib/utils/storage.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/api/lib/utils/storage.h" - -namespace paddle { -namespace experimental { - -ExternalStorage::ExternalStorage(void* ptr, - size_t size, - const phi::Place& place) - : phi::Storage(std::make_shared(ptr, size, place)), - size_(size) {} - -ExternalStorage::ExternalStorage(const phi::intrusive_ptr& root, - size_t delta, - size_t size) - : Storage(std::make_shared( - static_cast(root->data()) + delta, size, root->place())), - size_(size) { - PADDLE_ENFORCE_LE( - static_cast(delta + size), - root->size(), - phi::errors::InvalidArgument("The size of the external storage does " - "not meet the metadata requirements.")); -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/storage.h b/paddle/phi/api/lib/utils/storage.h deleted file mode 100644 index c2eedd0fa63f7..0000000000000 --- a/paddle/phi/api/lib/utils/storage.h +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/phi/core/storage.h" - -namespace paddle { -namespace experimental { - -class ExternalStorage : public phi::Storage { - public: - ExternalStorage(void* ptr, size_t size, const phi::Place& place); - ExternalStorage(const phi::intrusive_ptr& root, - size_t delta, - size_t size); - - static const char* name() { return "ExternalStorage"; } - - void Realloc(size_t n) override { - PADDLE_THROW(phi::errors::Unavailable( - "The external shared storage cannot be reallocated.")); - } - - void Clear() override { - data_ = nullptr; - size_ = 0; - } - - void set_data_shared( - const std::shared_ptr& holder) override { - CHECK(holder); - data_ = holder; - size_ = holder->size(); - } - - std::shared_ptr&& move_data_shared() override { - size_ = 0; - return std::move(data_); - } - - size_t size() const noexcept override { return size_; } - const phi::Place& place() const override { - PADDLE_ENFORCE_NOT_NULL( - data_, - phi::errors::Unavailable( - "Unable to visit place as data_ has not been initialized yet.")); - return data_->place(); - } - bool OwnsMemory() const noexcept override { return false; } - - private: - int64_t size_{0}; -}; - -class SharedStorage : public phi::Storage { - public: - explicit SharedStorage( - const std::shared_ptr& allocation) - : Storage(allocation) { - CHECK(allocation); - place_ = allocation->place(); - size_ = allocation->size(); - } - - // In order to be compatible with the original Tensor design and execution - // system, we need to allow the uninitialized SharedStorage to exist, - // and it can be removed after the compatibility phase is over in the future - explicit SharedStorage(const phi::Place& place) { place_ = place; } - - void Realloc(size_t n) override { - this->Clear(); - data_ = paddle::memory::AllocShared(place(), n); - size_ = n; - } - - static const char* name() { return "SharedStorage"; } - - void Clear() override { - data_ = nullptr; - size_ = 0; - } - - void set_data_shared( - const 
std::shared_ptr& holder) override { - data_ = holder; - if (holder) { - size_ = holder->size(); - place_ = holder->place(); - } - } - - std::shared_ptr&& move_data_shared() override { - size_ = 0; - place_ = phi::Place(); - return std::move(data_); - } - - size_t size() const noexcept override { - return data_ ? data_->size() : size_; - } - const phi::Place& place() const override { - return data_ ? data_->place() : place_; - } - bool OwnsMemory() const noexcept override { return false; } - - const std::shared_ptr& GetAllocation() { - return data_; - } - - // Temporary method: For compatible with fluid Tensor and improve performance - void ResetAllocation(std::shared_ptr allocation) { - data_ = allocation; - size_ = allocation->size(); - place_ = allocation->place(); - } - - // Temporary method: For compatible with fluid Tensor and improve performance - void ResetAllocationPlace(const phi::Place& place) { place_ = place; } - - // Temporary method: For compatible with fluid Tensor and improve performance - void Reset() { this->Clear(); } - - private: - phi::Place place_; - int64_t size_{0}; -}; - -class TensorStorage : public paddle::memory::allocation::Allocation { - public: - explicit TensorStorage(phi::intrusive_ptr storage) - : paddle::memory::allocation::Allocation( - storage->data(), storage->size(), storage->place()), - storage_(std::move(storage)) {} - - private: - phi::intrusive_ptr storage_; -}; - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 5a6f1b1a7ee0c..c9fb2d3734edc 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -67,16 +67,9 @@ phi::IntArray MakePhiIntArray(const paddle::framework::Tensor& src) { } phi::IntArray MakePhiIntArrayFromVar(const framework::Variable& variable) { - auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); - if (!platform::is_same_place(tensor.place(), expected_place)) { - framework::LoDTensor tmp_tensor; - framework::TensorCopySync(tensor, expected_place, &tmp_tensor); - return MakePhiIntArray(tmp_tensor); - } else { - return MakePhiIntArray(tensor); - } + return MakePhiIntArray(tensor); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupport casting input `%s` type to IntArray when call pt " diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 00199da1280e8..36a0901bbe980 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/variable.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index 14fe90192e5bc..b72c6efd51f2c 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -14,6 +14,7 @@ #include "paddle/phi/backends/device_base.h" #include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/phi/core/enforce.h" DECLARE_double(fraction_of_gpu_memory_to_use); diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 39eef27b4a607..18d51687ef121 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -15,6 +15,8 @@ #pragma once #ifdef PADDLE_WITH_CUSTOM_DEVICE +#include + #include "paddle/phi/backends/device_base.h" #include "paddle/phi/backends/device_ext.h" #include "paddle/phi/backends/event.h" diff --git a/paddle/phi/backends/gpu/CMakeLists.txt b/paddle/phi/backends/gpu/CMakeLists.txt index d14e94024f90f..ebe8f1ca4c101 100644 --- a/paddle/phi/backends/gpu/CMakeLists.txt +++ b/paddle/phi/backends/gpu/CMakeLists.txt @@ -6,4 +6,5 @@ elseif(WITH_ROCM) hip_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda) endif() -cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3) +cc_library(gpu_resources SRCS gpu_resources.cc DEPS phi_device_context phi_gpu_info) +cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3 gpu_resources) diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index ff238b7997865..e8c264b884fe3 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" + #include #include #include @@ -21,10 +22,12 @@ limitations under the License. 
*/ #include #include -#include "paddle/phi/api/ext/exception.h" +#include "glog/logging.h" +#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" @@ -202,27 +205,65 @@ struct GPUContext::Impl { void Init() { owned_ = true; backends::gpu::GPUDeviceGuard guard(place_.device); - InitGpuProperties(); - InitStream(); + phi::InitGpuProperties(place_, + &compute_capability_, + &runtime_version_, + &driver_version_, + &multi_process_, + &max_threads_per_mp_, + &max_threads_per_block_, + &max_grid_dim_size_); + phi::InitStream(&stream_); InitEigenDevice(); - InitBlasHandle(); - InitBlasLtHandle(); - InitDNNHandle(); - InitSolverHandle(); - InitSparseHandle(); + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#endif +#if CUDA_VERSION >= 11000 + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif +#endif + phi::InitBlasLtHandle(&blaslt_handle_); + phi::InitDnnHandle(&dnn_handle_, stream_, place_); + phi::InitSolverHandle(&solver_handle_, stream_); + phi::InitSparseHandle(&sparse_handle_, stream_); InitDnnWorkspace(); } void PartialInitWithoutAllocator() { owned_ = true; backends::gpu::GPUDeviceGuard guard(place_.device); - InitGpuProperties(); - InitStream(); - InitBlasHandle(); - InitBlasLtHandle(); - InitDNNHandle(); - InitSolverHandle(); - InitSparseHandle(); + phi::InitGpuProperties(place_, + &compute_capability_, + &runtime_version_, + &driver_version_, + &multi_process_, + &max_threads_per_mp_, + &max_threads_per_block_, + &max_grid_dim_size_); + phi::InitStream(&stream_); + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#endif +#if CUDA_VERSION >= 11000 + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif +#endif + phi::InitBlasLtHandle(&blaslt_handle_); + phi::InitDnnHandle(&dnn_handle_, stream_, place_); + phi::InitSolverHandle(&solver_handle_, stream_); + phi::InitSparseHandle(&sparse_handle_, stream_); } void PartialInitWithAllocator() { @@ -238,19 +279,23 @@ struct GPUContext::Impl { ~Impl() { backends::gpu::GPUDeviceGuard guard(place_.device); - DestoryInternalWorkspace(); - DestoryInternalEigenDevice(); - DestroyInternalSparseHandle(); - DestroyInternalSolverHandle(); - DestroyInternalDnnHandle(); + if (owned_) { + DestoryInternalWorkspace(); + DestoryInternalEigenDevice(); + phi::DestroySparseHandle(sparse_handle_); + phi::DestroySolverHandle(solver_handle_); + phi::DestroyDnnHandle(dnn_handle_); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (nccl_comm_) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); - } + if (nccl_comm_) { + 
PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + } #endif - DestroyInternalBlasHandle(); - DestroyInternalBlasLtHandle(); - DestoryInternalStream(); + phi::DestroyBlasHandle(blas_handle_); + phi::DestroyBlasHandle(blas_tensor_core_handle_); + phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); + phi::DestroyBlasLtHandle(blaslt_handle_); + phi::DestoryStream(stream_); + } } const Place& GetPlace() const { return place_; } @@ -259,73 +304,6 @@ struct GPUContext::Impl { return blas_tensor_core_handle_ != nullptr; } - void InitGpuProperties() { - backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId()); - compute_capability_ = - backends::gpu::GetGPUComputeCapability(place_.GetDeviceId()); - multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId()); - max_threads_per_mp_ = - backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId()); - max_grid_dim_size_ = - backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId()); - max_threads_per_block_ = - backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId()); - driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId()); - runtime_version_ = - backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId()); - - // TODO(wilber): glog may be replaced in the future? - LOG_FIRST_N(WARNING, 1) - << "Please NOTE: device: " << static_cast(place_.device) - << ", GPU Compute Capability: " << compute_capability_ / 10 << "." - << compute_capability_ % 10 - << ", Driver API Version: " << driver_version_ / 1000 << "." - << (driver_version_ % 100) / 10 - << ", Runtime API Version: " << runtime_version_ / 1000 << "." - << (runtime_version_ % 100) / 10; -#ifdef PADDLE_WITH_HIP - size_t miopen_major, miopen_minor, miopen_patch; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); - auto cudnn_dso_ver = - (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; - auto compile_miopen_version = MIOPEN_VERSION / 10; - if (cudnn_dso_ver < static_cast(compile_miopen_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place_.device) - << ". The installed Paddle is compiled with MIOPEN " - << compile_miopen_version / 100 << "." << compile_miopen_version % 100 - << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100 - << "." << cudnn_dso_ver % 100 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible MIOPEN " - "version."; - } -#else - size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place_.device) - << ", cuDNN Version: " << cudnn_dso_ver / 1000 - << "." << (cudnn_dso_ver % 1000) / 100 << "."; - - // Check CUDA/CUDNN version compatiblity - auto local_cuda_version = - (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; - auto compile_cuda_version = - (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; - if (local_cuda_version < compile_cuda_version) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place_.device) - << ". The installed Paddle is compiled with CUDA " - << compile_cuda_version / 10 << "." << compile_cuda_version % 10 - << ", but CUDA runtime version in your machine is " - << local_cuda_version / 10 << "." << local_cuda_version % 10 - << ", which may cause serious incompatible bug. 
" - << "Please recompile or reinstall Paddle with compatible CUDA " - "version."; - } -#endif - } - void InitDnnWorkspace() { PD_CHECK(allocator_ != nullptr, "the device allocator for gpu context is nullptr."); @@ -350,27 +328,6 @@ struct GPUContext::Impl { return DnnWorkspaceHandle(allocator_, stream_); } - void InitStream() { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0)); -#endif - } - - void DestoryInternalStream() { - if (owned_ && stream_ != nullptr) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); -#endif - } - stream_ = nullptr; - } - void SetStream(gpuStream_t stream) { stream_ = stream; } gpuStream_t GetStream() const { @@ -400,55 +357,6 @@ struct GPUContext::Impl { return eigen_device_; } - void InitBlasHandle() { -#ifdef PADDLE_WITH_HIP - phi::dynload::rocblas_create_handle(&blas_handle_); - phi::dynload::rocblas_set_stream(blas_handle_, stream_); -#else // PADDLE_WITH_CUDA - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(&blas_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(blas_handle_, stream_)); -#if CUDA_VERSION >= 9000 - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasCreate(&blas_tensor_core_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream_)); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); -#if CUDA_VERSION >= 11000 - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasCreate(&blas_tf32_tensor_core_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_)); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); -#endif // CUDA_VERSION >= 11000 -#endif // CUDA_VERSION >= 9000 -#endif // PADDLE_WITH_HIP - } - - void DestroyInternalBlasHandle() { -#ifdef PADDLE_WITH_HIP - if (owned_ && blas_handle_ != nullptr) { - phi::dynload::rocblas_destroy_handle(blas_handle_); - blas_handle_ = nullptr; - } -#else - if (owned_ && blas_handle_ != nullptr) { - phi::dynload::cublasDestroy(blas_handle_); - blas_handle_ = nullptr; - } - if (owned_ && blas_tensor_core_handle_ != nullptr) { - phi::dynload::cublasDestroy(blas_tensor_core_handle_); - blas_tensor_core_handle_ = nullptr; - } - if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) { - phi::dynload::cublasDestroy(blas_tf32_tensor_core_handle_); - blas_tf32_tensor_core_handle_ = nullptr; - } -#endif // PADDLE_WITH_HIP - } - blasHandle_t GetBlasHandle() const { PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr."); return blas_handle_; @@ -456,16 +364,12 @@ struct GPUContext::Impl { void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; } - void InitBlasLtHandle() { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - phi::dynload::cublasLtCreate(&blaslt_handle_); -#endif + void SetBlasTensorCoreHandle(blasHandle_t handle) { + blas_tensor_core_handle_ = handle; } - void DestroyInternalBlasLtHandle() { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - phi::dynload::cublasLtDestroy(blaslt_handle_); -#endif + void SetBlasTF32Handle(blasHandle_t handle) { + blas_tf32_tensor_core_handle_ = handle; } void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } 
@@ -475,53 +379,6 @@ struct GPUContext::Impl { return blaslt_handle_; } - void InitDNNHandle() { - if (phi::dynload::HasCUDNN()) { -#ifdef PADDLE_WITH_HIP - size_t miopen_major, miopen_minor, miopen_patch; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion( - &miopen_major, &miopen_minor, &miopen_patch)); - auto local_miopen_version = - (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; - auto compile_miopen_version = MIOPEN_VERSION / 10; - if (local_miopen_version < static_cast(compile_miopen_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << place_.device - << ". The installed Paddle is compiled with MIOPEN " - << compile_miopen_version / 100 << "." - << compile_miopen_version % 100 - << ", but MIOPEN version in your machine is " - << local_miopen_version / 100 << "." << local_miopen_version % 100 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible MIOPEN " - "version."; - } - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_)); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::miopenSetStream(dnn_handle_, stream_)); -#else - auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; - auto compile_cudnn_version = CUDNN_VERSION / 100; - if (local_cudnn_version < static_cast(compile_cudnn_version)) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << place_.device - << ". The installed Paddle is compiled with CUDNN " - << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 - << ", but CUDNN version in your machine is " - << local_cudnn_version / 10 << "." << local_cudnn_version % 10 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible CUDNN " - "version."; - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnCreate(&dnn_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cudnnSetStream(dnn_handle_, stream_)); -#endif - } else { - dnn_handle_ = nullptr; - } - } - dnnHandle_t GetDnnHandle() { PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr."); return dnn_handle_; @@ -543,24 +400,6 @@ struct GPUContext::Impl { void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; } - void InitSolverHandle() { -#ifndef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(&solver_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cusolverDnSetStream(solver_handle_, stream_)); -#endif - } - - void DestroyInternalSolverHandle() { -#ifndef PADDLE_WITH_HIP - if (owned_ && solver_handle_ != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnDestroy(solver_handle_)); - solver_handle_ = nullptr; - } -#endif - } - solverHandle_t GetSolverHandle() const { PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); return solver_handle_; @@ -568,29 +407,6 @@ struct GPUContext::Impl { void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } - void InitSparseHandle() { -// ROCM is not yet supported -#if defined(PADDLE_WITH_CUDA) -// The generic APIs is supported from CUDA10.1 -#if CUDA_VERSION >= 10010 - PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - dynload::cusparseSetStream(sparse_handle_, stream_)); -#endif -#endif - } - - void DestroyInternalSparseHandle() { -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10010 - if (owned_ && sparse_handle_ != nullptr) { - PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_)); - sparse_handle_ = nullptr; - } -#endif -#endif - } - sparseHandle_t 
GetSparseHandle() const { PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr."); return sparse_handle_; @@ -878,7 +694,10 @@ void GPUContext::Init() { impl_->Init(); } -void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); } +void GPUContext::SetStream(gpuStream_t stream) { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->SetStream(stream); +} void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) { impl_->SetEigenDevice(device); @@ -888,6 +707,14 @@ void GPUContext::SetBlasHandle(blasHandle_t blas) { impl_->SetBlasHandle(blas); } +void GPUContext::SetBlasTensorCoreHandle(blasHandle_t handle) { + impl_->SetBlasTensorCoreHandle(handle); +} + +void GPUContext::SetBlasTF32Handle(blasHandle_t handle) { + impl_->SetBlasTF32Handle(handle); +} + void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { impl_->SetBlasLtHandle(blaslt); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 8d44acaa4a083..db9f287041dfb 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -199,6 +199,10 @@ class PADDLE_API GPUContext : public DeviceContext { void SetBlasHandle(blasHandle_t); + void SetBlasTensorCoreHandle(blasHandle_t); + + void SetBlasTF32Handle(blasHandle_t); + void SetBlasLtHandle(blasLtHandle_t); void SetDnnHandle(dnnHandle_t); diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc new file mode 100644 index 0000000000000..268024eb25949 --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_resources.h" + +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/backends/dynload/cublas.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/backends/dynload/cusparse.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include "paddle/phi/backends/dynload/nccl.h" +#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#endif // PADDLE_WITH_CUDA + +#include "unsupported/Eigen/CXX11/Tensor" + +// TODO(phi): remove fluid header. 
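Reviewer note: the new gpu_resources.cc, whose body continues below, rehouses the deleted Impl methods as free Init*(out_handle, stream) / Destroy*(handle) functions. The following is a small self-contained sketch of that out-parameter create/destroy style, with a hypothetical fake_create/fake_destroy pair standing in for the real cuBLAS/cuDNN entry points.

#include <cstdio>
#include <cstdlib>

// Hypothetical C-style API standing in for cublasCreate/cublasDestroy.
struct fake_handle_st { int state; };
using fakeHandle_t = fake_handle_st*;

int fake_create(fakeHandle_t* h) { *h = new fake_handle_st{1}; return 0; }
int fake_destroy(fakeHandle_t h) { delete h; return 0; }

// Free functions mirroring the InitBlasHandle/DestroyBlasHandle shape: Init*
// writes through an out-parameter, Destroy* is a no-op on nullptr so callers
// can call it unconditionally.
void InitFakeHandle(fakeHandle_t* handle) {
  if (fake_create(handle) != 0) {
    std::fprintf(stderr, "failed to create handle\n");
    std::abort();
  }
}

void DestroyFakeHandle(fakeHandle_t handle) {
  if (handle != nullptr) {
    fake_destroy(handle);
  }
}

int main() {
  fakeHandle_t h = nullptr;
  InitFakeHandle(&h);    // the caller owns h from here on
  DestroyFakeHandle(h);  // safe even if h were still nullptr
  return 0;
}

One detail worth noting in the patch itself: the Destroy* functions receive the handle by value, so the trailing "handle = nullptr;" only clears the local copy; callers that cache the handle are expected to reset their own pointer after destruction.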
+#include "paddle/fluid/platform/enforce.h" + +namespace phi { + +void InitGpuProperties(Place place, + int* compute_capability, + int* runtime_version, + int* driver_version, + int* multi_process, + int* max_threads_per_mp, + int* max_threads_per_block, + std::array* max_grid_dim_size) { + backends::gpu::GPUDeviceGuard guard(place.GetDeviceId()); + *compute_capability = + backends::gpu::GetGPUComputeCapability(place.GetDeviceId()); + *multi_process = backends::gpu::GetGPUMultiProcessors(place.GetDeviceId()); + *max_threads_per_mp = + backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place.GetDeviceId()); + *max_grid_dim_size = backends::gpu::GetGpuMaxGridDimSize(place.GetDeviceId()); + *max_threads_per_block = + backends::gpu::GetGPUMaxThreadsPerBlock(place.GetDeviceId()); + *driver_version = backends::gpu::GetGPUDriverVersion(place.GetDeviceId()); + *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); + + // TODO(wilber): glog may be replaced in the future? + LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " + << static_cast(place.device) + << ", GPU Compute Capability: " + << *compute_capability / 10 << "." + << *compute_capability % 10 + << ", Driver API Version: " << *driver_version / 1000 + << "." << (*driver_version % 100) / 10 + << ", Runtime API Version: " + << *runtime_version / 1000 << "." + << (*runtime_version % 100) / 10; +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); + auto cudnn_dso_ver = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (cudnn_dso_ver < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100 + << "." << cudnn_dso_ver % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } +#else + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) + << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." + << (cudnn_dso_ver % 1000) / 100 << "."; + + // Check CUDA/CUDNN version compatiblity + auto local_cuda_version = + (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; + auto compile_cuda_version = + (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } +#endif +} + +void InitStream(gpuStream_t* stream) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); +#endif +} + +void DestoryStream(gpuStream_t stream) { + if (stream != nullptr) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); +#endif + } + stream = nullptr; +} + +void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { +#ifdef PADDLE_WITH_HIP + phi::dynload::rocblas_create_handle(blas_handle); + phi::dynload::rocblas_set_stream(*blas_handle, stream); +#else // PADDLE_WITH_CUDA + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::cublasSetStream(*blas_handle, stream)); +#endif // PADDLE_WITH_HIP +} + +void DestroyBlasHandle(blasHandle_t handle) { +#ifdef PADDLE_WITH_HIP + if (handle != nullptr) { + phi::dynload::rocblas_destroy_handle(handle); + handle = nullptr; + } +#else + if (handle != nullptr) { + phi::dynload::cublasDestroy(handle); + handle = nullptr; + } +#endif // PADDLE_WITH_HIP +} + +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtCreate(blaslt_handle); +#endif +} + +void DestroyBlasLtHandle(blasLtHandle_t handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + if (handle != nullptr) { + phi::dynload::cublasLtDestroy(handle); + handle = nullptr; + } +#endif +} + +void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { + if (phi::dynload::HasCUDNN()) { +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); + auto local_miopen_version = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (local_miopen_version < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place.device + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " + << local_miopen_version / 100 << "." << local_miopen_version % 100 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); +#else + auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cudnn_version < static_cast(compile_cudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnSetStream(*handle, stream)); +#endif + } else { + *handle = nullptr; + } +} + +void DestroyDnnHandle(dnnHandle_t handle) { +#ifdef PADDLE_WITH_HIP + if (handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); + handle = nullptr; + } +#else + if (handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); + handle = nullptr; + } +#endif // PADDLE_WITH_HIP +} + +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); +#endif +} + +void DestroySolverHandle(solverHandle_t solver_handle) { +#ifndef PADDLE_WITH_HIP + if (solver_handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); + solver_handle = nullptr; + } +#endif +} + +void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { +// ROCM is not yet supported +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream)); +#endif +#endif +} + +void DestroySparseHandle(sparseHandle_t handle) { +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10010 + if (handle != nullptr) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle)); + handle = nullptr; + } +#endif +#endif +} + +} // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h new file mode 100644 index 0000000000000..07ccb6215409a --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/common/place.h" + +namespace phi { + +void InitGpuProperties(Place place, + int* compute_capability, + int* runtime_version, + int* driver_version, + int* multi_process, + int* max_threads_per_mp, + int* max_threads_per_block, + std::array* max_grid_dim_size); + +void InitStream(gpuStream_t* stream); +void DestoryStream(gpuStream_t stream); + +void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream); +void DestroyBlasHandle(blasHandle_t handle); + +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); +void DestroyBlasLtHandle(blasLtHandle_t handle); + +void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place); +void DestroyDnnHandle(dnnHandle_t handle); + +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); +void DestroySolverHandle(solverHandle_t solver_handle); + +void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream); +void DestroySparseHandle(sparseHandle_t handle); + +// void InitDnnWorkspace(); + +} // namespace phi diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index aa839eab587cb..b1ca4d1f8a8c6 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,3 @@ cc_library(phi_place SRCS place.cc) cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) +cc_library(int_array SRCS int_array.cc DEPS phi_enforce tensor) diff --git a/paddle/phi/common/int_array.cc b/paddle/phi/common/int_array.cc new file mode 100644 index 0000000000000..daed2b6625a9e --- /dev/null +++ b/paddle/phi/common/int_array.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/common/int_array.h" + +#include "paddle/phi/common/place.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace experimental { + +template <> +IntArrayBase::IntArrayBase( + const phi::DenseTensor& tensor) { // NOLINT + is_from_tensor_ = true; + if (tensor.place().GetType() == AllocationType::CPU) { + AssignDataFromTensor(tensor); + } else { + phi::DenseTensor tensor_tmp; + paddle::framework::TensorCopySync(tensor, CPUPlace(), &tensor_tmp); + AssignDataFromTensor(tensor_tmp); + } +} + +template <> +IntArrayBase::IntArrayBase( + const std::vector& tensor_list) { + is_from_tensor_ = true; + + for (size_t i = 0; i < tensor_list.size(); ++i) { + DataType data_type = tensor_list[i].dtype(); + switch (data_type) { + case DataType::INT32: + if (tensor_list[i].place().GetType() == AllocationType::CPU) { + array_.push_back(*tensor_list[i].template data()); + } else { + phi::DenseTensor tensor_tmp; + paddle::framework::TensorCopySync( + tensor_list[i], CPUPlace(), &tensor_tmp); + array_.push_back(*tensor_tmp.template data()); + } + break; + case DataType::INT64: + if (tensor_list[i].place().GetType() == AllocationType::CPU) { + array_.push_back(*tensor_list[i].template data()); + } else { + phi::DenseTensor tensor_tmp; + paddle::framework::TensorCopySync( + tensor_list[i], CPUPlace(), &tensor_tmp); + array_.push_back(*tensor_tmp.template data()); + } + break; + default: + PD_THROW( + "Data type error. Currently, The data type of IntArrayBase " + "only supports Tensor with int32 and int64, " + "but now received `", + data_type, + "`."); + } + } +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/common/int_array.h b/paddle/phi/common/int_array.h index f9d07249e0fc9..91b9ace136bc2 100644 --- a/paddle/phi/common/int_array.h +++ b/paddle/phi/common/int_array.h @@ -48,50 +48,10 @@ class IntArrayBase { void SetFromTensor(bool val) { is_from_tensor_ = val; } // The Tensor must have one dim - IntArrayBase(const T& tensor) { // NOLINT - is_from_tensor_ = true; - size_t n = tensor.numel(); - array_.reserve(n); - switch (tensor.dtype()) { - case DataType::INT32: - AssignData(tensor.template data(), n); - break; - case DataType::INT64: - AssignData(tensor.template data(), n); - break; - default: - PD_THROW( - "Data type error. Currently, The data type of IntArrayBase " - "only supports Tensor with int32 and int64, " - "but now received `", - tensor.dtype(), - "`."); - } - } + IntArrayBase(const T& tensor); // NOLINT // The Tensor in vec must have only one element - IntArrayBase(const std::vector& tensor_list) { // NOLINT - is_from_tensor_ = true; - - for (size_t i = 0; i < tensor_list.size(); ++i) { - DataType data_type = tensor_list[i].dtype(); - switch (data_type) { - case DataType::INT32: - array_.push_back(*tensor_list[i].template data()); - break; - case DataType::INT64: - array_.push_back(*tensor_list[i].template data()); - break; - default: - PD_THROW( - "Data type error. 
Currently, The data type of IntArrayBase " - "only supports Tensor with int32 and int64, " - "but now received `", - data_type, - "`."); - } - } - } + IntArrayBase(const std::vector& tensor_list); // NOLINT template IntArrayBase(const IntArrayBase& other) : array_(other.GetData()) {} @@ -114,6 +74,26 @@ class IntArrayBase { } } + void AssignDataFromTensor(const T& tensor) { + size_t n = tensor.numel(); + array_.reserve(n); + switch (tensor.dtype()) { + case DataType::INT32: + AssignData(tensor.template data(), n); + break; + case DataType::INT64: + AssignData(tensor.template data(), n); + break; + default: + PD_THROW( + "Data type error. Currently, The data type of IntArrayBase " + "only supports Tensor with int32 and int64, " + "but now received `", + tensor.dtype(), + "`."); + } + } + private: // TODO(zhangyunfei) Replace std::vector with a more efficient container // structure. diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index c28f6185a556a..5aee59f52ffae 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -27,6 +27,7 @@ template class ScalarBase { public: // Constructor support implicit + ScalarBase() : ScalarBase(0) {} ScalarBase(double val) : dtype_(DataType::FLOAT64) { // NOLINT data_.f64 = val; } diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index dcf1826012c13..41f654bfc8f30 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -6,7 +6,7 @@ set(phi_enforce_deps errors flags) if(WITH_GPU) set(phi_enforce_deps ${phi_enforce_deps} external_error_proto) endif() -cc_library(phi_enforce INTERFACE SRCS enforce.cc DEPS ${phi_enforce_deps}) +cc_library(phi_enforce SRCS enforce.cc DEPS ${phi_enforce_deps}) cc_library(kernel_factory SRCS kernel_factory.cc DEPS phi_enforce fluid_convert_utils) cc_library(kernel_context SRCS kernel_context.cc DEPS phi_enforce phi_context) diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index bd19d403c9406..8eb6524e79c0f 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -17,6 +17,8 @@ limitations under the License. */ #include #include +#include "glog/logging.h" + #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" diff --git a/paddle/phi/core/compat/type_defs.h b/paddle/phi/core/compat/type_defs.h deleted file mode 100644 index c9d7d5bb54b62..0000000000000 --- a/paddle/phi/core/compat/type_defs.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -namespace egr { -class EagerVariable; -} -namespace paddle { -namespace framework { -// The order should be as same as framework.proto -// NOTE(xiongkun): we extract from framework/typedef.h to ensure we can transfer -// enforce.h -class BlockDesc; -using Attribute = boost::variant, - std::vector, - std::vector, - bool, - std::vector, - BlockDesc*, - int64_t, - std::vector, - std::vector, - std::vector>; -using AttributeMap = std::unordered_map; -} // namespace framework - -namespace imperative { - -class VariableWrapper; -class SavedVariableWrapperList; -class VarBase; -class OpBase; -class GradOpNode; -class Tracer; - -using WeakNameVarBaseMap = - std::map>>; - -namespace details { -template -struct NameVarMapTrait {}; - -template <> -struct NameVarMapTrait { - using Type = std::map>>; -}; - -template <> -struct NameVarMapTrait { - using Type = std::map; -}; - -template <> -struct NameVarMapTrait { - using Type = - std::map>>; -}; - -} // namespace details - -template -using NameVarMap = typename details::NameVarMapTrait::Type; - -using NameVarBaseMap = NameVarMap; -using NameVariableWrapperMap = NameVarMap; -using NameTensorMap = NameVarMap; - -using VariableWrapperList = std::vector>; - -} // namespace imperative -} // namespace paddle diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index 48778bb38e548..356dd1482c9d8 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/core/custom_kernel.h" +#include "glog/logging.h" + namespace phi { void CustomKernelMap::RegisterCustomKernel(const std::string& name, diff --git a/paddle/phi/core/ddim.cc b/paddle/phi/core/ddim.cc index e6bf81590f158..1809c413bc146 100644 --- a/paddle/phi/core/ddim.cc +++ b/paddle/phi/core/ddim.cc @@ -171,11 +171,21 @@ DDim stride_numel(const DDim& ddim) { return strides; } -DDim DDim::reshape(const std::vector& shape) const { +DDim DDim::reshape(std::vector& shape) const { const int64_t copy_dim_val = 0; const DDim& in_dims = *this; DDim out_dims; out_dims.rank_ = shape.size(); + + // Dim marked as "-1" must be inferred + auto it = std::find(shape.begin(), shape.end(), -1); + if (it != shape.end()) { + int index = std::distance(shape.begin(), it); + int reshape_out_product = + std::accumulate(shape.begin(), shape.end(), -1, std::multiplies()); + shape[index] = product(in_dims) / reshape_out_product; + } + for (size_t i = 0; i < shape.size(); ++i) { if (shape[i] == copy_dim_val) { PADDLE_ENFORCE_LT(static_cast(i), diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h index ce462d8d95402..dd13081ddafff 100644 --- a/paddle/phi/core/ddim.h +++ b/paddle/phi/core/ddim.h @@ -155,7 +155,7 @@ class DDim { std::string to_str() const; - DDim reshape(const std::vector& shape) const; + DDim reshape(std::vector& shape) const; DDim transpose(const std::vector& axis) const; diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 9861bd68e4a9e..06d3e435bc110 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" #include "paddle/phi/core/stream.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index 93513067a268b..01c19e8a55fdf 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -26,18 +26,6 @@ public: */ explicit DenseTensor(paddle::experimental::DataType dtype); -/// \brief Use existing storage space to create dense tensor. This interface -/// can be used to deliberately create an uninitialized dense tensor. -/// \param storage The existing storage. -/// \param meta The meta data of dense tensor. -DenseTensor(intrusive_ptr storage, const DenseTensorMeta& meta); - -/// \brief Use existing storage space to create dense tensor. This interface -/// can be used to deliberately create an uninitialized dense tensor. -/// \param storage The existing storage. -/// \param meta The meta data of dense tensor. -DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta); - inline bool IsInitialized() const { return holder_ != nullptr; } template diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 3c030cac2e7c9..8c97b6bf223fb 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -18,9 +18,10 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/fluid/memory/malloc.h" + #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_utils.h" #endif @@ -211,13 +212,6 @@ LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::phi::dtype::complex) /* From framework::LoDTensor */ /* ------------------------------ */ -DenseTensor::DenseTensor(intrusive_ptr storage, - const DenseTensorMeta& meta) - : meta_(meta), holder_(storage->move_data_shared()) {} - -DenseTensor::DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta) - : meta_(std::move(meta)), holder_(storage->move_data_shared()) {} - DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; } void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; } diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index c5ac3b7815bf9..ae6b0135b3222 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -13,3 +13,162 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/enforce.h" + +#include +#include +#include +#include + +#include + +// is not suitable to be placed in the header file, +// it will introduce a large number of unnecessary includes, and these type +// declarations that depend on boost are also not suitable for the phi header +// file. 
Do some repeated forward declarations here to avoid +// spreading to a large number of phi kernel files +namespace egr { +class EagerVariable; +} +namespace paddle { +namespace framework { +class BlockDesc; +using Attribute = boost::variant, + std::vector, + std::vector, + bool, + std::vector, + BlockDesc*, + int64_t, + std::vector, + std::vector, + std::vector>; +using AttributeMap = std::unordered_map; +} // namespace framework +namespace imperative { +class VariableWrapper; +class SavedVariableWrapperList; +class VarBase; + +namespace details { +template +struct NameVarMapTrait {}; + +template <> +struct NameVarMapTrait { + using Type = std::map>>; +}; + +template <> +struct NameVarMapTrait { + using Type = std::map; +}; + +template <> +struct NameVarMapTrait { + using Type = + std::map>>; +}; + +} // namespace details + +template +using NameVarMap = typename details::NameVarMapTrait::Type; + +using NameVarBaseMap = NameVarMap; +using NameVariableWrapperMap = NameVarMap; +using NameTensorMap = NameVarMap; + +} // namespace imperative +} // namespace paddle + +namespace phi { +namespace enforce { + +template +static std::string ReplaceComplexTypeStr(std::string str, + const std::string& type_name) { + auto demangle_type_str = demangle(typeid(T).name()); + size_t start_pos = 0; + while ((start_pos = str.find(demangle_type_str, start_pos)) != + std::string::npos) { + str.replace(start_pos, demangle_type_str.length(), type_name); + start_pos += type_name.length(); + } + return str; +} + +#define __REPLACE_COMPLEX_TYPE_STR__(__TYPENAME, __STR) \ + do { \ + __STR = \ + phi::enforce::ReplaceComplexTypeStr<__TYPENAME>(__STR, #__TYPENAME); \ + } while (0) + +static std::string SimplifyDemangleStr(std::string str) { + // the older is important, you have to put complex types in front + __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::AttributeMap, str); + __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::Attribute, str); + __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVariableWrapperMap, str); + __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVarBaseMap, str); + __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameTensorMap, str); + __REPLACE_COMPLEX_TYPE_STR__(std::string, str); + return str; +} + +std::string GetCurrentTraceBackString(bool for_signal) { + std::ostringstream sout; + + if (!for_signal) { + sout << "\n\n--------------------------------------\n"; + sout << "C++ Traceback (most recent call last):"; + sout << "\n--------------------------------------\n"; + } +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) + static constexpr int TRACE_STACK_LIMIT = 100; + + void* call_stack[TRACE_STACK_LIMIT]; + auto size = backtrace(call_stack, TRACE_STACK_LIMIT); + auto symbols = backtrace_symbols(call_stack, size); + Dl_info info; + int idx = 0; + // `for_signal` used to remove the stack trace introduced by + // obtaining the error stack trace when the signal error occurred, + // that is not related to the signal error self, remove it to + // avoid misleading users and developers + int end_idx = for_signal ? 
2 : 0; + for (int i = size - 1; i >= end_idx; --i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + std::string path(info.dli_fname); + // C++ traceback info are from core.so + if (path.substr(path.length() - 3).compare(".so") == 0) { + sout << paddle::string::Sprintf( + "%-3d %s\n", idx++, SimplifyDemangleStr(demangled)); + } + } + } + free(symbols); +#else + sout << "Not support stack backtrace yet.\n"; +#endif + return sout.str(); +} + +std::string SimplifyErrorTypeFormat(const std::string& str) { + std::ostringstream sout; + size_t type_end_pos = str.find(":", 0); + if (type_end_pos == std::string::npos) { + sout << str; + } else { + // Remove "Error:", add "()"" + sout << "(" << str.substr(0, type_end_pos - 5) << ")" + << str.substr(type_end_pos + 1); + } + return sout.str(); +} + +} // namespace enforce +} // namespace phi diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 2c41c62353296..8da2623bb2c2d 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -28,38 +28,32 @@ limitations under the License. */ #include // GetModuleFileName, Sleep #endif -#include -#include -#include #include #include #include #include -#include #if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) #include #endif -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gflags/gflags.h" -#include "glog/logging.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "paddle/phi/core/errors.h" #include "paddle/utils/string/printf.h" #include "paddle/utils/string/to_string.h" -// Note: these headers for simplify demangle type string -#include "paddle/phi/core/compat/type_defs.h" +DECLARE_int32(call_stack_level); namespace phi { class ErrorSummary; } // namespace phi -DECLARE_int32(call_stack_level); namespace phi { namespace enforce { -/** HELPER MACROS AND FUNCTIONS **/ +/** HELPER MACROS AND FUNCTIONS **/ #ifndef PADDLE_MAY_THROW #define PADDLE_MAY_THROW noexcept(false) #endif @@ -180,76 +174,11 @@ struct BinaryCompareMessageConverter { }; } // namespace details -template -inline std::string ReplaceComplexTypeStr(std::string str, - const std::string& type_name) { - auto demangle_type_str = demangle(typeid(T).name()); - size_t start_pos = 0; - while ((start_pos = str.find(demangle_type_str, start_pos)) != - std::string::npos) { - str.replace(start_pos, demangle_type_str.length(), type_name); - start_pos += type_name.length(); - } - return str; -} - -#define __REPLACE_COMPLEX_TYPE_STR__(__TYPENAME, __STR) \ - do { \ - __STR = \ - phi::enforce::ReplaceComplexTypeStr<__TYPENAME>(__STR, #__TYPENAME); \ - } while (0) - -inline std::string SimplifyDemangleStr(std::string str) { - // the older is important, you have to put complex types in front - __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::AttributeMap, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::framework::Attribute, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVariableWrapperMap, str); - __REPLACE_COMPLEX_TYPE_STR__(paddle::imperative::NameVarBaseMap, str); - __REPLACE_COMPLEX_TYPE_STR__(std::string, str); - return str; -} - -inline std::string GetCurrentTraceBackString(bool for_signal = false) { - std::ostringstream sout; - - if (!for_signal) { - sout << "\n\n--------------------------------------\n"; - sout << "C++ Traceback (most recent call last):"; - sout << "\n--------------------------------------\n"; - } -#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) - static 
constexpr int TRACE_STACK_LIMIT = 100; - - void* call_stack[TRACE_STACK_LIMIT]; - auto size = backtrace(call_stack, TRACE_STACK_LIMIT); - auto symbols = backtrace_symbols(call_stack, size); - Dl_info info; - int idx = 0; - // `for_signal` used to remove the stack trace introduced by - // obtaining the error stack trace when the signal error occurred, - // that is not related to the signal error self, remove it to - // avoid misleading users and developers - int end_idx = for_signal ? 2 : 0; - for (int i = size - 1; i >= end_idx; --i) { - if (dladdr(call_stack[i], &info) && info.dli_sname) { - auto demangled = demangle(info.dli_sname); - std::string path(info.dli_fname); - // C++ traceback info are from core.so - if (path.substr(path.length() - 3).compare(".so") == 0) { - sout << paddle::string::Sprintf( - "%-3d %s\n", idx++, SimplifyDemangleStr(demangled)); - } - } - } - free(symbols); -#else - sout << "Not support stack backtrace yet.\n"; -#endif - return sout.str(); -} +std::string GetCurrentTraceBackString(bool for_signal = false); +std::string SimplifyErrorTypeFormat(const std::string& str); template -inline std::string GetErrorSumaryString(StrType&& what, +static std::string GetErrorSumaryString(StrType&& what, const char* file, int line) { std::ostringstream sout; @@ -264,7 +193,20 @@ inline std::string GetErrorSumaryString(StrType&& what, } template -inline std::string GetTraceBackString(StrType&& what, +std::string GetCompleteTraceBackString(StrType&& what, + const char* file, + int line) { + std::ostringstream sout; + sout << "\n----------------------\nError Message " + "Summary:\n----------------------\n"; + sout << paddle::string::Sprintf( + "%s (at %s:%d)", std::forward(what), file, line) + << std::endl; + return GetCurrentTraceBackString() + sout.str(); +} + +template +static std::string GetTraceBackString(StrType&& what, const char* file, int line) { if (FLAGS_call_stack_level > 1) { @@ -275,19 +217,6 @@ inline std::string GetTraceBackString(StrType&& what, } } -inline std::string SimplifyErrorTypeFormat(const std::string& str) { - std::ostringstream sout; - size_t type_end_pos = str.find(":", 0); - if (type_end_pos == std::string::npos) { - sout << str; - } else { - // Remove "Error:", add "()"" - sout << "(" << str.substr(0, type_end_pos - 5) << ")" - << str.substr(type_end_pos + 1); - } - return sout.str(); -} - inline bool is_error(bool stat) { return !stat; } // Note: This Macro can only be used within enforce.h @@ -348,6 +277,8 @@ struct EnforceNotMet : public std::exception { } } + ~EnforceNotMet() override = default; + private: // Used to determine the final type of exception thrown phi::ErrorCode code_ = phi::ErrorCode::LEGACY; diff --git a/paddle/phi/core/generator.h b/paddle/phi/core/generator.h index 29ea92cbe6d94..3263b2a525732 100644 --- a/paddle/phi/core/generator.h +++ b/paddle/phi/core/generator.h @@ -49,12 +49,6 @@ class Generator { virtual std::pair IncrementOffset( uint64_t increament_offset) = 0; - // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with - // old seed, and it should be removed after all random-related operators - // and unittests upgrades to use generator. 
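Reviewer note: SimplifyErrorTypeFormat, moved out of line in the enforce hunks above, rewrites a leading tag such as "InvalidArgumentError: ..." into "(InvalidArgument) ..." by locating the first colon and dropping the trailing "Error". A standalone sketch of that transformation follows; the sample message is made up.

#include <iostream>
#include <sstream>
#include <string>

// Mirrors the string surgery in the patch: strip the trailing "Error" from the
// leading type tag and wrap what remains in parentheses.
std::string SimplifyErrorTypeFormat(const std::string& str) {
  std::ostringstream sout;
  size_t type_end_pos = str.find(':', 0);
  if (type_end_pos == std::string::npos) {
    sout << str;
  } else {
    sout << "(" << str.substr(0, type_end_pos - 5) << ")"
         << str.substr(type_end_pos + 1);
  }
  return sout.str();
}

int main() {
  // Hypothetical message in the style produced by the enforce machinery.
  std::string msg = "InvalidArgumentError: the gpu blas handle is nullptr.";
  std::cout << SimplifyErrorTypeFormat(msg) << std::endl;
  // Prints: (InvalidArgument) the gpu blas handle is nullptr.
  return 0;
}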
- virtual void SetIsInitPy(bool) = 0; - virtual bool GetIsInitPy() const = 0; - virtual uint64_t get_device_id() = 0; }; diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 08329d0c8636a..d479147f06ba1 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -14,7 +14,8 @@ #include "paddle/phi/core/kernel_factory.h" -// See Note [ Why still include the fluid headers? ] +#include "glog/logging.h" + #include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 3cdbfda61d69c..29afe0d0292d1 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" +#include "glog/logging.h" + namespace phi { // TODO(chenweihang): add other flags if needed diff --git a/paddle/phi/core/storage.h b/paddle/phi/core/storage.h deleted file mode 100644 index 24dc2c4a4f90b..0000000000000 --- a/paddle/phi/core/storage.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "boost/intrusive_ptr.hpp" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/utils/intrusive_ptr.h" -#include "paddle/phi/core/utils/intrusive_ref_counter.h" -#include "paddle/phi/core/utils/type_info.h" - -namespace phi { - -/// \brief The interface of contiguous storage used for the dense tensor. -/// It should be used in conjunction with the intrusive pointer. We prohibit -/// all default copy operations to ensure the integrity of the package. -class Storage : public intrusive_ref_counter { - public: - Storage() = default; - Storage(const Storage&) = delete; - - /* @jim19930609: Following interfaces will be modified/replaced/removed - as soon as the new Allocation - Allocator design get - finalized. - */ - - /* --------- shared_ptr -------- */ - // Initialize a Storage with unique Allocation - explicit Storage(std::shared_ptr&& data) - : data_(std::move(data)) {} - - // Initialize a Storage shareing Allocation with another storage - explicit Storage(const std::shared_ptr& data) - : data_(data) {} - - void* data() const { - return data_ ? 
reinterpret_cast( - reinterpret_cast(data_->ptr())) - : nullptr; - } - - const std::shared_ptr& data_shared() const { return data_; } - - virtual void set_data_shared( - const std::shared_ptr& holder) = 0; - - virtual std::shared_ptr&& move_data_shared() = 0; - - virtual void ReallocShared(size_t n) { - PADDLE_THROW(phi::errors::Unimplemented( - "ReallocShared has not been overrided by the current Storage")); - } - /* --------- shared_ptr -------- */ - - virtual ~Storage() = default; - - virtual void Clear() = 0; - - virtual size_t size() const = 0; - virtual const Place& place() const = 0; - virtual bool OwnsMemory() const = 0; - virtual void Realloc(size_t n) = 0; - - protected: - std::shared_ptr data_; -}; - -class TensorStorage : public Storage { - public: - explicit TensorStorage(Allocator* a) : alloc_(a) {} - - TensorStorage(Allocator* a, size_t size) - : Storage(a->Allocate(size)), alloc_(a) { - size_ = data_->size(); - } - - void Clear() override { - data_ = nullptr; - size_ = 0; - } - - void Realloc(size_t size) override; - - ~TensorStorage() = default; - - static const char* name() { return "TensorStorage"; } - - size_t size() const noexcept override { return size_; } - - const Place& place() const override { - if (!data_) { - PADDLE_THROW(phi::errors::Unimplemented( - "Unable to visit place: either data_ or alloc_ has to be initialized " - "first.")); - } - return data_->place(); - } - - bool OwnsMemory() const noexcept override { return true; } - - void set_data_shared( - const std::shared_ptr& holder) override { - CHECK(holder); - data_ = holder; - size_ = holder->size(); - } - - std::shared_ptr&& move_data_shared() override { - size_ = 0; - return std::move(data_); - } - - private: - Allocator* alloc_; - int64_t size_{0}; -}; - -} // namespace phi diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 35444dc33fe78..0a4e0d6191510 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/string_tensor.h" -#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/fluid/memory/malloc.h" namespace phi { diff --git a/paddle/phi/core/string_tensor.h b/paddle/phi/core/string_tensor.h index 916c2a2bd4a4e..94c9974f4ad74 100644 --- a/paddle/phi/core/string_tensor.h +++ b/paddle/phi/core/string_tensor.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/phi/common/pstring.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h index 5b64dbd01643e..c233e1f743b21 100644 --- a/paddle/phi/core/utils/type_registry.h +++ b/paddle/phi/core/utils/type_registry.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 602942abf4d34..6b13a28c70837 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -312,6 +312,63 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dx->share_meta(dout); } +void InstanceNormGradInferMeta(const MetaTensor& x, + const MetaTensor& y_grad, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + float epsilon, + MetaTensor* x_grad, + MetaTensor* scale_grad, + MetaTensor* bias_grad) { + PADDLE_ENFORCE_NE( + x_grad, + nullptr, + phi::errors::InvalidArgument( + "The X@GRAD in InstanceNormGradInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + const int C = x_dims[1]; + x_grad->set_dims(x_dims); + x_grad->set_dtype(x.dtype()); + x_grad->set_layout(x.layout()); + if (scale_grad) { + scale_grad->set_dims({C}); + } + if (bias_grad) { + bias_grad->set_dims({C}); + } +} +void InstanceNormDoubleGradInferMeta( + const MetaTensor& x, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy) { + PADDLE_ENFORCE_NE( + dx, + nullptr, + phi::errors::InvalidArgument( + "The DX in InstanceNormDoubleGradInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + const int C = x_dims[1]; + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + dx->set_layout(x.layout()); + if (dscale) { + dscale->set_dims({C}); + } + if (ddy) { + ddy->share_dims(x); + } +} + void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index c35b58d0f56e4..855b25d7ed4f8 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -144,6 +144,30 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, int axis, MetaTensor* dx); +void InstanceNormGradInferMeta(const MetaTensor& x, + const MetaTensor& y_grad, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + float epsilon, + MetaTensor* x_grad, + MetaTensor* scale_grad, + MetaTensor* bias_grad); + +void InstanceNormDoubleGradInferMeta( + const MetaTensor& x, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy); + void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx); void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 2139605fb2048..837a43905e723 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -1534,7 +1534,7 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { phi::errors::InvalidArgument( "X's second dimension is expected to be equal to " "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", + "but received X'shape = [%s], Vec's shape = [%s]", dim_x, dim_vec)); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 
519d21b323fc2..48c40673ab819 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -458,7 +458,7 @@ void BatchNormInferMeta(const MetaTensor& x, true, phi::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. Input's shape is [%s].", + "positive number, but received %d. Input's shape is [%s].", x_dims[i], x_dims)); } @@ -755,7 +755,7 @@ inline int ConvOutputSize( 0, phi::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. The output's size is computed by " "((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / " "stride + 1), where input_size is %d, padding is %d, " "filter_size is %d, dilation is %d, stride is %d.", @@ -1998,7 +1998,9 @@ void StackInferMeta(const std::vector& x, void UnchangedMultiInferMeta(const std::vector& x, std::vector out) { for (size_t i = 0; i < x.size(); ++i) { - out[i]->share_meta(*x[i]); + if (out[i]) { + out[i]->share_meta(*x[i]); + } } } diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index 3a99103eda5c2..069359bae92b2 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -24,6 +24,20 @@ void AssignValueInferMeta(const std::vector& shape, } void CreateInferMeta(const IntArray& shape, DataType dtype, MetaTensor* out) { + if (!shape.FromTensor()) { + const auto& data = shape.GetData(); + for (size_t i = 0; i < data.size(); ++i) { + PADDLE_ENFORCE_GE( + data[i], + 0, + phi::errors::InvalidArgument( + "Each value of attribute 'shape' is expected to be no less " + "than 0. But recieved: shape[%u] = %d; shape = [%s].", + i, + data[i], + phi::make_ddim(data))); + } + } CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); } @@ -115,4 +129,27 @@ void TruncatedGaussianRandomInferMeta(const std::vector& shape, out->set_layout(DataLayout::NCHW); } +void TrilIndicesInferMeta( + int rows, int cols, int offset, DataType dtype, MetaTensor* out) { + // number of elements in the first row of the tril,bounded by [0, cols] + auto n_first_row = + offset > 0 ? 
std::min(cols, 1 + offset) : rows + offset > 0; + // number of elements in the last row of the tril, bounded by [0, cols] + auto n_last_row = + std::max(0, std::min(cols, rows + offset)); + // number of rows, bounded by [0, rows] + auto n_row_all = std::max(0, std::min(rows, rows + offset)); + auto n_row_trapezoid = (n_last_row - n_first_row + 1); + // calculate # of elements in the top trapezoid + auto tril_size = (n_first_row + n_last_row) * n_row_trapezoid >> 1; + // calculate # of elements in the bottom rectangle if there is any + auto diff_row = n_row_all - n_row_trapezoid; + if (diff_row > 0) { + tril_size += diff_row * cols; + } + std::vector tmp = {2, tril_size}; + auto out_dims = phi::make_ddim(tmp); + out->set_dims(out_dims); + out->set_dtype(dtype); +} } // namespace phi diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index 8d952d842c0c4..a9f1818e31957 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -72,4 +72,6 @@ void UniformRandomInferMeta(const IntArray& shape, int seed, MetaTensor* out); +void TrilIndicesInferMeta( + int rows, int cols, int offset, DataType dtype, MetaTensor* out); } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index ae8c7dd61c3bb..e3f946b247f09 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -191,6 +191,111 @@ void ArangeInferMeta(const MetaTensor& start, out->set_dtype(start.dtype()); } +void InstanceNormInferMeta(const MetaTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + MetaTensor* y, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaConfig config) { + PADDLE_ENFORCE_NE(y, + nullptr, + phi::errors::InvalidArgument( + "The y in InstanceNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + saved_mean, + nullptr, + phi::errors::InvalidArgument( + "The saved_mean in InstanceNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + saved_variance, + nullptr, + phi::errors::InvalidArgument( + "The saved_variance in InstanceNormInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + PADDLE_ENFORCE_NE(phi::product(x_dims), + 0, + phi::errors::PreconditionNotMet( + "The Input variable X has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X must " + "greater than or equal to 2. But received: the shape of input " + "X = [%s], the dimension of input X =[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X must " + "smaller than or equal to 5, But received: the shape of input " + "X = [%s], the dimension of input X = [%d]", + x_dims, + x_dims.size())); + auto N = x_dims[0]; + auto C = x_dims[1]; + auto NxC = N * C; + const auto scale_ptr = scale.get_ptr(); + if (scale_ptr) { + auto scale_dim = scale_ptr->dims(); + PADDLE_ENFORCE_EQ( + scale_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." 
+ "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, + scale_dim.size())); + bool check = !((!config.is_runtime) && (phi::product(scale_dim) <= 0)); + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, + scale_dim[0])); + } + } + const auto bias_ptr = bias.get_ptr(); + if (bias_ptr) { + auto bias_dim = bias_ptr->dims(); + PADDLE_ENFORCE_EQ( + bias_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." + "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, + bias_dim.size())); + bool check = !((!config.is_runtime) && (phi::product(bias_dim) <= 0)); + if (check) { + PADDLE_ENFORCE_EQ(bias_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, + bias_dim[0])); + } + } + y->set_dims(x_dims); + saved_mean->set_dims({NxC}); + saved_variance->set_dims({NxC}); + y->share_lod(x); + y->set_dtype(x.dtype()); + y->set_layout(x.layout()); +} + void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 4f561e0adf19d..b2fb30a4da2d6 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -52,6 +52,15 @@ void ArangeInferMeta(const MetaTensor& start, const MetaTensor& step, MetaTensor* out); +void InstanceNormInferMeta(const MetaTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + MetaTensor* y, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaConfig config = MetaConfig()); + void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index bc1f6cbc4ad57..8e0dac7e91e3f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -401,7 +401,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out) { + MetaTensor* out, + std::vector inner_cache) { // collect the following informations to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); @@ -2163,8 +2164,8 @@ void SplitInferMeta(const MetaTensor& x, void SqueezeInferMeta(const MetaTensor& x, const std::vector& axes, - MetaTensor* xshape, - MetaTensor* out) { + MetaTensor* out, + MetaTensor* xshape) { const auto& x_dims = x.dims(); // Check input tensor dims (<6) Eigen limit. PADDLE_ENFORCE_LE(x_dims.size(), @@ -2746,7 +2747,7 @@ void UnfoldInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "The dims of X should be larger than that of kernel_sizes " "by a number of 2, due to the batch size and input channel dim. " - "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", + "But received dims(X:%u) - dims(kernel_sizes:%u) != 2", in_dims.size(), kernel_sizes.size())); PADDLE_ENFORCE_EQ( @@ -2754,7 +2755,7 @@ void UnfoldInferMeta(const MetaTensor& x, kernel_sizes.size(), phi::errors::InvalidArgument( "The dims of strides should be the same with that of kernel_sizes. 
" - "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", + "But received dims(strides: %u) != dims(kernel_sizes: %u).", strides.size(), kernel_sizes.size())); PADDLE_ENFORCE_EQ( @@ -2762,7 +2763,7 @@ void UnfoldInferMeta(const MetaTensor& x, 2 * strides.size(), phi::errors::InvalidArgument( "The dims of paddings should be 2 times of that of strides. " - "But recieved dims(paddings: %u) != 2*dims(strides: %u).", + "But received dims(paddings: %u) != 2*dims(strides: %u).", paddings.size(), strides.size())); PADDLE_ENFORCE_EQ( @@ -2770,7 +2771,7 @@ void UnfoldInferMeta(const MetaTensor& x, dilations.size(), phi::errors::InvalidArgument( "The dims of strides should be the same with that of dilations. " - "But recieved dims(strides: %u) != dims(dilations: %u).", + "But received dims(strides: %u) != dims(dilations: %u).", strides.size(), dilations.size())); @@ -2779,14 +2780,14 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); PADDLE_ENFORCE_GT(kernel_sizes[1], 0, phi::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); // check strides @@ -2794,14 +2795,14 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); PADDLE_ENFORCE_GT(strides[1], 0, phi::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations @@ -2810,7 +2811,7 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); PADDLE_ENFORCE_GT( @@ -2818,7 +2819,7 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); @@ -2964,8 +2965,8 @@ void UniqueRawInferMeta(const MetaTensor& x, void UnsqueezeInferMeta(const MetaTensor& x, const IntArray& axes, - MetaTensor* xshape, MetaTensor* out, + MetaTensor* xshape, MetaConfig config) { const auto& x_dims = x.dims(); // Validity Check: input tensor dims (<6). 
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 6a64cc381bacb..4e90dae3fed85 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -82,7 +82,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out); + MetaTensor* out, + std::vector inner_cache); void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, @@ -306,8 +307,8 @@ void SplitInferMeta(const MetaTensor& x_meta, void SqueezeInferMeta(const MetaTensor& x, const std::vector& axes, - MetaTensor* xshape, - MetaTensor* out); + MetaTensor* out, + MetaTensor* xshape); void StridedSliceRawInferMeta(const MetaTensor& x, const std::vector& axes, @@ -425,8 +426,8 @@ void UniqueRawInferMeta(const MetaTensor& x, void UnsqueezeInferMeta(const MetaTensor& x, const IntArray& axes, - MetaTensor* xshape, MetaTensor* out, + MetaTensor* xshape, MetaConfig config = MetaConfig()); void UnStackInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index fd42756ba3867..5d7af6cca947a 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/utils/optional.h" namespace phi { @@ -136,7 +137,7 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& d_dout_new, - const DenseTensor& d_ddout, + paddle::optional d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx); @@ -149,6 +150,39 @@ void LogDoubleGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* ddout); +template +void SqrtDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dx, + const DenseTensor& ddx, + DenseTensor* dout, + DenseTensor* ddout); + +template +void RsqrtDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dx, + const DenseTensor& ddx, + DenseTensor* dout, + DenseTensor* ddout); + +template +void CeluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + float alpha, + DenseTensor* dx, + DenseTensor* ddout); + +template +void SquareDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + DenseTensor* dx, + DenseTensor* ddout); + template void HardSwishGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -187,6 +221,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt); DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Round); DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Floor); @@ -198,6 +233,7 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 
8a40bacd395c4..b719ceddc5563 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -78,6 +78,7 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b) diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index d2b816de8fd2b..894f959c13193 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -167,6 +167,7 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishGradFunctor, beta); DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, MishGradFunctor, threshold); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, CELUGradFunctor, alpha); DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, BReluGradFunctor, @@ -281,6 +282,10 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(sqrt_double_grad, + SqrtDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(rsqrt_double_grad, + RsqrtDoubleGradKernel) PD_REGISTER_KERNEL(tanh_triple_grad, CPU, @@ -317,6 +322,15 @@ PD_REGISTER_KERNEL(square_grad, double, int, int64_t) {} +PD_REGISTER_KERNEL(square_double_grad, + CPU, + ALL_LAYOUT, + phi::SquareDoubleGradKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) @@ -332,6 +346,9 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(round_grad, RoundGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(floor_grad, FloorGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(ceil_grad, CeilGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(celu_grad, CeluGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(celu_double_grad, + CeluDoubleGradKernel) PD_REGISTER_KERNEL(pow_grad, CPU, diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index fe0643286cb02..165627839a308 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -90,19 +90,19 @@ DEFINE_CPU_ACTIVATION_KERNEL(Floor, FloorFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Ceil, CeilFunctor) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) - DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, ThresholdedReluFunctor, threshold) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishFunctor, threshold) -DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) -DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(STanh, STanhFunctor, scale_a, scale_b) -DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, SoftplusFunctor, beta, threshold) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) 
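The Celu declarations and registrations above only plug CELU into the existing activation macro machinery; the actual math is defined by CELUFunctor / CELUGradFunctor later in activation_functor.h. As a scalar reference, following the formula written in the comments of those functors (a sketch only, assuming float inputs and the alpha attribute they take):

#include <algorithm>
#include <cmath>

// celu(x) = max(0, x) + min(0, alpha * (exp(x / alpha) - 1))
float CeluRef(float x, float alpha) {
  float neg = alpha * (std::exp(x / alpha) - 1.0f);
  return std::max(0.0f, x) + std::min(0.0f, neg);
}

// dx = dout                  if x > 0
// dx = dout * exp(x / alpha) if x <= 0  (same form whatever the sign of alpha)
float CeluGradRef(float dout, float x, float alpha) {
  return x > 0.0f ? dout : dout * std::exp(x / alpha);
}

The vectorized functors below implement this same piecewise form, with the CUDA variant additionally computing in a wider intermediate type for low-precision inputs.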
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishFunctor, beta) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Celu, CELUFunctor, alpha) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(STanh, STanhFunctor, scale_a, scale_b) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, SoftplusFunctor, beta, threshold) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, HardSigmoidFunctor, slope, @@ -181,5 +181,6 @@ PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel) PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel) PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel) PD_REGISTER_ACTIVATION_KERNEL(ceil, CeilKernel) +PD_REGISTER_ACTIVATION_KERNEL(celu, CeluKernel) PD_REGISTER_KERNEL( pow, CPU, ALL_LAYOUT, phi::PowKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/adamw_kernel.cc b/paddle/phi/kernels/cpu/adamw_kernel.cc index 3a7869a062cf1..f2c98fded4d4f 100644 --- a/paddle/phi/kernels/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/cpu/adamw_kernel.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" @@ -92,41 +93,101 @@ void AdamwDenseKernel(const Context& dev_ctx, return; } - auto* param_ = - master_param.is_initialized() ? master_param.get_ptr() : ¶m; + T beta1_ = beta1.to(); + T beta2_ = beta2.to(); + T epsilon_ = epsilon.to(); T coeff_ = static_cast(coeff); T lr_ratio_ = static_cast(lr_ratio); - funcs::AdamWFunctor functor( - coeff_, - lr_ratio_, - learning_rate.data(), - const_cast(param_->data())); - functor(param_->numel()); - - AdamDenseKernel(dev_ctx, - param, - grad, - learning_rate, - moment1, - moment2, - beta1_pow, - beta2_pow, - master_param, - skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - param_out, - moment1_out, - moment2_out, - beta1_pow_out, - beta2_pow_out, - master_param_outs); + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel(); + VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + T beta1_p = beta1_pow.data()[0]; + T beta2_p = beta2_pow.data()[0]; + + if (!use_global_beta_pow) { + dev_ctx.template Alloc(beta1_pow_out)[0] = beta1_ * beta1_p; + dev_ctx.template Alloc(beta2_pow_out)[0] = beta2_ * beta2_p; + } + + T* param_out_ptr = dev_ctx.template Alloc(param_out); + T* mom1_out_ptr = dev_ctx.template Alloc(moment1_out); + T* mom2_out_ptr = dev_ctx.template Alloc(moment2_out); + T old_lr = learning_rate.data()[0]; + T learning_rate_ = + learning_rate.data()[0] * (sqrt(1 - beta2_p) / (1 - beta1_p)); + T eps = epsilon_ * sqrt(1 - beta2_p); + + int64_t numel = param.numel(); + + const T* param_ptr = param.data(); + const T* mom1_ptr = moment1.data(); + const T* mom2_ptr = moment2.data(); + const T* grad_ptr = grad.data(); + + auto adamw = + paddle::operators::jit::KernelFuncs, + phi::CPUPlace>::Cache() + .At(1); + + static constexpr int64_t chunk_size = 512; + 
+#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < numel / chunk_size; ++i) { + const int64_t offset = i * chunk_size; + adamw(beta1_, + beta2_, + -learning_rate_, + eps, + old_lr, + lr_ratio_, + coeff_, + chunk_size, + grad_ptr + offset, + mom1_ptr + offset, + mom2_ptr + offset, + param_ptr + offset, + mom1_out_ptr + offset, + mom2_out_ptr + offset, + param_out_ptr + offset); + } + + if (numel % chunk_size != 0) { + const int64_t offset = (numel / chunk_size) * chunk_size; + const int64_t tail_numel = numel % chunk_size; + adamw(beta1_, + beta2_, + -learning_rate_, + eps, + old_lr, + lr_ratio_, + coeff_, + tail_numel, + grad_ptr + offset, + mom1_ptr + offset, + mom2_ptr + offset, + param_ptr + offset, + mom1_out_ptr + offset, + mom2_out_ptr + offset, + param_out_ptr + offset); + } } } // namespace phi diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc index 80dea561956cf..f95ddc5621e9a 100644 --- a/paddle/phi/kernels/cpu/allclose_kernel.cc +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/allclose_kernel.h" #include - +#include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h index d26d89086b27e..159a5cfbeb6b4 100644 --- a/paddle/phi/kernels/cpu/conv_util.h +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -38,7 +38,7 @@ inline void UpdatePaddingAndDilation(std::vector* paddings, phi::errors::InvalidArgument( "Attribute padding's size should be the same or twice as the " "input's dimension. " - "But recieved: padding's size is %d, padding is [%s]; input's " + "But received: padding's size is %d, padding is [%s]; input's " "dimension is %d, input's shape is [%s].", paddings->size(), make_ddim(*paddings), diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc index 3e25a65526d89..8968542b3e0b8 100644 --- a/paddle/phi/kernels/cpu/einsum_kernel.cc +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc @@ -17,4 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, CPU, ALL_LAYOUT, phi::EinsumKernel, float, double) {} +PD_REGISTER_KERNEL( + einsum, CPU, ALL_LAYOUT, phi::EinsumKernelRaw, float, double) {} diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc index 6070264547249..5019b9f570628 100644 --- a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc @@ -34,6 +34,14 @@ void AddKernel(const Context& dev_ctx, AddRawKernel(dev_ctx, x, y, axis, out); } +template +void GradAddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + AddKernel(dev_ctx, x, y, out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -65,3 +73,15 @@ PD_REGISTER_KERNEL(add, int64_t, complex64, complex128) {} + +PD_REGISTER_KERNEL(grad_add, + CPU, + ALL_LAYOUT, + phi::GradAddKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 3f5e0b8a4d8ee..ee384cc75193c 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -88,6 +88,16 @@ PD_REGISTER_KERNEL(minimum_grad, int, int64_t, 
phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(elementwise_heaviside_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideGradKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(elementwise_pow_grad, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 7478f69d915f1..286b0d0ffaad9 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -95,6 +95,18 @@ void ElementwisePowRawKernel(const Context& dev_ctx, dev_ctx, x, y, axis, funcs::ElementwisePowFunctor(), out); } +template +void ElementwiseHeavisideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::ElementwiseHeavisideFunctor(), out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -149,3 +161,11 @@ PD_REGISTER_KERNEL(elementwise_pow_raw, double, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside_raw, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc new file mode 100644 index 0000000000000..dcb4289ae8d75 --- /dev/null +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
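The elementwise_heaviside kernels and registrations above add the op's CPU entry points, but funcs::ElementwiseHeavisideFunctor itself is not shown in these hunks. Assuming it follows the usual NumPy-style heaviside(x, y) convention, the per-element behaviour being wired up is (sketch only, not the functor's actual code):

// Assumed element-wise semantics of elementwise_heaviside:
// 0 for x < 0, 1 for x > 0, and the second operand's value at x == 0.
template <typename T>
T HeavisideRef(T x, T y) {
  if (x < static_cast<T>(0)) return static_cast<T>(0);
  if (x > static_cast<T>(0)) return static_cast<T>(1);
  return y;
}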
+ +#include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +#include +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" + +namespace phi { + +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; +template +using EigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; + +template +void InstanceNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& d_y, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + float epsilon, + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias) { + const auto* scale_ptr = scale.get_ptr(); + + const auto& x_dims = x.dims(); + + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x.numel() / N / C; + + dev_ctx.template Alloc(d_x); + auto* place = dev_ctx.eigen_device(); + + Eigen::DSizes rshape(NxC, sample_size); + Eigen::DSizes param_shape(N, C); + Eigen::DSizes shape(NxC, sample_size); +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::DSizes rdims(0); + Eigen::DSizes mean_rdims(1); + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); +#else + Eigen::IndexList> rdims; + Eigen::IndexList> mean_rdims; + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); +#endif + + phi::funcs::SetConstant set_constant; + + DenseTensor scale_data; + if (!scale_ptr) { + scale_data.Resize({C}); + dev_ctx.template Alloc(&scale_data); + set_constant(dev_ctx, &scale_data, static_cast(1)); + } + + auto scale_e = + scale_ptr + ? 
EigenVector::Flatten(*scale_ptr) + : EigenVector::Flatten(const_cast(scale_data)); + auto mean_e = EigenVector::Flatten(saved_mean); + auto inv_var_e = EigenVector::Flatten(saved_variance); + auto dy_e = EigenVector::Flatten(d_y); + auto x_e = EigenVector::Flatten(x); + + auto scale_arr = scale_e.reshape(C_shape); + auto mean_arr = mean_e.reshape(NxC_shape); + auto inv_var_arr = inv_var_e.reshape(NxC_shape); + auto dy_arr = dy_e.reshape(shape); + auto x_arr = x_e.reshape(shape); + + auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) * + inv_var_arr.eval().broadcast(bcast); + + // math: d_bias = np.sum(d_y, axis=(n,h,w)) + // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + set_constant(dev_ctx, d_scale, static_cast(0)); + set_constant(dev_ctx, d_bias, static_cast(0)); + + auto d_scale_e = EigenVector::Flatten(*d_scale); + auto d_scale_data = d_scale_e.reshape(C_shape); + auto d_bias_e = EigenVector::Flatten(*d_bias); + auto d_bias_data = d_bias_e.reshape(C_shape); + d_bias_data.device(*place) = + dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); + d_scale_data.device(*place) = + (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); + } + + auto dy_mean = + dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast); + + Eigen::DSizes bcast_param(N, sample_size); + set_constant(dev_ctx, d_x, static_cast(0)); + // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, + // axis=(h,w)) + // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - + // mean), + // axis=(h,w)) + auto dx_e = EigenVector::Flatten(*d_x); + auto dx_arr = dx_e.reshape(shape); + dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * + inv_var_arr.broadcast(bcast) * + (dy_arr - dy_mean - + tmp * + (dy_arr * tmp) + .mean(mean_rdims) + .reshape(NxC_shape) + .eval() + .broadcast(bcast)); +} + +template +void InstanceNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + DenseTensor* dx, + DenseTensor* dscale, + DenseTensor* ddy) { + const auto* Scale = scale.get_ptr(); + const auto* ddScale = ddscale.get_ptr(); + const auto* ddX = ddx.get_ptr(); + const auto* ddBias = ddbias.get_ptr(); + phi::funcs::SetConstant set_constant; + const auto& x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + const int sample_size = x.numel() / N / C; + const int NxC = N * C; + + const T* mean_data = saved_mean.data(); + const T* inv_var_data = saved_variance.data(); + DenseTensor mean_tensor; + DenseTensor inv_var_tensor; + ConstEigenArrayMap x_arr(x.data(), sample_size, NxC); + ConstEigenVectorArrayMap mean_arr(mean_data, NxC); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); + + DenseTensor mean_tile; + mean_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&mean_tile); + EigenArrayMap mean_tile_data(mean_tile.data(), sample_size, NxC); + DenseTensor inv_var_tile; + inv_var_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&inv_var_tile); + EigenArrayMap inv_var_tile_data(inv_var_tile.data(), sample_size, NxC); + + mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); + inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); + + DenseTensor 
Scale_data; + if (!Scale) { + Scale_data.Resize({C}); + dev_ctx.template Alloc(&Scale_data); + set_constant(dev_ctx, &Scale_data, static_cast(1)); + } + ConstEigenVectorArrayMap scale_arr( + Scale ? Scale->data() : Scale_data.data(), C); + + DenseTensor scale_tile; + scale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&scale_tile); + EigenArrayMap scale_tile_data(scale_tile.data(), sample_size, NxC); + scale_tile_data = scale_arr.transpose().replicate(sample_size, N); + ConstEigenArrayMap dy_arr(dy.data(), sample_size, NxC); + ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); + // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, + // axis=(h,w)) * np.sum(dy, axis=(h,w)) - + // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), + // axis=(h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * + // np.sum(dy, axis=(h,w)) * (x - mean) * + // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var * np.mean(dy, axis=(h,w)) - inv_var.pow(3) * + // (x - mean) * np.mean(dy * (x - mean), axis=(h,w))) + + DenseTensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&x_sub_mean_mul_invstd); + EigenArrayMap x_sub_mean_mul_invstd_arr( + x_sub_mean_mul_invstd.data(), sample_size, NxC); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dx) { + dev_ctx.template Alloc(dx); + set_constant(dev_ctx, dx, static_cast(0)); + EigenArrayMap dx_arr(dx->data(), sample_size, NxC); + if (ddX) { + dx_arr += + x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size * + (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - + (dy_arr * ddx_arr).colwise().sum() + + 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (dy_arr.colwise().sum() / sample_size - dy_arr); + dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (ddx_arr.colwise().sum() / sample_size - ddx_arr); + dx_arr = scale_tile_data * dx_arr; + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + DenseTensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddscale_tile); + EigenArrayMap ddscale_tile_data( + ddscale_tile.data(), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + dx_arr += (dy_arr * inv_var_tile_data - + dy_arr.colwise().sum() / sample_size * inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size) * + ddscale_tile_data; + } + } + if (dscale) { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx + dev_ctx.template Alloc(dscale); + set_constant(dev_ctx, dscale, static_cast(0)); + EigenVectorArrayMap dscale_arr(dscale->data(), C); + if (ddX) { + DenseTensor first_grad; + first_grad.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&first_grad); + set_constant(dev_ctx, &first_grad, static_cast(0)); + EigenArrayMap first_grad_arr(first_grad.data(), sample_size, NxC); + first_grad_arr += + inv_var_tile_data * + (dy_arr - + dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr) + .colwise() + .sum() + .replicate(sample_size, 1) / + sample_size); + first_grad_arr = first_grad_arr * ddx_arr; + for (int nc = 0; nc < NxC; ++nc) { + int c = nc % C; + dscale_arr(c) += first_grad_arr.colwise().sum()(nc); + } + } + } + if (ddy) { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(h,w))) + dev_ctx.template Alloc(ddy); + set_constant(dev_ctx, ddy, static_cast(0)); + EigenArrayMap ddy_arr(ddy->data(), sample_size, NxC); + if (ddX) { + ddy_arr += scale_tile_data * inv_var_tile_data * + (ddx_arr - ddx_arr.colwise().sum() / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + } + if (ddScale && ddBias) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + DenseTensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddscale_tile); + EigenArrayMap ddscale_tile_data( + ddscale_tile.data(), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + DenseTensor ddbias_tile; + ddbias_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddbias_tile); + EigenArrayMap ddbias_tile_data( + ddbias_tile.data(), sample_size, NxC); + ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + ddy_arr += ddbias_tile_data; + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL(instance_norm_grad, + CPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double) {} 
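The NumPy-style comments inside InstanceNormGradKernel above are easier to read written out. With \hat{x} = (x - \mu)\,\sigma^{-1} the normalized input (tmp in the code), \sigma^{-1} the saved inverse standard deviation, and means/sums taken over the H x W positions of each (n, c) instance, the kernel computes, as a sketch of the same math:

\[
\frac{\partial L}{\partial b_c} = \sum_{n,h,w} dy, \qquad
\frac{\partial L}{\partial \gamma_c} = \sum_{n,h,w} \hat{x}\, dy,
\]
\[
\frac{\partial L}{\partial x} = \gamma\, \sigma^{-1}\Big( dy - \operatorname{mean}_{h,w}(dy) - \hat{x}\,\operatorname{mean}_{h,w}\big(dy\,\hat{x}\big) \Big),
\]

which matches dx_arr = scale * inv_var * (dy - dy_mean - tmp * mean(dy * tmp)) in the code, with scale defaulting to 1 when no scale tensor is passed.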
+PD_REGISTER_KERNEL(instance_norm_double_grad, + CPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc new file mode 100644 index 0000000000000..f89ecba901c04 --- /dev/null +++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_kernel.h" + +#include +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void InstanceNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon_f, + DenseTensor* y, + DenseTensor* saved_mean, + DenseTensor* saved_variance) { + const auto& x_dims = x.dims(); + T epsilon = static_cast(epsilon_f); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x.numel() / N / C; + auto* place = dev_ctx.eigen_device(); + + Eigen::DSizes shape(NxC, sample_size); +// Once eigen on Windows is updated, the if branch can be removed. 
+#ifndef EIGEN_HAS_INDEX_LIST + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); + Eigen::DSizes rdims(1); +#else + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); + Eigen::IndexList> rdims; +#endif + + phi::funcs::SetConstant set_constant; + dev_ctx.template Alloc(saved_mean); + dev_ctx.template Alloc(saved_variance); + set_constant(dev_ctx, saved_mean, static_cast(0)); + set_constant(dev_ctx, saved_variance, static_cast(0)); + + auto saved_mean_a = EigenVector::Flatten(*saved_mean); + auto saved_mean_e = saved_mean_a.reshape(NxC_shape); + auto saved_variance_a = EigenVector::Flatten(*saved_variance); + auto saved_variance_e = saved_variance_a.reshape(NxC_shape); + + auto x_e = EigenVector::Flatten(x); + auto x_arr = x_e.reshape(shape); + + saved_mean_e.device(*place) = x_arr.mean(rdims); + auto saved_variance_arr = + (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; + + saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_data; + DenseTensor bias_data; + if (!scale_ptr) { + scale_data.Resize({C}); + dev_ctx.template Alloc(&scale_data); + set_constant(dev_ctx, &scale_data, static_cast(1)); + } + + if (!bias_ptr) { + bias_data.Resize({C}); + dev_ctx.template Alloc(&bias_data); + set_constant(dev_ctx, &bias_data, static_cast(0)); + } + auto scale_e = + scale_ptr + ? EigenVector::Flatten(*scale_ptr) + : EigenVector::Flatten(const_cast(scale_data)); + auto scale_arr = scale_e.reshape(C_shape); + auto bias_e = + bias_ptr + ? EigenVector::Flatten(*bias_ptr) + : EigenVector::Flatten(const_cast(bias_data)); + auto bias_arr = bias_e.reshape(C_shape); + + dev_ctx.template Alloc(y); + auto y_e = EigenVector::Flatten(*y); + auto y_arr = y_e.reshape(shape); + + // (x - mean) * inv_std * scale + bias + Eigen::DSizes bcast_param(N, sample_size); + y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * + saved_variance_e.broadcast(bcast) * + scale_arr.broadcast(bcast_param) + + bias_arr.broadcast(bcast_param); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + instance_norm, CPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/mv_grad_kernel.cc b/paddle/phi/kernels/cpu/mv_grad_kernel.cc index c3b7f94be4194..c87801bb69389 100644 --- a/paddle/phi/kernels/cpu/mv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/mv_grad_kernel.cc @@ -31,7 +31,7 @@ void MvGradKernel(const Context& dev_ctx, auto dx = x_grad; auto dvec = vec_grad; - auto dim_x = x.dims(); + const auto& dim_x = x.dims(); int m = dim_x[0]; int n = dim_x[1]; diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index fbed3f1cb133a..715e6b008ed77 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -32,7 +32,7 @@ void PsroiPoolGradKernel(const Context& ctx, float spatial_scale, DenseTensor* dx) { if (dx) { - auto in_dims = x.dims(); + const auto& in_dims = x.dims(); int input_channels = in_dims[1]; int height = in_dims[2]; int width = in_dims[3]; diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index 06a458832d19f..35395dccca1af 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -20,7 +20,6 @@ #include 
"paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -118,7 +117,7 @@ void GetShuffledInput(const DeviceContext& dev_ctx, std::vector perm_axis(input.dims().size()); GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); - shuffled_input->ResizeAndAllocate(shuffled_dims); + shuffled_input->Resize(shuffled_dims); dev_ctx.template Alloc(shuffled_input); phi::funcs::TransposeNormal trans; @@ -132,10 +131,7 @@ void HandleLargeDim(const DeviceContext& dev_ctx, const std::vector& dims, bool keep_dim) { // shuffle the reduced dim to the end - phi::DenseTensor shuffled_input = phi::DenseTensor( - phi::make_intrusive(input.place()), - input.meta()); - + phi::DenseTensor shuffled_input; GetShuffledInput(dev_ctx, input, &shuffled_input, dims); // transpose to 2D tensor whose shape is {unreduced, reduced}. diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h index 961bc7a214be5..ab6f98ffcd5d6 100644 --- a/paddle/phi/kernels/cpu/rnn_functor.h +++ b/paddle/phi/kernels/cpu/rnn_functor.h @@ -330,7 +330,7 @@ void RnnFunc(const Context& dev_ctx, } } - DenseTensor* input_holder; + DenseTensor* input_holder = nullptr; DenseTensor* output_holder = output; bool has_allocate_mem = false; diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index cae97eb076453..ae2c7a72635f7 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -808,7 +808,7 @@ struct BidirLayer : public Layer { mode, is_test); - // concat the the output result + // concat the output result funcs::ConcatFunctor concat_functor; concat_functor(dev_ctx, output_vec, static_cast(2), output); } diff --git a/paddle/phi/kernels/cpu/tril_indices_kernel.cc b/paddle/phi/kernels/cpu/tril_indices_kernel.cc new file mode 100644 index 0000000000000..71c5cd820b383 --- /dev/null +++ b/paddle/phi/kernels/cpu/tril_indices_kernel.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/tril_indices_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void TrilIndicesKernel(const Context& dev_ctx, + int rows, + int cols, + int offset, + DataType dtype, + DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + const auto& out_dims = out->dims(); + int64_t tril_size = out_dims[1]; + int64_t i = 0; + T r = std::max(0, -offset), c = 0; + while (i < tril_size) { + out_data[i] = r; + out_data[tril_size + i++] = c; + + // move to the next column and check if (r, c) is still in bound + c += 1; + if (c > r + offset || c >= cols) { + r += 1; + c = 0; + // NOTE: not necessary to check if r is less than row here, because i + // and tril_size provide the guarantee + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + tril_indices, CPU, ALL_LAYOUT, phi::TrilIndicesKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc index 91a6903418230..c95a8f4ded6dc 100644 --- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -54,7 +54,6 @@ void UniformRandomRawKernel(const Context &dev_ctx, float diag_val, DenseTensor *out) { out->Resize(phi::make_ddim(shape.GetData())); - VLOG(4) << out->dims(); T *data = dev_ctx.template Alloc(out); auto size = out->numel(); std::shared_ptr engine; diff --git a/paddle/phi/kernels/einsum_grad_kernel.h b/paddle/phi/kernels/einsum_grad_kernel.h index 5c1970e775825..06785c8532e70 100644 --- a/paddle/phi/kernels/einsum_grad_kernel.h +++ b/paddle/phi/kernels/einsum_grad_kernel.h @@ -21,6 +21,7 @@ namespace phi { template void EinsumGradKernel(const Context& dev_ctx, const std::vector& x, + const std::vector& inner_cache, const DenseTensor& out_grad, const std::string& equation, std::vector x_grad); diff --git a/paddle/phi/kernels/einsum_kernel.h b/paddle/phi/kernels/einsum_kernel.h index 3d9e8feda748d..87df2b1c64a4a 100644 --- a/paddle/phi/kernels/einsum_kernel.h +++ b/paddle/phi/kernels/einsum_kernel.h @@ -24,4 +24,11 @@ void EinsumKernel(const Context& dev_ctx, const std::string& equation, DenseTensor* out); +template +void EinsumKernelRaw(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache); + } // namespace phi diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 6f2f2915ecf9e..b1e6ecaee6746 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -55,6 +55,15 @@ void MinimumGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy); +template +void ElementwiseHeavisideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + template void ElementwisePowGradKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 9d608cd86a6f7..5e29eb5ace675 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -64,6 +64,15 @@ void ElementwisePowKernel(const Context& dev_ctx, ElementwisePowRawKernel(dev_ctx, x, y, axis, out); } +template +void ElementwiseHeavisideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; 
+ ElementwiseHeavisideRawKernel(dev_ctx, x, y, axis, out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -91,6 +100,14 @@ PD_REGISTER_KERNEL( modulo, CPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( floor_divide, CPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow, CPU, ALL_LAYOUT, @@ -126,6 +143,14 @@ PD_REGISTER_KERNEL( modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside, + GPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index 37fe895d4051f..a39da52e7e3b5 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -98,6 +98,19 @@ void ElementwisePowKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); +template +void ElementwiseHeavisideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void ElementwiseHeavisideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + template DenseTensor Maximum(const Context& dev_ctx, const DenseTensor& x, @@ -142,6 +155,17 @@ DenseTensor FloorDivide(const Context& dev_ctx, return dense_out; } +template +DenseTensor ElementwiseHeaviside(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + ElementwiseHeavisideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + template DenseTensor ElementwisePow(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index b75477a1af982..f80117ccec799 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -1428,16 +1428,19 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); auto dout = EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); - auto d_ddOut = EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); auto d_dOutNew = EigenVector::Flatten(GET_DATA_SAFELY( d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); if (d_Out_New) { auto d_OutNew = EigenVector::Flatten(GET_DATA_SAFELY( d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); - d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - - static_cast(2) * dout * ddx * d_dOutNew; + d_OutNew.device(*d) = -static_cast(2) * dout * ddx * d_dOutNew; + if (d_DDOut) { + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + d_OutNew.device(*d) = + (ddx - static_cast(2) * out * ddx) * d_ddOut + d_OutNew; + } } if (d_d_Out) { auto d_dOut = EigenVector::Flatten( @@ -1449,8 +1452,12 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { auto d_ddx = EigenVector::Flatten( GET_DATA_SAFELY(d_DDx, "Output", 
"D_DDx", "SigmoidTripleGrad")); d_ddx.device(*d) = - (static_cast(1) - out) * out * d_ddOut + (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; + if (d_DDOut) { + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + d_ddx.device(*d) = d_ddx + (static_cast(1) - out) * out * d_ddOut; + } } } static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -1825,6 +1832,196 @@ struct ZeroGradFunctor : public BaseActivationFunctor { } }; +template +struct SqrtGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* dX, + const DenseTensor* ddX, + DenseTensor* dOut, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SqrtGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Output", "Out", "SqrtGradGrad")); + // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx + // calculate dy first, so ddy can inplace ddx + if (dOut) { + auto dx = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "SqrtGradGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "SqrtGradGrad")); + dout.device(*d) = dx * ddx * static_cast(-1) / out; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SqrtGradGrad")); + ddout.device(*d) = ddx * static_cast(0.5) / out; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct RsqrtGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* dX, + const DenseTensor* ddX, + DenseTensor* dOut, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "RsqrtGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Output", "Out", "RsqrtGradGrad")); + + // rsqrt GradGrad: ddy = -0.5 * ddx * y * y * y, dy = (3/y) * dx * ddx + if (dOut) { + auto dx = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "RsqrtGradGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "RsqrtGradGrad")); + dout.device(*d) = (static_cast(3.0) / out) * dx * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "RsqrtGradGrad")); + ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + (x < static_cast(0)) + .select(static_cast(alpha) * + ((x / static_cast(alpha)).exp() - static_cast(1)), + x); + } +}; + +template +struct CELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp_a_pos = static_cast(alpha > 0); + auto temp_a_neg = static_cast(alpha <= 0); + auto temp_x_pos = (x > static_cast(0)).template cast(); + auto temp_x_neg = (x <= static_cast(0)).template cast(); + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * 
(x/alpha).exp(), if alpha > 0 and x <= 0 + // dx = dout , if alpha < 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 + dx.device(d) = + dout * temp_a_pos * temp_x_pos + + dout * (x / static_cast(alpha)).exp() * temp_a_pos * temp_x_neg + + dout * temp_a_neg * temp_x_pos + + dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CELUGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* dOut, + const DenseTensor* ddX, + DenseTensor* dX, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "CELUGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "CELUGradGrad")); + + if (dX) { + auto dx = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "CELUGradGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "CELUGradGrad")); + dx.device(*d) = ddx * dout / static_cast(alpha) * + (x / static_cast(alpha)).exp() * + (x <= static_cast(0)).template cast(); + } + + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "CELUGradGrad")); + ddout.device(*d) = ddx * + ((x > static_cast(0)).template cast() + + (x / static_cast(alpha)).exp() * + (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct SquareGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* dOut, + const DenseTensor* ddX, + DenseTensor* dX, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SquareGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "SquareGradGrad")); + // square GradGrad: ddy=2x*ddx, dx=2dy*ddx + // calculate dx first, so ddy can inplace ddx + if (dX) { + auto dx = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "SquareGradGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "SquareGradGrad")); + dx.device(*d) = ddx * static_cast(2) * dout; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + ddout.device(*d) = ddx * static_cast(2) * x; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public BaseActivationFunctor { @@ -3084,6 +3281,59 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { } }; +template +struct CudaCELUFunctor : public BaseActivationFunctor { + using CT = typename phi::dtype::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1)) + __device__ __forceinline__ T operator()(const T arg_x) const { + CT x = static_cast(arg_x); + CT temp = static_cast(alpha) * (exp(x / static_cast(alpha)) - one); + CT res = (x > zero ? 
x : zero) + (temp > zero ? zero : temp); + return static_cast(res); + } +}; + +template +struct CudaCELUGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + MPType one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 + // dx = dout , if alpha < 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType a = static_cast(alpha); + MPType temp_a_pos = static_cast(alpha > 0.0f); + MPType temp_a_neg = static_cast(alpha <= 0.0f); + MPType temp_x_pos = static_cast(x > zero); + MPType temp_x_neg = static_cast(x <= zero); + return static_cast( + dout * + (temp_a_pos * temp_x_pos + temp_a_pos * temp_x_neg * exp(x / a) + + temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index cd7a24c6d2421..e2b16a1eb7ff1 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -21,6 +21,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" DECLARE_bool(enable_cublas_tensor_op_math); +DECLARE_bool(gemm_use_half_precision_compute_type); namespace phi { namespace funcs { @@ -2255,8 +2256,25 @@ void Blas::BatchedGEMM( } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + VLOG(4) << "use_half_precision_compute_type: " + << FLAGS_gemm_use_half_precision_compute_type; auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; + cudaDataType_t compute_type = fp; + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + void *a = static_cast(&h_alpha); + void *b = static_cast(&h_beta); + // set ComputeType as CUDA_R_32F for fp16, for better accuracy + if (FLAGS_gemm_use_half_precision_compute_type == true && + std::is_same::value) { + a = static_cast(&alpha); + b = static_cast(&beta); + compute_type = CUDA_R_16F; + } + + // set ComputeType as CUDA_R_32F for fp16 and fp32, for better accuracy context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cublasGemmStridedBatchedEx(handle, @@ -2265,7 +2283,7 @@ void Blas::BatchedGEMM( N, M, K, - &alpha, + a, B, fp, ldb, @@ -2274,13 +2292,13 @@ void Blas::BatchedGEMM( fp, lda, strideA, - &beta, + b, C, fp, ldc, strideC, batchCount, - fp, + compute_type, algo)); }); } else { @@ -2348,8 +2366,24 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + VLOG(4) << "use_half_precision_compute_type: " + << FLAGS_gemm_use_half_precision_compute_type; auto fp = std::is_same::value ? 
CUDA_R_32F : CUDA_R_16F; + cudaDataType_t compute_type = CUDA_R_32F; + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + void *a = static_cast(&h_alpha); + void *b = static_cast(&h_beta); + // set ComputeType as CUDA_R_32F for fp16, for better accuracy + if (FLAGS_gemm_use_half_precision_compute_type == true && + std::is_same::value) { + a = static_cast(&alpha); + b = static_cast(&beta); + compute_type = CUDA_R_16F; + } + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS( paddle::platform::dynload::cublasGemmStridedBatchedEx(handle, @@ -2358,7 +2392,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, N, M, K, - &alpha, + a, B, fp, ldb, @@ -2367,13 +2401,13 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, fp, lda, strideA, - &beta, + b, C, fp, ldc, strideC, batchCount, - fp, + compute_type, algo)); }); } else { diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 2868aa5acb75e..db4796b3f61ca 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1284,9 +1284,9 @@ void Blas::MatMul(const phi::DenseTensor &mat_a, T alpha, phi::DenseTensor *mat_out, T beta) const { - auto dim_a = mat_a.dims(); - auto dim_b = mat_b.dims(); - auto dim_out = mat_out->dims(); + const auto &dim_a = mat_a.dims(); + const auto &dim_b = mat_b.dims(); + const auto &dim_out = mat_out->dims(); PADDLE_ENFORCE_EQ( dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true, diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index aafa40a3d01bf..ecdfa7abcfd42 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -53,7 +53,7 @@ struct DimensionsTransform { PADDLE_THROW(phi::errors::InvalidArgument( "The %d-th dimension of input tensor is expected to be equal " "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", + "received %d.", in_idx + 1, axis + 1, out_dims[axis], @@ -70,7 +70,7 @@ struct DimensionsTransform { PADDLE_THROW(phi::errors::InvalidArgument( "The %d-th dimension of input tensor is expected to be equal " "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", + "received %d.", in_idx + 1, in_idx + 1, out_dims[in_idx], @@ -223,22 +223,54 @@ struct DimensionsTransform { } }; -template +template +int GetVecsize(const std::vector &ins, + std::vector *outs) { + int in_vec_size = 4; + int out_vec_size = 4; + if (NumOuts > 1) { + for (int i = 0; i < NumOuts; ++i) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + phi::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, but " + "%d-th output tensor`s shape is not.", + i)); + out_vec_size = std::min( + phi::GetVectorizedSize((*outs)[i]->data()), out_vec_size); + } + } else { + out_vec_size = phi::GetVectorizedSize((*outs)[0]->data()); + } + + for (auto *in : ins) { + auto temp_size = phi::GetVectorizedSize(in->data()); + in_vec_size = in->dims() == (*outs)[0]->dims() + ? 
std::min(temp_size, in_vec_size) + : in_vec_size; + } + return std::min(out_vec_size, in_vec_size); +} + +template __device__ __forceinline__ void LoadData( T *dst, const _ptr_ T *src, uint32_t block_offset, - const kps::details::BroadcastConfig &config, + const kps::details::BroadcastConfig &config, int numel, int num, - int need_broadcast) { + int need_broadcast, + int read_lens) { // numel : whole num of output // num: how many data will be deal with in this time if (need_broadcast) { - kps::ReadDataBc( - dst, src, block_offset, config, numel); + kps::ReadDataBc( + dst, src, block_offset, config, numel, read_lens); } else { - kps::ReadData(dst, src + block_offset, num); + kps::ReadData( + dst, src + block_offset, num, read_lens); } } @@ -248,30 +280,31 @@ template __device__ void VectorizedBroadcastKernelImpl( const phi::Array &ins, phi::Array<_ptr_ OutT *, NumOuts> outs, const phi::Array &use_broadcast, uint32_t numel, - const phi::Array, Arity> &configs, + const phi::Array &configs, int num, int block_offset, + int read_lens, Functor func) { - InT args[Arity][VecSize]; - ConditionalT result[VecSize]; + __simd__ InT args[Arity][VecSize]; + __simd__ ConditionalT result[VecSize]; #pragma unroll for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); - LoadData(args[i], - ins[i], - block_offset, - configs[i], - numel, - num, - use_broadcast[i]); + kps::Init(args[i], static_cast(1.0f), read_lens); + LoadData(args[i], + ins[i], + block_offset, + configs[i], + numel, + num, + use_broadcast[i], + read_lens); } constexpr bool kCallElementwiseAny = paddle::platform::FunctionTraits::has_pointer_args; @@ -281,10 +314,10 @@ __device__ void VectorizedBroadcastKernelImpl( Functor, Arity, kCallElementwiseAny>()( - func, args, result); - - phi::funcs::ElementwiseWriteDataCaller()( - outs, result, block_offset, num); + func, args, result, read_lens); + phi::funcs:: + ElementwiseWriteDataCallerBc()( + outs, result, block_offset, num, read_lens); } template + int VecSize> __global__ void VectorizedBroadcastKernel( phi::Array ins, phi::Array<_ptr_ OutT *, NumOuts> outs, phi::Array use_broadcast, uint32_t numel, - phi::Array, Arity> configs, + phi::Array configs, int main_offset, int tail_tid, + int read_lens, Functor func) { - int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int block_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens; + int stride = BLOCK_NUM_X * GRID_NUM_X * read_lens; #ifdef PADDLE_WITH_XPU_KP for (; block_offset < main_offset; block_offset += stride) { @@ -314,14 +347,14 @@ __global__ void VectorizedBroadcastKernel( Arity, NumOuts, VecSize, - Rank, false>(ins, outs, use_broadcast, numel, configs, - BLOCK_NUM_X * VecSize, + BLOCK_NUM_X * read_lens, block_offset, + read_lens, func); } int num = numel - block_offset; @@ -332,9 +365,15 @@ __global__ void VectorizedBroadcastKernel( Arity, NumOuts, VecSize, - Rank, - true>( - ins, outs, use_broadcast, numel, configs, num, block_offset, func); + true>(ins, + outs, + use_broadcast, + numel, + configs, + num, + block_offset, + read_lens, + func); } #else if (block_offset < main_offset) { @@ -344,7 +383,6 @@ __global__ void VectorizedBroadcastKernel( Arity, NumOuts, VecSize, - Rank, false>(ins, outs, use_broadcast, @@ -352,6 +390,7 @@ __global__ void VectorizedBroadcastKernel( configs, BLOCK_NUM_X * VecSize, block_offset, + read_lens, func); } else { VectorizedBroadcastKernelImpl( - ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); + true>(ins, + 
outs, + use_broadcast, + numel, + configs, + tail_tid, + block_offset, + read_lens, + func); } #endif } @@ -372,15 +417,14 @@ template -void LaunchBroadcastKernel(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func, - DimensionsTransform merge_dims) { + int VecSize> +void LaunchBroadcastKernel( + const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func, + const phi::Array &configs) { int numel = (*outs)[0]->numel(); - phi::Array, Arity> configs; phi::Array use_broadcast; phi::Array ins_data; phi::Array<_ptr_ OutT *, NumOuts> outs_data; @@ -392,96 +436,41 @@ void LaunchBroadcastKernel(const KPDevice &ctx, for (int i = 0; i < Arity; i++) { use_broadcast[i] = (ins[i]->numel() != numel); ins_data[i] = (const _ptr_ InT *)(ins[i]->data()); - if (use_broadcast[i]) { - // get the broadcast config, - // if data shape is[m, n], then you should set data_dim = {n, m} - // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} - configs[i] = kps::details::BroadcastConfig( - merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); - } } #ifdef PADDLE_WITH_XPU_KP const int threads = 64; const int blocks = 8; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); + int read_lens = configs[0].buf_len; auto stream = ctx.x_context()->xpu_stream; - VectorizedBroadcastKernel<<>>(ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); + int main_offset = (numel / (read_lens * threads)) * read_lens * threads; + int tail_tid = numel % (read_lens * threads); #else - const int threads = 256; - int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); + int read_lens = VecSize; auto stream = ctx.stream(); + auto threads = gpu_config.thread_per_block; + auto blocks = gpu_config.block_per_grid; + int main_offset = (numel / (read_lens * gpu_config.GetBlockSize())) * + read_lens * gpu_config.GetBlockSize(); + int tail_tid = numel % (read_lens * gpu_config.GetBlockSize()); +#endif VectorizedBroadcastKernel<<>>(ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); -#endif -} - -template -void BroadcastKernelForDifferentDimSize( - const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - int axis, - Functor func) { - const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); - -#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \ - case rank: { \ - LaunchBroadcastKernel( \ - ctx, ins, outs, func, merge_dims); \ - } break; - - switch (merge_dims.dim_size) { - CALL_BROADCAST_FOR_DIM_SIZE(1); - CALL_BROADCAST_FOR_DIM_SIZE(2); - CALL_BROADCAST_FOR_DIM_SIZE(3); - CALL_BROADCAST_FOR_DIM_SIZE(4); - CALL_BROADCAST_FOR_DIM_SIZE(5); - CALL_BROADCAST_FOR_DIM_SIZE(6); - CALL_BROADCAST_FOR_DIM_SIZE(7); - CALL_BROADCAST_FOR_DIM_SIZE(8); - default: { - PADDLE_THROW(phi::errors::InvalidArgument( - "The maximum dimension of input tensor is expected to be less than " - "%d, but recieved %d.", - merge_dims.dim_size, - phi::DDim::kMaxRank)); - } - } -#undef CALL_BROADCAST_FOR_DIM_SIZE + VecSize><<>>( + ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + read_lens, + func); } template ; const int kArity = Traits::has_pointer_args ? 
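// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: main_offset/tail_tid above split
// numel into full tiles of threads * read_lens elements (handled by the
// unchecked vectorized path) plus a remainder (handled by the boundary-checked
// path). A host model of the split; SplitForTiles is a hypothetical name.
struct TileSplit {
  int main_offset;  // first element not covered by full tiles
  int tail;         // leftover elements, 0 <= tail < threads * read_lens
};

inline TileSplit SplitForTiles(int numel, int threads, int read_lens) {
  const int tile = threads * read_lens;
  return {(numel / tile) * tile, numel % tile};
}
// e.g. numel = 1000, threads = 64, read_lens = 4:
//      tile = 256, main_offset = 768, tail = 232, and 768 + 232 == 1000.
// ---------------------------------------------------------------------------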
static_cast(ET) : Traits::arity; - PADDLE_ENFORCE_EQ(ins.size(), - kArity, - phi::errors::InvalidArgument( - "The number of inputs is expected to be equal to the " - "arity of functor. But recieved: the number of inputs " - "is %d, the arity of functor is %d.", - ins.size(), - kArity)); - PADDLE_ENFORCE_LE(kArity, - 3, - phi::errors::InvalidArgument( - "Currently only broadcast of ternary is supported " - "and verified, but received %d.", - kArity)); - PADDLE_ENFORCE_EQ(outs->size(), - NumOuts, - phi::errors::InvalidArgument( - "Number of outputs shall equal to number of functions, " - "but number of outputs is %d, of functions is %d.", - outs->size(), - NumOuts)); - int in_vec_size = 4; - int out_vec_size = 4; - if (NumOuts > 1) { - for (int i = 0; i < NumOuts; ++i) { - PADDLE_ENFORCE_EQ( - (*outs)[i]->dims(), - (*outs)[0]->dims(), - phi::errors::InvalidArgument( - "The shape of each output tensor shall be identical yet, but " - "%d-th output tensor`s shape is not.", - i)); - out_vec_size = std::min( - phi::GetVectorizedSize((*outs)[i]->data()), out_vec_size); - } - } else { - out_vec_size = phi::GetVectorizedSize((*outs)[0]->data()); - } + PADDLE_ENFORCE_EQ( + ins.size(), + kArity, + phi::errors::InvalidArgument("The number of inputs is expected to be " + "equal to the " + "arity of functor. But recieved: the " + "number of inputs " + "is %d, the arity of functor is %d.", + ins.size(), + kArity)); + PADDLE_ENFORCE_LE( + kArity, + 3, + phi::errors::InvalidArgument("Currently only broadcast of ternary is " + "supported " + "and verified, but received %d.", + kArity)); + PADDLE_ENFORCE_EQ( + outs->size(), + NumOuts, + phi::errors::InvalidArgument("Number of outputs shall equal to number " + "of functions, " + "but number of outputs is %d, of " + "functions is %d.", + outs->size(), + NumOuts)); + + // mergedim and get vec_size + const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); + phi::Array configs; - for (auto *in : ins) { - auto temp_size = phi::GetVectorizedSize(in->data()); - in_vec_size = in->dims() == (*outs)[0]->dims() - ? std::min(temp_size, in_vec_size) - : in_vec_size; +// get vec_size +#ifdef PADDLE_WITH_XPU_KP + PADDLE_ENFORCE_EQ( + ins.size(), + 2, + phi::errors::InvalidArgument( + "XPU only support inputs is 2, but received %d", ins.size())); + configs[0] = kps::details::BroadcastConfig(merge_dims.out_dims, + merge_dims.in_dims[0], + merge_dims.in_dims[1], + merge_dims.dim_size); + configs[1] = kps::details::BroadcastConfig(merge_dims.out_dims, + merge_dims.in_dims[1], + merge_dims.in_dims[0], + merge_dims.dim_size); + auto type = kps::details::OptType::CanNotOptimize; + bool is_optimize = configs[0].cmp_type != type; + int vec_size = is_optimize ? VecSizeL : VecSizeM; +#else + for (int i = 0; i < kArity; i++) { + // get the broadcast config, + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. 
then out_dims = {1, 45, 3} + if (ins[i]->numel()) { + configs[i] = kps::details::BroadcastConfig( + merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); + } } - int vec_size = std::min(out_vec_size, in_vec_size); + int vec_size = GetVecsize(ins, outs); +#endif switch (vec_size) { - case 4: { - BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + case VecSizeL: { + LaunchBroadcastKernel( + ctx, ins, outs, func, configs); break; } - case 2: { - BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + case VecSizeM: { + LaunchBroadcastKernel( + ctx, ins, outs, func, configs); break; } - case 1: { - BroadcastKernelForDifferentDimSize(ctx, ins, outs, axis, func); + case VecSizeS: { + LaunchBroadcastKernel( + ctx, ins, outs, func, configs); break; } default: { @@ -593,26 +585,16 @@ void BroadcastKernel(const KPDevice &ctx, Functor func) { std::vector dims_size; dims_size.reserve(ins.size()); - bool no_broadcast_flag = true; for (auto *in : ins) { - no_broadcast_flag &= ins[0]->dims() == in->dims(); dims_size.emplace_back(in->dims().size()); } - if (ins.size() > 0 && outs->size() > 0) { - no_broadcast_flag &= outs->at(0)->dims() == ins[0]->dims(); - } - - if (no_broadcast_flag) { - phi::funcs::ElementwiseKernel(ctx, ins, outs, func); - } else { - axis = axis == -1 - ? *std::max_element(dims_size.begin(), dims_size.end()) - - *std::min_element(dims_size.begin(), dims_size.end()) - : axis; - BroadcastKernelForDifferentVecSize( - ctx, ins, outs, axis, func); - } + axis = axis == -1 + ? *std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; + BroadcastKernelForDifferentVecSize( + ctx, ins, outs, axis, func); } template diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 332ec0b0312da..1093bdfa726c8 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -577,14 +577,16 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseAny( result, args, func); } @@ -594,7 +596,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseConstant(result, func); } }; @@ -603,7 +606,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseUnary( result, args[0], func); } @@ -613,9 +617,10 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseBinary( - result, args[0], args[1], func); + result, args[0], args[1], func, read_lens); } }; @@ -623,7 +628,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseTernary( result, args[0], args[1], args[2], func); } @@ -696,6 +702,42 @@ struct ElementwiseWriteDataCaller { } }; +template +struct ElementwiseWriteDataCallerBc { + __device__ __forceinline__ void operator()( + phi::Array<_ptr_ OutT *, NumOuts> outs, + ConditionalT src[VecSize], + int block_offset, + int num, + int read_lens) { + OutT dst[NumOuts][VecSize]; 
+#pragma unroll + for (int i = 0; i < read_lens; ++i) { +#pragma unroll + for (int j = 0; j < NumOuts; ++j) { + dst[j][i] = (src[i])[j]; + } + } +#pragma unroll + for (int i = 0; i < NumOuts; ++i) { + kps::WriteData( + outs[i] + block_offset, dst[i], num, read_lens); + } + } +}; + +template +struct ElementwiseWriteDataCallerBc { + __device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs, + OutT src[VecSize], + int block_offset, + int num, + int read_lens) { + kps::WriteData( + outs[0] + block_offset, src, num, read_lens); + } +}; + template +struct ElementwiseHeavisideFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return a == static_cast(0) ? b : static_cast(a > 0); + } +}; + template struct FloorDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { diff --git a/paddle/phi/kernels/funcs/norm_utils.h b/paddle/phi/kernels/funcs/norm_utils.h new file mode 100644 index 0000000000000..2d0a879e41c78 --- /dev/null +++ b/paddle/phi/kernels/funcs/norm_utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +inline void ExtractNCWHD(const phi::DDim &dims, + const DataLayout &data_layout, + int *N, + int *C, + int *H, + int *W, + int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? 
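// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: ElementwiseHeavisideFunctor above
// follows numpy.heaviside semantics, H(a, b) = 0 for a < 0, b for a == 0,
// 1 for a > 0. A host check with hypothetical names:
#include <cassert>

template <typename T>
T HeavisideRef(T a, T b) {
  return a == T(0) ? b : static_cast<T>(a > T(0));
}

inline void HeavisideRefCheck() {
  assert(HeavisideRef(-2.0, 0.5) == 0.0);
  assert(HeavisideRef(0.0, 0.5) == 0.5);  // second operand only matters at a == 0
  assert(HeavisideRef(3.0, 0.5) == 1.0);
}
// ---------------------------------------------------------------------------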
dims[4] : dims[3]) + : 1; + } +} +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 42fee14488373..df14b0a21f24d 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -473,7 +473,11 @@ struct ReduceConfig { bool not_higher = x_dim[0] >= max_grid_z; #endif if (reduce_last_dim && (reduce_rank == 1)) { +#ifdef PADDLE_WITH_XPU_KP + reduce_type = static_cast(ReduceType::kReduceAny); +#else reduce_type = static_cast(ReduceType::kReduceLastDim); +#endif } else if (reduce_rank == 1) { reduce_type = static_cast(ReduceType::kReduceHigherDim); if (rank == 3 && not_higher) { @@ -588,7 +592,7 @@ struct ReduceConfig { void SetBlockDim() { // init should_reduce_again = false; - dim3 block_dim; + dim3 block_dim(1, 1, 1); dim3 grid_dim(left_num, 1, 1); blocking_size = reduce_num; diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 944cd2ea1024d..1479fd494435d 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -221,6 +221,9 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, CudaMishGradFunctor, threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, + CudaCELUGradFunctor, + alpha); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, CudaBReluGradFunctor, @@ -351,7 +354,9 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_double_grad, SqrtDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_double_grad, RsqrtDoubleGradKernel) PD_REGISTER_KERNEL(exp_grad, GPU, @@ -396,6 +401,16 @@ PD_REGISTER_KERNEL(square_grad, int64_t, phi::dtype::float16, phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(square_double_grad, + GPU, + ALL_LAYOUT, + phi::SquareDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) @@ -418,6 +433,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(round_grad, RoundGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(floor_grad, FloorGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(ceil_grad, CeilGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(celu_grad, CeluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(celu_double_grad, CeluDoubleGradKernel) PD_REGISTER_KERNEL(pow_grad, GPU, diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 8cc546ba73a06..8db31c5ed5b79 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -118,8 +118,8 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Swish, CudaSwishFunctor, beta) - DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Mish, CudaMishFunctor, threshold) 
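// ---------------------------------------------------------------------------
// Worked example, not part of the patch: the ExtractNCWHD helper added above
// in paddle/phi/kernels/funcs/norm_utils.h normalises a 2-D..5-D shape into
// N, C, H, W, D for either layout, with missing spatial dims set to 1. These
// values follow directly from the function body:
//   dims = {8, 3, 32, 32}, layout = kNCHW  ->  N=8, C=3,  H=32, W=32, D=1
//   dims = {8, 32, 32, 3}, layout = kNHWC  ->  N=8, C=3,  H=32, W=32, D=1
//   dims = {8, 16},        any layout      ->  N=8, C=16, H=1,  W=1,  D=1
// ---------------------------------------------------------------------------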
+DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Celu, CudaCELUFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b) @@ -234,6 +234,7 @@ PD_REGISTER_KERNEL(square, int64_t, phi::dtype::float16, phi::dtype::bfloat16) {} + PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) @@ -251,6 +252,7 @@ PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel) PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel) PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel) PD_REGISTER_ACTIVATION_KERNEL(ceil, CeilKernel) +PD_REGISTER_ACTIVATION_KERNEL(celu, CeluKernel) PD_REGISTER_KERNEL(pow, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index ad3b8579ddf67..e808ef644a246 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/norm_utils.cu.h" -#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/layout_utils.h" @@ -351,7 +351,7 @@ void BatchNormGradRawKernel(const Context &ctx, x_dims.size(), x_dims)); int N, C, H, W, D; - paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // init output if (d_x) { diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 361e62e566035..e2aeec723628c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/fluid/operators/norm_utils.cu.h" -#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/layout_utils.h" @@ -179,7 +179,7 @@ void BatchNormKernel(const Context &ctx, ctx.template Alloc(y); int N, C, H, W, D; - paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); auto dtype = paddle::platform::CudnnDataType::type; diff --git a/paddle/phi/kernels/gpu/bce_loss_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_kernel.cu index adbcd3b2b6207..b190bce474280 100644 --- a/paddle/phi/kernels/gpu/bce_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/bce_loss_kernel.cu @@ -38,7 +38,7 @@ struct BCELossFunctor { HOSTDEVICE inline T operator()(const T x, const T label) const { PADDLE_ENFORCE( (x >= static_cast(0)) && (x <= one), - "Input is expected to be within the interval [0, 1], but recieved %f.", + "Input is expected to be within the interval [0, 1], but received %f.", x); T term1 = max(phi::kps::details::Log(x), neg_100); T term2 = max(phi::kps::details::Log(one - x), neg_100); diff --git a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu index c8a8745f34522..6ca8dbd9205d8 100644 --- a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu @@ -18,5 +18,10 @@ #include "paddle/phi/core/kernel_registry.h" #include 
"paddle/phi/kernels/impl/einsum_grad_impl.h" -PD_REGISTER_KERNEL( - einsum_grad, GPU, ALL_LAYOUT, phi::EinsumGradKernel, float, double) {} +PD_REGISTER_KERNEL(einsum_grad, + GPU, + ALL_LAYOUT, + phi::EinsumGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu index d73e154eb40f7..d1f4c6590387a 100644 --- a/paddle/phi/kernels/gpu/einsum_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_kernel.cu @@ -18,4 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, GPU, ALL_LAYOUT, phi::EinsumKernel, float, double) {} +PD_REGISTER_KERNEL(einsum, + GPU, + ALL_LAYOUT, + phi::EinsumKernelRaw, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index c814e7b3bb63d..3e7430fd84eaf 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -128,6 +128,16 @@ PD_REGISTER_KERNEL(minimum_grad, int64_t, phi::dtype::float16, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(elementwise_heaviside_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideGradKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(elementwise_pow_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu index 96ebc0353ef24..b80634357d62f 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -59,34 +59,20 @@ void GaussianRandomKernel(const Context& dev_ctx, int seed, DataType dtype, DenseTensor* out) { - auto tensor = out; - - bool seed_flag = false; + out->Resize(phi::make_ddim(shape.GetData())); + dev_ctx.template Alloc(out); if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - tensor->Resize(phi::make_ddim(shape.GetData())); - - T* data = dev_ctx.template Alloc(tensor); - - int64_t size = tensor->numel(); - - int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { + // use global Generator seed using MT = typename phi::dtype::MPTypeTrait::Type; funcs::normal_distribution dist; funcs::normal_transform trans(static_cast(mean), static_cast(std)); - funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { + // use OP seed auto func = GaussianGenerator(static_cast(mean), static_cast(std), seed); - IndexKernel>(dev_ctx, tensor, func); + IndexKernel>(dev_ctx, out, func); } } diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu index 1e21f8d4267bc..1f33d5c901f29 100644 --- a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -81,11 +81,13 @@ void GeluGradKernel(const Context& dev_ctx, } } #endif - phi::funcs::BroadcastKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); + using Functor = GeluWithApproximateGradFunctor; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, Functor()); } else { - phi::funcs::BroadcastKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); + using Functor = GeluWithoutApproximateGradFunctor; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, 
Functor()); } } diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu index ce6dda2d6cc65..00dc58df0d826 100644 --- a/paddle/phi/kernels/gpu/gelu_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -71,11 +71,13 @@ void GeluKernel(const Context& dev_ctx, } } #endif - phi::funcs::BroadcastKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); + using Functor = GeluWithApproximateFunctor; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, Functor()); } else { - phi::funcs::BroadcastKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); + using Functor = GeluWithoutApproximateFunctor; + phi::funcs::ElementwiseKernel( + dev_ctx, ins, &outs, Functor()); } } diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu index 6b1e58981baa0..c0e557f09bcc9 100644 --- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -27,12 +27,9 @@ namespace cub = hipcub; #endif -#include -#include -#include -#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -144,27 +141,21 @@ struct GumbleNoiseGenerator { DenseTensor random_tensor; int64_t size = size_to_axis * size_from_axis; random_tensor.Resize(make_ddim({size})); - auto* random_data = ctx.template Alloc(&random_tensor); - thrust::counting_iterator index_sequence_begin(0); + T* random_data = ctx.template Alloc(&random_tensor); // generate gumbel noise int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy()) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, - index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); - } else { - const unsigned int seed = std::random_device()(); - thrust::transform(index_sequence_begin, - index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed)); - } + auto gen_cuda = ctx.GetGenerator(); + + auto seed_offset = gen_cuda->IncrementOffset(1); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + thrust::counting_iterator index_sequence_begin(0); + thrust::transform(index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(random_data), + UniformCUDAGenerator(0.00001, 1, seed, size * offset)); // add gumbel noise to X const int thread_size = 512; diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu new file mode 100644 index 0000000000000..387127de48dea --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -0,0 +1,642 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
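// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the gaussian_random and
// gumbel_softmax rewrites above share one contract - seed == 0 means "draw
// from the framework's global Generator" (stateful across calls), while a
// non-zero seed gives a reproducible per-op stream. A std:: analogue of that
// contract (GlobalGenerator/SampleNormal are hypothetical stand-ins, not the
// Philox-based device generator):
#include <cstdint>
#include <random>

inline std::mt19937_64& GlobalGenerator() {
  static std::mt19937_64 gen(std::random_device{}());
  return gen;  // shared state: successive calls keep advancing it
}

inline double SampleNormal(double mean, double stddev, std::uint64_t seed) {
  std::normal_distribution<double> dist(mean, stddev);
  if (seed == 0) {
    return dist(GlobalGenerator());
  }
  std::mt19937_64 op_gen(seed);  // same seed -> same sample every time
  return dist(op_gen);
}
// ---------------------------------------------------------------------------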
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" + +namespace phi { +template +static __global__ void GradComputeDX(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int sample_size, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + BatchNormParamType mean_val = mean[ncid]; + BatchNormParamType inv_var_val = variance[ncid]; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + BatchNormParamType dy_i = static_cast>(dy[i]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[i]) - mean_val); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = + BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] = + (static_cast>(dy[i]) - + dy_sum_val / static_cast>(sample_size) - + (static_cast>(x[i]) - mean_val) * + dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * + scale[c] * inv_var_val; + } +} + +static __device__ __forceinline__ float real_sqrt(float x) { + return 1. / sqrtf(x); +} +static __device__ __forceinline__ double real_sqrt(double x) { + return 1. 
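// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: GradComputeDX above evaluates,
// for each (n, c) slice of m = H*W*D elements, with mu = saved_mean and
// inv_std read from saved_variance (the kernel's inv_var_val):
//   dx_i = scale_c * inv_std *
//          (dy_i - mean(dy) - (x_i - mu) * inv_std^2 * mean(dy * (x - mu)))
// A plain host reference for one slice (names are mine, for checking only):
#include <cstddef>
#include <vector>

inline void InstanceNormDxSliceRef(const std::vector<float>& x,
                                   const std::vector<float>& dy,
                                   float mu, float inv_std, float scale_c,
                                   std::vector<float>* dx) {
  const std::size_t m = x.size();
  float dy_mean = 0.f, dy_xmu_mean = 0.f;
  for (std::size_t i = 0; i < m; ++i) {
    dy_mean += dy[i];
    dy_xmu_mean += dy[i] * (x[i] - mu);
  }
  dy_mean /= static_cast<float>(m);
  dy_xmu_mean /= static_cast<float>(m);
  dx->resize(m);
  for (std::size_t i = 0; i < m; ++i) {
    (*dx)[i] = scale_c * inv_std *
               (dy[i] - dy_mean -
                (x[i] - mu) * inv_std * inv_std * dy_xmu_mean);
  }
}
// ---------------------------------------------------------------------------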
/ sqrt(x); +} + +template +__global__ void DoubleGradComputeDX(const T *x, + const T *mean, + const T *variance, + const T *ddx, + const T *dy, + const T *scale, + const T *ddscale, + int C, + int sample_size, + const double epsilon, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T dy_sum_val; + __shared__ T ddx_sum_val; + __shared__ T dy_mul_ddx_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T ddx_sum = 0; + T dy_mul_ddx_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + T dy_i = dy[i]; + T tmp = x[i] - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val = ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += + ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + + 3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - dy[i]) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (ddx_sum_val / sample_size - ddx[i])) * + scale[c]; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - + (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * + var_val / sample_size) * + ddscale[c]; + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, + const T *mean, + const T *variance, + const T *ddscale, + const T *ddbias, + const T *ddx, + const T *scale, + int C, + int sample_size, + const double epsilon, + T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T ddx_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T ddx_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += scale[c] * var_val * + (ddx[i] - ddx_sum_val / sample_size - + (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * + var_val / sample_size); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += ddbias[c]; + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, + const T *mean, + const T *variance, + const T *ddx, + const T *dy, + int C, + int sample_size, + const double epsilon, + T *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ T dy_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T dy_i = dy[i]; + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = 
dy_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + T dscale_tmp = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dscale_tmp += + ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - + dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * + var_val * var_val / sample_size); + } + dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); + if (threadIdx.x == 0) { + dscale[ncid] += dscale_tmp; + } + __syncthreads(); + } +} + +template +void InstanceNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &d_y, + paddle::optional scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + float epsilon_f, + DenseTensor *d_x, + DenseTensor *d_scale, + DenseTensor *d_bias) { + double epsilon = static_cast(epsilon_f); + const auto *scale_ptr = scale.get_ptr(); + + const auto &x_dims = x.dims(); + + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + + DenseTensor x_tmp, d_y_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D}); + + dev_ctx.template Alloc(d_x); + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + } + if (scale_ptr) { + PADDLE_ENFORCE_EQ( + scale_ptr->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of scale's dimensions must be equal to 1. But " + "received: the size of scale's dimensions" + "is [%d]", + scale_ptr->dims().size())); + PADDLE_ENFORCE_EQ(scale_ptr->dims()[0], + C, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the first dimension of scale must be equal to " + "Channels([%d]). But received: " + "the first dimension of scale is [%d]," + "the dimensions of scale is [%s], ", + C, + scale_ptr->dims()[0], + scale_ptr->dims())); + } + + phi::funcs::SetConstant set_constant; + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(NxC, max_blocks); + const int grid1 = (C + block - 1) / block; + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + + DenseTensor d_scale_tmp; + d_scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_scale_tmp); + + DenseTensor d_bias_tmp; + d_bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_bias_tmp); + + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + + if ((H * W * D) == 1) { + phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); + phi::funcs::SetConstant> functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + 
PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + if (d_scale && d_bias) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenBatchNormalizationBackward( + dev_ctx.cudnn_handle(), + miopenBNSpatial, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } else { + if (d_x) { + GradComputeDX<<>>( + d_y.data(), + scale_tmp.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + H * W * D, + d_x->data()); + } + } + + if (d_scale && d_bias) { + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); + } + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +template +void InstanceNormDoubleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + paddle::optional scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon_f, + DenseTensor *dx, + DenseTensor *dscale, + DenseTensor *ddy) { + const auto *Scale = 
scale.get_ptr(); + const auto *ddX = ddx.get_ptr(); + const auto *ddScale = ddscale.get_ptr(); + const auto *ddBias = ddbias.get_ptr(); + const double epsilon = static_cast(epsilon_f); + const T *x_data = x.data(); + const T *dy_data = dy.data(); + const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); + const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); + const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data()); + const T *mean_data = saved_mean.data(); + const T *variance_data = saved_variance.data(); + phi::funcs::SetConstant set_zero; + auto &x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + const int n = x.numel(); + int sample_size = n / N / C; + + DenseTensor scale_tmp; + if (!Scale) { + scale_tmp.Resize({C}); + dev_ctx.template Alloc(&scale_tmp); + set_zero(dev_ctx, &scale_tmp, static_cast(1)); + } + const T *scale_data = Scale ? Scale->data() : scale_tmp.data(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = NxC; + const int grid1 = (C + block - 1) / block; + + if (dx) { + T *dx_data = dev_ctx.template Alloc(dx); + set_zero(dev_ctx, dx, static_cast(0)); + DoubleGradComputeDX<<>>( + x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + scale_data, + ddscale_data, + C, + sample_size, + epsilon, + dx_data); + } + if (dscale) { + DenseTensor dscale_tmp; + dscale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&dscale_tmp); + set_zero(dev_ctx, &dscale_tmp, static_cast(0)); + T *dscale_tmp_data = dscale_tmp.data(); + + T *dscale_data = dev_ctx.template Alloc(dscale); + set_zero(dev_ctx, dscale, static_cast(0)); + DoubleGradComputeDScale<<>>( + x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + C, + sample_size, + epsilon, + dscale_tmp_data); + add_param<<>>( + dscale_tmp.data(), dscale->data(), N, C); + } + if (ddy) { + T *ddy_data = dev_ctx.template Alloc(ddy); + set_zero(dev_ctx, ddy, static_cast(0)); + DoubleGradComputeDDY<<>>( + x_data, + mean_data, + variance_data, + ddscale_data, + ddbias_data, + ddx_data, + scale_data, + C, + sample_size, + epsilon, + ddy_data); + } +} +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {} +PD_REGISTER_KERNEL(instance_norm_double_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float) {} +#else +PD_REGISTER_KERNEL(instance_norm_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(instance_norm_double_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu new file mode 100644 index 0000000000000..81d9400750190 --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -0,0 +1,220 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" + +namespace phi { + +template +void InstanceNormKernel(const Context &dev_ctx, + const DenseTensor &x, + paddle::optional scale, + paddle::optional bias, + float epsilon_f, + DenseTensor *y, + DenseTensor *saved_mean, + DenseTensor *saved_variance) { + double epsilon = static_cast(epsilon_f); + auto &x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must greater than " + "or equal to 2. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must smaller than" + "or equal to 5. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + DenseTensor x_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + dev_ctx.template Alloc(y); + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + DenseTensor bias_tmp; + bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&bias_tmp); + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min((NxC + block - 1) / block, max_blocks); + + phi::funcs::SetConstant set_constant; + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + if (bias_ptr) { + repeat_param<<>>( + bias_ptr->data(), bias_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &bias_tmp, static_cast(0)); + } + + auto handle = dev_ctx.cudnn_handle(); + + phi::funcs::SetConstant> functor; + dev_ctx.template Alloc>(saved_mean); + dev_ctx.template Alloc>(saved_variance); + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenBatchNormalizationForwardTraining( + handle, + miopenBNSpatial, + const_cast( + static_cast(CudnnDataType::kOne())), + const_cast( + static_cast(CudnnDataType::kZero())), + data_desc_, + static_cast(x_tmp.template data()), + data_desc_, + static_cast(y->template data()), + in_param_desc_, + const_cast(static_cast( + scale_tmp.template data>())), + const_cast(static_cast( + bias_tmp.template data>())), + 0, + nullptr, + nullptr, + epsilon, + static_cast( + saved_mean->template data>()), + static_cast( + saved_variance->template data>()))); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + y->template data(), + in_param_desc_, + scale_tmp.template data>(), + bias_tmp.template data>(), + 0, + nullptr, + nullptr, + epsilon, + saved_mean->template data>(), + saved_variance->template data>())); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {} +#else +PD_REGISTER_KERNEL( + instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} +#endif diff --git a/paddle/phi/kernels/gpu/instance_norm_utils.h b/paddle/phi/kernels/gpu/instance_norm_utils.h new file mode 100644 index 0000000000000..50dfe4ad222c0 --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_utils.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ void repeat_param(const T *input, + T *output, + const int repeat_num, + const int C) { + CUDA_KERNEL_LOOP(i, repeat_num * C) { + int index = i % C; + output[i] = input[index]; + } +} + +template +static __global__ void add_param(const T *input, + T *output, + const int repeat_num, + const int C) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ou_storage; + for (int i = blockIdx.x; i < C; i += gridDim.x) { + T ou = static_cast(0); + for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { + const int index = j * C + i; + ou += static_cast(input[index]); + } + ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); + if (threadIdx.x == 0) { + output[i] = ou; + } + __syncthreads(); + + if (AVG) { + output[i] /= repeat_num; + } + } +} +} // namespace phi diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu index 3a6ff365c11db..66a3f833d276a 100644 --- a/paddle/phi/kernels/gpu/linspace_kernel.cu +++ b/paddle/phi/kernels/gpu/linspace_kernel.cu @@ -18,7 +18,6 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/data_type_transform.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -42,6 +41,47 @@ __global__ void LinspaceSpecialKernel(T start, T* out) { out[0] = static_cast(start); } +template +T GetValue(const Context& ctx, const DenseTensor& x) { + T value = static_cast(0); + if (x.place() != CPUPlace()) { + DenseTensor cpu_x; + Copy(ctx, x, CPUPlace(), true, &cpu_x); + value = cpu_x.data()[0]; + } else { + value = x.data()[0]; + } + return value; +} + +template +T GetValueOfExpectedType(const Context& ctx, const DenseTensor& x) { + switch (x.dtype()) { + case DataType::FLOAT32: + return static_cast(GetValue(ctx, x)); + case DataType::FLOAT64: + return static_cast(GetValue(ctx, x)); + case DataType::INT32: + return static_cast(GetValue(ctx, x)); + case DataType::INT64: + return static_cast(GetValue(ctx, x)); + case DataType::FLOAT16: + return static_cast(GetValue(ctx, x)); + case DataType::BFLOAT16: + return static_cast(GetValue(ctx, x)); + case DataType::BOOL: + return static_cast(GetValue(ctx, x)); + case DataType::INT16: + return static_cast(GetValue(ctx, x)); + case DataType::UINT8: + return static_cast(GetValue(ctx, x)); + default: + PADDLE_THROW(phi::errors::Unimplemented( + "Data type (%s) is not supported when casting data type.", + x.dtype())); + } +} + template void 
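// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: repeat_param and add_param above
// tile a per-channel parameter of length C into an N*C buffer (one copy per
// sample in the "1 x N*C x H x W x D" view used by the instance norm kernels)
// and reduce N*C per-sample gradients back down to C. Host analogues of the
// two kernels (AVG == false case), with hypothetical names:
#include <cstddef>
#include <vector>

inline std::vector<float> RepeatParamRef(const std::vector<float>& per_c, int n) {
  const int c = static_cast<int>(per_c.size());
  std::vector<float> out(static_cast<std::size_t>(n) * c);
  for (int i = 0; i < n * c; ++i) out[i] = per_c[i % c];  // same indexing as the kernel
  return out;
}

inline std::vector<float> AddParamRef(const std::vector<float>& per_nc, int n) {
  const int c = static_cast<int>(per_nc.size()) / n;
  std::vector<float> out(c, 0.f);
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < c; ++i) out[i] += per_nc[j * c + i];
  return out;
}
// ---------------------------------------------------------------------------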
LinspaceKernel(const Context& ctx, const DenseTensor& start, @@ -49,18 +89,9 @@ void LinspaceKernel(const Context& ctx, const DenseTensor& number, DataType dtype, DenseTensor* out) { - auto start_t = phi::funcs::TransDataType(ctx, start, dtype); - auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype); - - DenseTensor n_start; - DenseTensor n_stop; - DenseTensor n_num; - phi::Copy(ctx, start_t, phi::CPUPlace(), false, &n_start); - T start_data = n_start.data()[0]; - phi::Copy(ctx, stop_t, phi::CPUPlace(), false, &n_stop); - T stop_data = n_stop.data()[0]; - phi::Copy(ctx, number, phi::CPUPlace(), false, &n_num); - int64_t num = static_cast(n_num.data()[0]); + T start_value = GetValueOfExpectedType(ctx, start); + T stop_value = GetValueOfExpectedType(ctx, stop); + int64_t num = GetValueOfExpectedType(ctx, number); PADDLE_ENFORCE_GT( num, @@ -72,16 +103,15 @@ void LinspaceKernel(const Context& ctx, out->Resize(phi::make_ddim({num})); T* out_data = ctx.template Alloc(out); - double step = 0; auto stream = ctx.stream(); - int block = 512; - int grid = (num + block - 1) / block; if (num != 1) { - step = (static_cast(stop_data - start_data)) / (num - 1); + int block = 512; + int grid = (num + block - 1) / block; + double step = (static_cast(stop_value - start_value)) / (num - 1); LinspaceKernelInner<<>>( - start_data, stop_data, step, num, out_data); + start_value, stop_value, step, num, out_data); } else { - LinspaceSpecialKernel<<>>(start_data, out_data); + LinspaceSpecialKernel<<<1, 1, 0, stream>>>(start_value, out_data); } } @@ -94,4 +124,8 @@ PD_REGISTER_KERNEL(linspace, float, int32_t, int64_t, - double) {} + double) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index e1f7419fb7a01..ed6cc0c3c2022 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -43,22 +43,19 @@ void ReduceGrad(const GPUContext& dev_ctx, })); } -template class TransformOp> +template void ReduceGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, const std::vector& dims, bool keep_dim, bool reduce_all, - DenseTensor* x_grad) { + DenseTensor* x_grad, + Functor functor) { auto* in_x = &x; auto* d_out = &out_grad; auto* d_x = x_grad; - auto pt_out_dtype = x.dtype(); - // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); std::vector reduce_dims = @@ -79,14 +76,10 @@ void ReduceGradKernel(const Context& dev_ctx, auto pt_d_out = new_d_out; auto pt_d_x = *d_x; - using MPType = typename kps::details::MPTypeTrait::Type; - - phi::ReduceGrad>( - dev_ctx, - &pt_d_out, - &pt_d_x, - pt_out_dtype, - TransformOp(reduce_num)); + std::vector inputs = {&pt_d_out}; + std::vector outputs = {&pt_d_x}; + funcs::BroadcastKernel( + dev_ctx, inputs, &outputs, 0, functor); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu index b81a5e50dfb3e..50564a339ddc0 100644 --- a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu @@ -29,8 +29,23 @@ void ReduceMeanGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { - ReduceGradKernel( - dev_ctx, x, out_grad, dims, keep_dim, reduce_all, x_grad); + int dim_size = x.dims().size(); + std::vector 
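// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: after the rewrite above,
// LinspaceKernel reads start/stop/num on the host via GetValueOfExpectedType
// and launches with step = (stop - start) / (num - 1), special-casing
// num == 1. A host reference of the launch-side arithmetic only (the device
// kernel body is not shown in this hunk, so this is an assumption about the
// fill pattern):
#include <cstddef>
#include <stdexcept>
#include <vector>

template <typename T>
std::vector<T> LinspaceRef(T start, T stop, long long num) {
  if (num <= 0) throw std::invalid_argument("num must be > 0");
  std::vector<T> out(static_cast<std::size_t>(num));
  if (num == 1) {
    out[0] = start;
    return out;
  }
  const double step =
      (static_cast<double>(stop) - static_cast<double>(start)) / (num - 1);
  for (long long i = 0; i < num; ++i) {
    out[static_cast<std::size_t>(i)] =
        static_cast<T>(static_cast<double>(start) + step * i);
  }
  return out;
}
// LinspaceRef<float>(0.f, 1.f, 5) -> {0, 0.25, 0.5, 0.75, 1}
// ---------------------------------------------------------------------------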
reduce_dims = + funcs::details::GetReduceDim(dims, dim_size, reduce_all); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (x.dims())[i]; + } + using MPType = typename kps::details::MPTypeTrait::Type; + ReduceGradKernel>( + dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + x_grad, + kps::DivideFunctor(reduce_num)); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu index 1ad6b8fefe7e4..8b111641cfa40 100644 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu @@ -29,8 +29,40 @@ void ReduceSumGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { - ReduceGradKernel( - dev_ctx, x, out_grad, dims, keep_dim, reduce_all, x_grad); + using MPType = typename kps::details::MPTypeTrait::Type; + auto out_dtype = x.dtype(); + auto* in_x = &x; + auto* d_out = &out_grad; + auto* d_x = x_grad; + + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = + funcs::details::GetReduceDim(dims, dim_size, reduce_all); + + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + // make new tensor + DenseTensor new_d_out(d_out->dtype()); + new_d_out.ShareDataWith(*d_out); + new_d_out.Resize(phi::make_ddim(update_dims)); + + dev_ctx.Alloc(d_x, x.dtype()); + auto pt_out_dtype = x.dtype(); + auto pt_d_out = new_d_out; + auto pt_d_x = *d_x; + std::vector inputs = {&pt_d_out}; + std::vector outputs = {&pt_d_x}; + phi::ReduceGrad>( + dev_ctx, + &pt_d_out, + &pt_d_x, + pt_out_dtype, + kps::IdentityFunctor()); } } // namespace phi @@ -48,4 +80,3 @@ PD_REGISTER_KERNEL(sum_grad, int64_t, phi::dtype::complex, phi::dtype::complex) {} - diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index b29b7ed2d8010..68d8b2e5eef0e 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -99,7 +99,7 @@ class RNNDescriptors { // ------------------- cudnn dropout descriptors --------------------- size_t state_size; - bool is_initialized = dropout_state->IsInitialized(); + bool is_initialized = dropout_state->initialized(); if (!is_test_ && !is_initialized) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index d30b7ec34d43c..f2ffe3c9d4fba 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -175,17 +175,13 @@ void RnnKernel(const Context &dev_ctx, mode)); if (!is_test) { - int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed == 0) { - // If perform `manual_seed` in python and inner seed is not specified - // (equals 0), use global generator generated seed. + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. 
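// Illustrative sketch (not part of the patch): the seeding convention adopted
// here, and reused by the truncated_gaussian_random and uniform_random kernels
// below, is "seed == 0 defers to the globally managed generator (the one the
// Python-side manual seed controls); any non-zero seed comes from the op
// attribute". Minimal standalone version of that decision, assuming only that
// the generator type exposes Random64():
#include <cstdint>

template <typename GeneratorT>
uint64_t ResolveSeed(uint64_t op_seed, GeneratorT* global_gen) {
  // Non-zero: the user fixed the seed explicitly through the op attribute.
  // Zero: draw a fresh seed from the global generator so results still follow
  // the globally seeded RNG state.
  return op_seed != 0 ? op_seed : global_gen->Random64();
}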
+ int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::DefaultCUDAGenerator(device_id); seed = static_cast(gen_cuda->Random64()); - } else if (seed == 0) { - // use random generated seed - std::random_device rd; - seed = rd(); - } // else use `ctx.Attr("seed")` specified seed + } + // else use `ctx.Attr("seed")` specified seed } const T *x_data = x.data(); diff --git a/paddle/phi/kernels/gpu/tril_indices_kernel.cu b/paddle/phi/kernels/gpu/tril_indices_kernel.cu new file mode 100644 index 0000000000000..be83f28451166 --- /dev/null +++ b/paddle/phi/kernels/gpu/tril_indices_kernel.cu @@ -0,0 +1,142 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/tril_indices_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__device__ inline int resolve_root_int(int b, int cX4, int x, int32_t sign) { + int bXb_cX4 = b * b - cX4; + double sr = ::sqrt(static_cast(bXb_cX4)); + T res = ::__double2ll_rd((-b + sign * sr) / 2); + if (bXb_cX4 != static_cast(sr * sr)) { + int llsr = ::__double2ll_rd(sr); + int diff = ::__double2ll_ru( + ::sqrt(::fabs(static_cast(bXb_cX4 - llsr * llsr)))); + auto l = res > diff ? res - diff : 0; + auto r = res + diff + 1; + x <<= 1; + while (l + 1 < r) { + auto m = (l + r) >> 1; + if (sign * (b + m) * m > x) { + r = m; + } else { + l = m; + } + } + res = l; + } + return res; +} + +template +__device__ inline void get_coordinate_in_tril_trapezoid(int f, + int x, + T* row, + T* col) { + f <<= 1; // all statements use 2f, so only calculate it once here. + auto b = f - 1; + auto cX4 = -(x << 3); // 4 * c = 4 * (-2x) = -8x; + *row = resolve_root_int(b, cX4, x, 1); + *col = x - ((f + *row - 1) * *row >> 1); +} + +template +__global__ void tril_indices_kernel(T* out_data, + int row_offset, + int m_first_row, + int col, + int trapezoid_size, + int tril_size) { + int linear_index = blockIdx.x * blockDim.x + threadIdx.x; + + if (linear_index < tril_size) { + T r, c; + if (linear_index < trapezoid_size) { + // the coordinate is within the top trapezoid + get_coordinate_in_tril_trapezoid(m_first_row, linear_index, &r, &c); + } else { + // the coordinate falls in the bottom rectangle + auto surplus = linear_index - trapezoid_size; + // add the height of trapezoid: m_last_row (col) - m_first_row + 1 + r = surplus / col + col - m_first_row + 1; + c = surplus % col; + } + r += row_offset; + + out_data[linear_index] = r; + out_data[linear_index + tril_size] = c; + } +} + +template +void TrilIndicesKernel(const Context& dev_ctx, + int rows, + int cols, + int offset, + DataType dtype, + DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + auto out_dims = out->dims(); + int tril_size = out_dims[1]; + + if (tril_size > 0) { + auto m_first_row = offset > 0 + ? 
std::min(cols, 1 + offset) + : rows + offset > 0; // the number of first row + auto trapezoid_row_offset = + std::max(0, -offset); // index of the first row who has number + auto rectangle_row_offset = trapezoid_row_offset + cols - m_first_row + + 1; // the length of the right-up rest matrix + int rectangle_size = 0; + if (rectangle_row_offset < rows) { + rectangle_size = (rows - rectangle_row_offset) * cols; + } // the rectangle part of lowertriangle matrix + + auto GetBlockGridSize = [&dev_ctx](int size) { + const int block_size = + std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int grid_size = + std::min(max_blocks, (size + block_size - 1) / block_size); + return std::tuple{grid_size, block_size}; + }; + + std::tuple block_grid_size = GetBlockGridSize(tril_size); + + tril_indices_kernel<<(block_grid_size), + std::get<1>(block_grid_size), + 0, + dev_ctx.stream()>>>(out_data, + trapezoid_row_offset, + m_first_row, + cols, + tril_size - rectangle_size, + tril_size); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + tril_indices, GPU, ALL_LAYOUT, phi::TrilIndicesKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index 5b6ae9d09bff2..33ecb4d6eb544 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -90,34 +90,25 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, int seed, DataType dtype, DenseTensor* out) { - auto tensor = out; - - T* data = dev_ctx.template Alloc(tensor); - - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } + T* data = dev_ctx.template Alloc(out); thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); + int64_t size = out->numel(); auto gen_cuda = dev_ctx.GetGenerator(); - - if (gen_cuda->GetIsInitPy() && seed_flag) { + if (seed == 0) { + // use global Generator seed auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform(index_sequence_begin, - index_sequence_begin + size, - thrust::device_ptr(data), - TruncatedNormalOffset(mean, - std, - std::numeric_limits::min(), - seed_offset.first, - gen_offset)); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + thrust::transform( + index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(data), + TruncatedNormalOffset( + mean, std, std::numeric_limits::min(), seed, size * offset)); } else { + // use OP seed thrust::transform( index_sequence_begin, index_sequence_begin + size, diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index a4aea10cfe762..68e61b7328971 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -65,22 +65,15 @@ void UniformRandomRawKernel(const Context& dev_ctx, float diag_val, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); - T* data = dev_ctx.template Alloc(out); - auto size = out->numel(); - bool seed_flag = false; + dev_ctx.template Alloc(out); if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - auto generator = dev_ctx.GetGenerator(); - if (generator->GetIsInitPy() && 
seed_flag) { + // Use global Generator seed using MT = typename kps::details::MPTypeTrait::Type; funcs::uniform_distribution dist; funcs::uniform_real_transform trans(min, max); funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { + // Use OP seed auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); IndexKernel>(dev_ctx, out, func); diff --git a/paddle/phi/kernels/gpu/where_kernel.cu b/paddle/phi/kernels/gpu/where_kernel.cu index a0be388065f4b..441be02b99efa 100644 --- a/paddle/phi/kernels/gpu/where_kernel.cu +++ b/paddle/phi/kernels/gpu/where_kernel.cu @@ -40,8 +40,7 @@ void WhereKernel(const Context& ctx, ctx.template Alloc(out); CondFunctor func; - funcs::BroadcastKernel( - ctx, ins, &outs, -1, func); + funcs::ElementwiseKernel, 1>(ctx, ins, &outs, func); } } // namespace phi diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index 2f35acc095085..04391d2538c89 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -265,7 +265,7 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& d_dout_new, - const DenseTensor& d_ddout, + paddle::optional d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx) { @@ -274,11 +274,11 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(d_dout); } if (d_out_new) { - d_dout->Resize(out.dims()); + d_out_new->Resize(out.dims()); dev_ctx.template Alloc(d_out_new); } if (d_ddx) { - d_dout->Resize(ddx.dims()); + d_ddx->Resize(ddx.dims()); dev_ctx.template Alloc(d_ddx); } funcs::SigmoidTripleGradFunctor functor; @@ -286,7 +286,7 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, &out, &ddx, &dout, - &d_ddout, + d_ddout.get_ptr(), &d_dout_new, d_dout, d_out_new, @@ -335,4 +335,87 @@ void PowGradKernel(const Context& dev_ctx, functor(*place, x_flatten, nullptr, dout_flatten, dx_flatten); } +template +void SqrtDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dx, + const DenseTensor& ddx, + DenseTensor* dout, + DenseTensor* ddout) { + if (dout) { + dout->Resize(out.dims()); + dev_ctx.template Alloc(dout); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + + phi::funcs::SqrtGradGradFunctor functor; + functor(dev_ctx, &out, &dx, &ddx, dout, ddout); +} + +// rsqrt Grad: dx = -0.5 * dy * y * y * y +// rsqrt GradGrad: ddy = -0.5 * ddx * y * y * y, dy = (3 / y) * dx * ddx +template +void RsqrtDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dx, + const DenseTensor& ddx, + DenseTensor* dout, + DenseTensor* ddout) { + if (dout) { + dout->Resize(out.dims()); + dev_ctx.template Alloc(dout); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + + phi::funcs::RsqrtGradGradFunctor functor; + functor(dev_ctx, &out, &dx, &ddx, dout, ddout); +} + +template +void CeluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + float alpha, + DenseTensor* dx, + DenseTensor* ddout) { + if (dx) { + dx->Resize(x.dims()); + dev_ctx.template Alloc(dx); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + } + + phi::funcs::CELUGradGradFunctor functor; + auto attrs = functor.GetAttrs(); + *(attrs[0].second) = alpha; + functor(dev_ctx, &x, &dout, &ddx, dx, ddout); +} + +template +void SquareDoubleGradKernel(const 
Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + DenseTensor* dx, + DenseTensor* ddout) { + if (dx) { + dx->Resize(x.dims()); + dev_ctx.template Alloc(dx); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + } + + phi::funcs::SquareGradGradFunctor functor; + functor(dev_ctx, &x, &dout, &ddx, dx, ddout); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h index 26bee763eca52..b8406b9143103 100644 --- a/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h @@ -32,8 +32,8 @@ void ChannelShuffleGradKernel(const Context& dev_ctx, auto* dx = x_grad; dev_ctx.template Alloc(dx); bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h index c723cd3622af9..7e31e02851591 100644 --- a/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h @@ -31,8 +31,8 @@ void ChannelShuffleKernel(const Context& dev_ctx, auto* in = &x; dev_ctx.template Alloc(out); bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index bd0143379ce15..aceb97a49b1c2 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include "paddle/fluid/platform/profiler.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/impl/einsum_impl.h" #include "paddle/phi/kernels/tile_kernel.h" @@ -55,7 +56,13 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, } t.Resize(make_ddim(resize_dims)); DenseTensor after_tile; - TileKernel(dev_ctx, t, repeat_times, &after_tile); + if (std::all_of(repeat_times.begin(), repeat_times.end(), [](int x) { + return x == 1; + })) { + after_tile = t; + } else { + TileKernel(dev_ctx, t, repeat_times, &after_tile); + } size_t n_ellipsis_idx = op_label.find(".", 0); if (n_ellipsis_idx != std::string::npos) { // may be we need reduce. broadcast_dims is not equal to ellipsis dims. 
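// Illustrative sketch (not part of the patch): the std::all_of guard added in
// PerformTileAndReduction above skips TileKernel when every repeat factor is 1,
// because tiling by 1 along all axes is an identity copy. A self-contained
// version of that predicate, using plain std containers instead of phi tensors:
#include <algorithm>
#include <vector>

inline bool IsIdentityTile(const std::vector<int>& repeat_times) {
  // Tiling by 1 on every axis reproduces the input unchanged, so the kernel
  // launch and the extra allocation for `after_tile` can be skipped.
  return std::all_of(repeat_times.begin(), repeat_times.end(),
                     [](int r) { return r == 1; });
}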
@@ -91,10 +98,11 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, template void EinsumGradKernel(const Context& dev_ctx, const std::vector& x, + const std::vector& inner_cache, const DenseTensor& out_grad, const std::string& equation, std::vector x_grad) { - VLOG(5) << "Start EisumGradKernel:"; + VLOG(5) << "Start EinsumGradKernel:"; LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); std::vector label2perms(x.size(), LabelMap(-1)); @@ -148,20 +156,47 @@ void EinsumGradKernel(const Context& dev_ctx, right = splits[1].substr(1); auto equation_for_A = - right + "," + ops[1] + "->" + gather_labels_except_reduction(ops[0]); + ops[1] + "," + right + "->" + gather_labels_except_reduction(ops[0]); auto equation_for_B = right + "," + ops[0] + "->" + gather_labels_except_reduction(ops[1]); auto operands_for_A = std::vector(); auto operands_for_B = std::vector(); DenseTensor dA, dB; - operands_for_A.push_back(&out_grad); + // dA = einsum(B, dC) operands_for_A.push_back(x[1]); + operands_for_A.push_back(&out_grad); + // dB = einsum(dC, A) operands_for_B.push_back(&out_grad); operands_for_B.push_back(x[0]); DenseTensor before_tile; - EinsumKernel(dev_ctx, operands_for_A, equation_for_A, &dA); - EinsumKernel(dev_ctx, operands_for_B, equation_for_B, &dB); + + std::vector cache(3); // set empty; TA, TB, TdC + if (inner_cache.size() > + 0) { // for compatibility, we can load and run v2.3 EinsumOp. + cache[0].ShareBufferWith(*(inner_cache[0])); + cache[1].ShareBufferWith(*(inner_cache[1])); + } + + EinsumKernelImpl(dev_ctx, + all_labels, + operands_for_A, + equation_for_A, + &dA, + {&cache[1], &cache[2]}, + false); + + EinsumKernelImpl(dev_ctx, + all_labels, + operands_for_B, + equation_for_B, + &dB, + {&cache[2], &cache[0]}, + false); + + // release the cache tensor dTC to save memory right now. they are useless + // now. + cache.clear(); *(x_grad[0]) = PerformTileAndReduction(dev_ctx, labeltype, labelshape, diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 73940a45cbde2..bfbd6e0c51cfc 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -13,12 +13,15 @@ // limitations under the License. #pragma once +#include #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/utils/string/string_helper.h" +DECLARE_bool(einsum_opt); + namespace phi { // check the validation of the Einsum equation. 
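// Illustrative note (not part of the patch): the operand reordering in
// EinsumGradKernel above follows the usual two-operand einsum gradient rule:
// each input's gradient is an einsum of the *other* input with the output
// gradient, keeping only that input's non-reduction labels on the output side.
// Worked example for C = einsum("ij,jk->ik", A, B):
//   dA = einsum("jk,ik->ij", B, dC)   // equation_for_A = ops[1] + "," + right + "->" + labels(ops[0])
//   dB = einsum("ik,ij->jk", dC, A)   // equation_for_B = right + "," + ops[0] + "->" + labels(ops[1])
// which reduces to dA = dC * B^T and dB = A^T * dC in the plain matmul case.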
@@ -55,7 +58,8 @@ inline static void ValidationCheck(const std::string& equation) { enum LabelType { ALL_TYPE = 0, Batch = 1, // ABO - Free, // AO, BO + AO, // AO -- free label + BO, // BO -- free label Contraction, // AB Reduction, // A, B }; @@ -125,18 +129,31 @@ inline std::vector union_labels(const std::vector& a, return res; } +// Apply transforms to all_labels and get another all_labels +inline std::vector TransformLabelsOrder( + const std::vector& all_labels, + const LabelMap& type, + std::vector new_order) { + std::vector ret; + for (auto cnt_type : new_order) { + std::vector tmp; + for (int c : all_labels) { + if (type[c] == cnt_type) tmp.push_back(c); + } + ret.insert(ret.end(), tmp.begin(), tmp.end()); + } + return ret; +} + inline static void GlobalInfo(const std::vector& op_labels, const std::string& right, LabelMap* label2type, std::vector* sorted_labels) { - // sorted_labels: ['.', , ] - VLOG(5) << "GlobalInfo: " - << paddle::string::join_strings(*sorted_labels, ","); std::vector all; LabelMap counter(0); for (auto& ch : right) { // char int c = ch; - (*label2type)[c] = LabelType::Free; + (*label2type)[c] = LabelType::BO; } for (auto& op : op_labels) { @@ -146,39 +163,45 @@ inline static void GlobalInfo(const std::vector& op_labels, all.push_back(ch); } counter[c] += 1; - if ((*label2type)[c] != LabelType::Free && counter[c] == 2) + if ((*label2type)[c] != LabelType::BO && counter[c] == 2) (*label2type)[c] = LabelType::Contraction; else if (counter[c] == 2) (*label2type)[c] = LabelType::Batch; } } + + // BO is represent Free, so we need find the AO. + for (int c : op_labels[0]) { + if ((*label2type)[c] == LabelType::BO) (*label2type)[c] = LabelType::AO; + } + (*label2type)['.'] = LabelType::Batch; - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Batch) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Free) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Contraction) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [&sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Reduction) - sorted_labels->push_back(static_cast(c)); - }); - VLOG(5) << "GlobalInfo: sorted_labels before: " - << paddle::string::join_strings(*sorted_labels, ","); + + if (sorted_labels->size()) { + std::set exist(all.begin(), all.end()); + all.clear(); + std::for_each( + sorted_labels->begin(), sorted_labels->end(), [&exist, &all](char c) { + if (exist.count(c)) all.push_back(c); + }); + } + + *sorted_labels = TransformLabelsOrder(all, + *label2type, + {LabelType::Batch, + LabelType::AO, + LabelType::BO, + LabelType::Contraction, + LabelType::Reduction}); + if (counter[static_cast('.')] > 0) { std::vector tmp; tmp.push_back('.'); // push '.' 
in the front *sorted_labels = union_labels(tmp, *sorted_labels); - VLOG(5) << "GlobalInfo: sorted_labels after: " - << paddle::string::join_strings(*sorted_labels, ","); } + VLOG(5) << "GlobalInfo: sorted_labels after: " + << paddle::string::join_strings(*sorted_labels, ","); } inline static void InferLabelShape(const std::vector& op_labels, @@ -289,17 +312,20 @@ inline static void ParseEinsumEquation( *right = results[1].substr(1); ReplaceEllipsis(*right); auto op_labels = paddle::string::split_string(left, ","); + // split_string("i,") -> ["i"], we expect 2 op_labels. + if (left[left.size() - 1] == ',') op_labels.push_back(""); std::for_each(op_labels.begin(), op_labels.end(), ReplaceEllipsis); GlobalInfo(op_labels, *right, labeltype, all_labels); InferLabelShape(op_labels, inputs, labelshape, ellipsis_dims, broadcast_dims); - VLOG(5) << "Einsum Infershape: right:" << right; - VLOG(5) << "Einsum Infershape: op_labels:" - << paddle::string::join_strings(op_labels, "\n"); + VLOG(5) << "Einsum Infershape: right:" << *right; + VLOG(5) << "Einsum Infershape: left :" + << paddle::string::join_strings(op_labels, '\n'); InferOutputDims(*right, *broadcast_dims, *labelshape, output_dims); for (size_t i = 0; i < inputs.size(); ++i) { InferLabelPerm( op_labels[i], ellipsis_dims->at(i).size(), &((*label2perms)[i])); } + VLOG(5) << "Einsum Infershape: end"; } template @@ -327,10 +353,12 @@ std::vector GetShapeByType(const std::vector& all_labels, const LabelMap& perm, const LabelMap& label2shape, const std::vector& ellipsis, - LabelType filter) { + std::set filter) { std::vector res; for (T c : all_labels) { - if ((filter == LabelType::ALL_TYPE || type[c] == filter) && perm[c] != -1) { + if ((filter.count(LabelType::ALL_TYPE) || + filter.count(LabelType(type[c]))) && + perm[c] != -1) { if (c == '.') res.insert(res.end(), ellipsis.begin(), ellipsis.end()); else @@ -390,7 +418,9 @@ DenseTensor PerformContraction( const LabelMap& label2type, const LabelMap& label2shape, const std::vector>& ellipsis_dims, - const std::vector& broadcast_dims) { + const std::vector& broadcast_dims, + std::vector cache, + bool use_cache) { // Get All the Batches, so perm is auto all_valid = LabelMap(1); auto recover_dim = GetShapeByType(all_labels, @@ -398,36 +428,77 @@ DenseTensor PerformContraction( all_valid, label2shape, broadcast_dims, - LabelType::Batch); + {LabelType::Batch}); auto preprocess = [&](const DenseTensor& t, const LabelMap& perm, - const std::vector& ellipsis) -> DenseTensor { - auto frees = GetShapeByType( - all_labels, label2type, perm, label2shape, ellipsis, LabelType::Free); + const std::vector& ellipsis, + int operand_idx) -> DenseTensor { + // reshape + auto frees = GetShapeByType(all_labels, + label2type, + perm, + label2shape, + ellipsis, + {LabelType::AO, LabelType::BO}); auto conts = GetShapeByType(all_labels, label2type, perm, label2shape, ellipsis, - LabelType::Contraction); - auto trans_t = PerformTranspose( - dev_ctx, t, perm, all_labels, ellipsis, label2type); - auto mul_dims = GetShapeByType( - all_labels, label2type, perm, label2shape, ellipsis, LabelType::Batch); + {LabelType::Contraction}); + std::vector reordered_all_labels = all_labels; + if (operand_idx == 1) { + reordered_all_labels = TransformLabelsOrder(all_labels, + label2type, + {LabelType::Batch, + LabelType::Contraction, + LabelType::AO, + LabelType::BO, + LabelType::Reduction}); + } + // reduction + DenseTensor trans_t; + if (FLAGS_einsum_opt && use_cache && cache[operand_idx] != nullptr && + 
cache[operand_idx]->IsInitialized()) { + trans_t.ShareBufferWith(*(cache[operand_idx])); + VLOG(5) << "Cache Used!"; + } else { + auto reduct_t = PerformReduction( + dev_ctx, t, perm, all_labels, ellipsis, label2type); + trans_t = PerformTranspose( + dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); + if (FLAGS_einsum_opt && cache[operand_idx] != nullptr) + cache[operand_idx]->ShareBufferWith(trans_t); + } + auto mul_dims = GetShapeByType(all_labels, + label2type, + perm, + label2shape, + ellipsis, + {LabelType::Batch}); recover_dim.insert(recover_dim.end(), frees.begin(), frees.end()); - mul_dims.push_back( - std::accumulate(frees.begin(), frees.end(), 1, std::multiplies())); - mul_dims.push_back( - std::accumulate(conts.begin(), conts.end(), 1, std::multiplies())); + if (operand_idx == 0) { + mul_dims.push_back(std::accumulate( + frees.begin(), frees.end(), 1, std::multiplies())); + mul_dims.push_back(std::accumulate( + conts.begin(), conts.end(), 1, std::multiplies())); + } else { + mul_dims.push_back(std::accumulate( + conts.begin(), conts.end(), 1, std::multiplies())); + mul_dims.push_back(std::accumulate( + frees.begin(), frees.end(), 1, std::multiplies())); + } VLOG(5) << "PerformContraction: mul_dims: " << paddle::string::join_strings(mul_dims, ","); trans_t.Resize(make_ddim(mul_dims)); return trans_t; }; - auto trans_a = preprocess(A, label2perm[0], ellipsis_dims[0]); - auto trans_b = preprocess(B, label2perm[1], ellipsis_dims[1]); + + // Reduction, Reshape and Matmul + auto trans_a = preprocess(A, label2perm[0], ellipsis_dims[0], 0); + auto trans_b = preprocess(B, label2perm[1], ellipsis_dims[1], 1); auto after_contraction = - Matmul(dev_ctx, trans_a, trans_b, false, true); + Matmul(dev_ctx, trans_a, trans_b, false, false); VLOG(5) << "PerformContraction: recover_dim: " << paddle::string::join_strings(recover_dim, ","); after_contraction.Resize(make_ddim(recover_dim)); @@ -458,17 +529,23 @@ void TransposeToOutput(const Context& dev_ctx, axis.push_back(it - all_labels.begin() + offset); } } - if (is_no_need_transpose(axis)) return output->ShareBufferWith(to_trans); + if (is_no_need_transpose(axis)) { + output->ShareBufferWith(to_trans); + return; + } VLOG(5) << "call TransposeToOutput: with axis: " << paddle::string::join_strings(axis, ","); - return TransposeKernel(dev_ctx, to_trans, axis, output); + TransposeKernel(dev_ctx, to_trans, axis, output); } template -void EinsumKernel(const Context& dev_ctx, - const std::vector& inputs, - const std::string& equation, - DenseTensor* out) { +void EinsumKernelImpl(const Context& dev_ctx, + const std::vector& forward_all_labels, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache, + bool is_forward = true) { ValidationCheck(equation); // collect the following informations to prepare einsum. 
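// Illustrative sketch (not part of the patch): PerformContraction above now
// reshapes operand 0 to [batch..., prod(free), prod(contraction)] and operand 1
// to [batch..., prod(contraction), prod(free)] (note the reordered labels for
// operand 1), so the contraction is a plain batched Matmul with
// trans_x = trans_y = false instead of the previous trans_y = true. The helper
// below mirrors only the mul_dims bookkeeping, with shapes as plain int
// vectors; the name FlattenForMatmul is illustrative, not a Paddle API.
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

inline std::vector<int> FlattenForMatmul(std::vector<int> batch_dims,
                                         const std::vector<int>& frees,
                                         const std::vector<int>& conts,
                                         int operand_idx) {
  auto prod = [](const std::vector<int>& v) {
    return std::accumulate(v.begin(), v.end(), 1, std::multiplies<int>());
  };
  std::vector<int> mul_dims = std::move(batch_dims);
  if (operand_idx == 0) {
    mul_dims.push_back(prod(frees));  // [batch..., F_A, C]
    mul_dims.push_back(prod(conts));
  } else {
    mul_dims.push_back(prod(conts));  // [batch..., C, F_B]
    mul_dims.push_back(prod(frees));
  }
  return mul_dims;
}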
LabelMap labelshape(0); @@ -484,6 +561,9 @@ void EinsumKernel(const Context& dev_ctx, input_dims.push_back(i->dims()); } std::string right; + if (!is_forward) { + all_labels = forward_all_labels; + } ParseEinsumEquation(equation, input_dims, &labelshape, @@ -498,22 +578,18 @@ void EinsumKernel(const Context& dev_ctx, if (inputs.size() == 2) { auto& A = inputs[0]; auto& B = inputs[1]; - // Reduce Procedure - auto reduce_A = PerformReduction( - dev_ctx, *A, label2perms[0], all_labels, ellipsis_dims[0], labeltype); - auto reduce_B = PerformReduction( - dev_ctx, *B, label2perms[1], all_labels, ellipsis_dims[1], labeltype); - // Contract Procedure - dev_ctx.template Alloc(out); + // Reduction and Contract Procedure auto after_contraction = PerformContraction(dev_ctx, - reduce_A, - reduce_B, + *A, + *B, label2perms, all_labels, labeltype, labelshape, ellipsis_dims, - broadcast_dims); + broadcast_dims, + cache, + !is_forward); TransposeToOutput(dev_ctx, after_contraction, right, @@ -545,4 +621,37 @@ void EinsumKernel(const Context& dev_ctx, } } +template +void EinsumKernelRaw(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache) { + std::vector tmp; + // for the sake of compatibility, we may load and run v2.3 EinsumOp. Output + // may have nullptr and the cache.size() is not equal to inputs.size(). refer + // to BuildPhiKernelContext for details. + int diff = inputs.size() - cache.size(); + for (int i = 0; i < diff; ++i) { + cache.push_back(nullptr); + } + EinsumKernelImpl( + dev_ctx, tmp, inputs, equation, out, cache, /*forward=*/true); +} + +template +void EinsumKernel(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out) { + std::vector place_holder; + std::vector cache_tensor( + inputs.size()); // set empty; TA, TB, TdC + for (size_t i = 0; i < inputs.size(); ++i) { + cache_tensor[i] = nullptr; + } + EinsumKernelImpl( + dev_ctx, place_holder, inputs, equation, out, cache_tensor, true); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index fa1f15672b903..3c06b238d145c 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -513,6 +513,20 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, funcs::InverseMultiplyFunctor>( dev_ctx, dout, ddy_safe, dx, axis); } + } else { + if (dx && dy) { + phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, + ddx_safe, + ddy_safe, + dout, + dout, + axis, + dx, + dy, + MulGradDX(), + MulGradDY()); + } } } @@ -683,6 +697,43 @@ struct MinGradDy { } }; +template +struct HeavisideGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(0); + } +}; + +template +struct HeavisideGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(x == static_cast(0)); + } +}; + +template +void ElementwiseHeavisideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs:: + ElemwiseGradCompute, HeavisideGradDy>( + dev_ctx, + x, + y, + dout, + dout, + axis, + dx, + dy, + HeavisideGradDx(), + HeavisideGradDy()); +} + template struct PowGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { diff --git 
a/paddle/phi/kernels/impl/mv_kernel_impl.h b/paddle/phi/kernels/impl/mv_kernel_impl.h index 1754ea323ceb9..4baee25a0993a 100644 --- a/paddle/phi/kernels/impl/mv_kernel_impl.h +++ b/paddle/phi/kernels/impl/mv_kernel_impl.h @@ -23,7 +23,7 @@ void MvKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& vec, DenseTensor* out) { - auto dim_x = x.dims(); + const auto& dim_x = x.dims(); // get data ptr const T* x_data = x.data(); diff --git a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h index db19a04337932..f71f6cd990aa1 100644 --- a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h @@ -32,8 +32,8 @@ void PixelShuffleGradKernel(const Context& ctx, ctx.template Alloc(dx); int factor = upscale_factor; bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h index 2303db4ea57d6..c5e41b4902951 100644 --- a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h @@ -31,8 +31,8 @@ void PixelShuffleKernel(const Context& ctx, ctx.template Alloc(out); int factor = upscale_factor; bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h index cb02539f2e890..399c6a56727e2 100644 --- a/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h @@ -33,8 +33,8 @@ void PixelUnshuffleGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(dx); int factor = downscale_factor; bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h index 0a140b270ba1b..7ffce62dacf65 100644 --- a/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h @@ -32,8 +32,8 @@ void PixelUnshuffleKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); int factor = downscale_factor; bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h index 82bd9fba2a66d..e3cd6f5828d04 100644 --- a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h +++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h @@ -158,7 +158,7 @@ static void VisitDataType(DataType type, Visitor visitor) { visitor.template apply(); } else { PADDLE_THROW(errors::InvalidArgument( - "The recieved values data type %s can not meet input requirements. " + "The received values data type %s can not meet input requirements. 
" "Because the given values data type of searchsorted operators must be " "float32, float64, int32 or int64. Please input appropriate " "sorted_sequence again! ", diff --git a/paddle/phi/kernels/impl/squeeze_kernel_impl.h b/paddle/phi/kernels/impl/squeeze_kernel_impl.h index d2b40824a91c9..bb1627d40925f 100644 --- a/paddle/phi/kernels/impl/squeeze_kernel_impl.h +++ b/paddle/phi/kernels/impl/squeeze_kernel_impl.h @@ -22,8 +22,8 @@ template void SqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axes, - DenseTensor* xshape, - DenseTensor* out) { + DenseTensor* out, + DenseTensor* xshape) { auto x_dims = x.dims(); auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, true); diff --git a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h index 5556654ee7c0d..0724cffdd4448 100644 --- a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h @@ -35,7 +35,7 @@ void UnfoldGradKernel(const Context& ctx, if (!x_grad) return; - auto x_dims = x_grad->dims(); + const auto& x_dims = x_grad->dims(); const int batch_size = static_cast(x_dims[0]); int out_height = phi::funcs::CalcOutputSize(x_dims[2], diff --git a/paddle/phi/kernels/impl/unfold_kernel_impl.h b/paddle/phi/kernels/impl/unfold_kernel_impl.h index e914f6cacbde9..4526d1c3dcd7d 100644 --- a/paddle/phi/kernels/impl/unfold_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_kernel_impl.h @@ -36,7 +36,7 @@ void UnfoldKernel(const Context& ctx, paddle::operators::math:: Im2ColFunctor im2col; - auto x_dims = x.dims(); + const auto& x_dims = x.dims(); int out_height = phi::funcs::CalcOutputSize(x_dims[2], kernel_sizes[0], diff --git a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h index 58efff16a5a5a..02110d631fb4d 100644 --- a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h +++ b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h @@ -22,8 +22,8 @@ template void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const IntArray& axes, - DenseTensor* xshape, - DenseTensor* out) { + DenseTensor* out, + DenseTensor* xshape) { auto x_dims = x.dims(); auto out_dims = out->dims(); if (axes.FromTensor()) { diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h new file mode 100644 index 0000000000000..7924c767ab61e --- /dev/null +++ b/paddle/phi/kernels/instance_norm_grad_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void InstanceNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y_grad, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + float epsilon, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +template +void InstanceNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + DenseTensor* dx, + DenseTensor* dscale, + DenseTensor* ddy); + +} // namespace phi diff --git a/paddle/phi/kernels/instance_norm_kernel.h b/paddle/phi/kernels/instance_norm_kernel.h new file mode 100644 index 0000000000000..8c50025a73ce0 --- /dev/null +++ b/paddle/phi/kernels/instance_norm_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void InstanceNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + DenseTensor* y, + DenseTensor* saved_mean, + DenseTensor* saved_variance); + +} // namespace phi diff --git a/paddle/phi/kernels/kps/bitwise_kernel.cu b/paddle/phi/kernels/kps/bitwise_kernel.cu index 44859785f2fb8..285b18927af80 100644 --- a/paddle/phi/kernels/kps/bitwise_kernel.cu +++ b/paddle/phi/kernels/kps/bitwise_kernel.cu @@ -51,9 +51,9 @@ void BitwiseNotKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); std::vector ins = {&x}; std::vector outs = {out}; - funcs::BitwiseNotFunctor func; - funcs::BroadcastKernel( - dev_ctx, ins, &outs, -1, func); + funcs::BitwiseNotFunctor unary_func; + funcs::ElementwiseKernel>( + dev_ctx, ins, &outs, unary_func); } } // namespace phi diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu index 8f7d45771d9d0..98e39ada32b8b 100644 --- a/paddle/phi/kernels/kps/elementwise_add_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu @@ -33,6 +33,14 @@ void AddKernel(const Context& dev_ctx, AddRawKernel(dev_ctx, x, y, axis, out); } +template +void GradAddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + AddKernel(dev_ctx, x, y, out); +} + } // namespace phi #ifdef PADDLE_WITH_XPU_KP @@ -71,4 +79,18 @@ PD_REGISTER_KERNEL(add, phi::dtype::bfloat16, complex64, complex128) {} + +PD_REGISTER_KERNEL(grad_add, + KPS, + ALL_LAYOUT, + phi::GradAddKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} #endif diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu 
index 821fda52ab102..d387096a70b75 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -54,6 +54,8 @@ void FloorDivideKernel(const Context& dev_ctx, int axis = -1; FloorDivideRawKernel(dev_ctx, x, y, axis, out); } +// Create the definition of Heaviside +DEFINE_CUDA_ELEMENTWISE_OP(ElementwiseHeaviside) // Create the definition of Pow DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) template @@ -130,6 +132,14 @@ PD_REGISTER_KERNEL(floor_divide_raw, phi::FloorDivideRawKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside_raw, + KPS, + ALL_LAYOUT, + phi::ElementwiseHeavisideRawKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow_raw, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index e02f4450a8bab..fabc6c0d13e7c 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -271,6 +271,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, } } +template +__device__ __forceinline__ void ElementwiseBinary( + OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) { +#pragma unroll + for (int idx = 0; idx < NX * NY; ++idx) { + out[idx] = static_cast(compute(in1[idx], in2[idx])); + } +} + /** * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h index 4d65dd6dd5d87..eb45def836edc 100644 --- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h @@ -17,6 +17,7 @@ #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" +#include "xpu/kernel/simd_header.h" namespace phi { namespace kps { @@ -158,6 +159,19 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, } } +template +__device__ __forceinline__ void ElementwiseBinary( + OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) { + for (int idx = 0; idx < read_lens; ++idx) { + out[idx] = static_cast(compute(in1[idx], in2[idx])); + } +} + /** * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. @@ -329,14 +343,12 @@ __device__ __forceinline__ void Reduce(T* out, ReduceFunctor reducer, bool reduce_last_dim) { if (Mode == details::kGlobalMode) { + if (reduce_last_dim) { #pragma unroll - for (int i = 0; i < NY; ++i) { -#pragma unroll - for (int j = 0; j < NX; ++j) { - out[i] = reducer(out[i], in[i * NX + j]); + for (int i = 0; i < NY * NX; i++) { // reduce along blockDim.x + details::BlockXReduce(&out[i], reducer); } } - details::BlockXReduce(out, reducer); } else { // else kLocalMode #pragma unroll for (int i = 0; i < NY; ++i) { diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 993349f2d9e14..8b0c42c9d19b1 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -82,10 +82,10 @@ struct FastDivMod { * index of the output data. if input or output shape is [dim0, dim1] then dims * must be [dim1, dim0]. 
*/ -template struct BroadcastConfig { - FastDivMod divmoders[kDims]; + FastDivMod divmoders[phi::DDim::kMaxRank]; uint32_t strides[phi::DDim::kMaxRank]; + int kDims; HOSTDEVICE BroadcastConfig() {} HOSTDEVICE BroadcastConfig(const std::vector& out_dims, @@ -109,7 +109,7 @@ struct BroadcastConfig { std::multiplies()) : strides_in[i]; } - + kDims = dim_size; memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); memcpy(divmoders, divmoders_in.data(), kDims * sizeof(FastDivMod)); } @@ -246,6 +246,14 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { } } +template +__device__ __forceinline__ void Init(T* dst, T init_data, int read_lens) { +#pragma unroll + for (int i = 0; i < NX; i++) { + dst[i] = init_data; + } +} + /** * The difference from the above function is that * it supports different data types of inputs. @@ -311,6 +319,38 @@ __device__ __forceinline__ void ReadData(T* dst, } } +template +__device__ __forceinline__ void ReadData(T* dst, + const T* __restrict__ src, + int num, + int read_lens) { + if (IsBoundary) { // blockDim.x * NX > num + int thread_offset = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (idx + thread_offset < num) { + dst[idx] = src[thread_offset + idx]; + } + } + } else { // blockDim,x * NX < num + constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; + constexpr int kVectorsPerThread = NX / kVectorSize; + int thread_offset = threadIdx.x * kVectorsPerThread; + + using VecType = details::VectorType; + const VecType* vec_input = reinterpret_cast(src); + VecType vec_temp[kVectorsPerThread]; + +#pragma unroll + for (int i = 0; i < kVectorsPerThread; ++i) { + vec_temp[i] = vec_input[thread_offset + i]; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + dst[idx] = *(reinterpret_cast(vec_temp) + idx); + } + } + } +} /** * @brief Read 1D data from global memory to register. The difference * from the above function is that it supports different data types of inputs. @@ -396,17 +436,12 @@ __device__ __forceinline__ void ReadData(ArgsT* dst, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template +template __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, + const details::BroadcastConfig& config, int total_num_output, int stride_nx, int stride_ny) { @@ -425,7 +460,8 @@ __device__ __forceinline__ void ReadDataBc( } } #pragma unroll - for (int i = 0; i < Rank; ++i) { + for (int i = 0; i < phi::DDim::kMaxRank; ++i) { + if (i >= config.kDims) break; auto fast_divmoder = config.divmoders[i].Divmod(index_output); index_output = fast_divmoder.val[0]; index_src += fast_divmoder.val[1] * config.strides[i]; @@ -576,6 +612,36 @@ __device__ __forceinline__ void WriteData(T* dst, } } +template +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, + int num, + int read_lens) { + if (IsBoundary) { + int thread_offset = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if ((thread_offset + idx) < num) { + dst[thread_offset + idx] = src[idx]; + } + } + } else { + // Vector type + constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 
2 : 1; + constexpr int kVectorsPerThread = NX / kVectorSize; + + int thread_offset = threadIdx.x * kVectorsPerThread; + using VecType = details::VectorType; + VecType* vec_dst = reinterpret_cast(dst); + VecType vec_temp[kVectorsPerThread]; +#pragma unroll + for (int idx = 0; idx < kVectorsPerThread; ++idx) { + vec_temp[idx] = *(reinterpret_cast(src) + idx); + vec_dst[thread_offset + idx] = vec_temp[idx]; + } + } +} + /** * @brief Write 2D data from register to global memory according to Tx type, and * store it as Ty type. @@ -715,18 +781,14 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) { * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. */ -template +template __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, - int total_num_output) { + const details::BroadcastConfig& config, + int total_num_output, + int read_lens = NX) { uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t index_src = 0; @@ -740,7 +802,8 @@ __device__ __forceinline__ void ReadDataBc( } } #pragma unroll - for (int i = 0; i < Rank; ++i) { + for (int i = 0; i < phi::DDim::kMaxRank; ++i) { + if (i >= config.kDims) break; auto fast_divmoder = config.divmoders[i].Divmod(index_output); index_output = fast_divmoder.val[0]; index_src += fast_divmoder.val[1] * config.strides[i]; diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index a18fc7cbb3119..1e5dfe2a542b0 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -21,6 +21,39 @@ namespace phi { namespace kps { namespace details { +enum class OptType { // Optimize type of calc after input shape compressed + CanNotOptimize = -1, // can not optimize, broadcast first + N_1, // just like {1} op {100} or {100} op {1} + MN_N, // just like {100} op {3, 100} or {3, 100} op {100} + MN_M, // just like {3} op {3, 100} or {3, 100} op {3} + MNK_1N1, // just like {3} op {2, 3, 100} or {2, 3, 100} op {3} + MNK_M1K, // just like {2, 1, 100} op {2, 3, 100} or {2, 3, 100} op {2, 1, + // 100} +}; + +// Rules to determine whether dimensions can be merged +// rule 0 - xshape[idx] == yshape[idx] +// rule 1 - xshape[idx] == 1 && yshape[idx] != 1 +// rule 2 - xshape[idx] != 1 && yshape[idx] == 1 +static int judge_case(int a, int b) { + if (a == b) { + return 0; + } else if (a == 1 && b != 1) { + return 1; + } else if (a != 1 && b == 1) { + return 2; + } + return -1; +} + +static bool case_is_same(int case_front, int case_back) { + if (case_front == case_back) { + return true; + } else { + return false; + } +} + template struct alignas(sizeof(T) * VecSize) VectorType { T val[VecSize]; @@ -32,16 +65,25 @@ struct alignas(sizeof(T) * VecSize) VectorType { * must be [dim1, dim0]. 
*/ #pragma pack(4) -template struct BroadcastConfig { int strides_in[phi::DDim::kMaxRank]; int strides_out[phi::DDim::kMaxRank]; int in_dim[phi::DDim::kMaxRank]; - + int dim_after_cmp[phi::DDim::kMaxRank]; + int y_dim_after_cmp[phi::DDim::kMaxRank]; + int dim_size_after_cmp = 0; + int cmp_res = 0; + OptType cmp_type = OptType::CanNotOptimize; + int m = 1; + int n = 1; + int k = 1; + int buf_len = 0; + int kDims; HOSTDEVICE BroadcastConfig() {} HOSTDEVICE BroadcastConfig(const std::vector& out_dims, const std::vector& in_dims, + const std::vector& y_in_dims, int dim_size) { std::vector strides_in_tmp; std::vector strides_out_tmp; @@ -57,22 +99,193 @@ struct BroadcastConfig { for (int i = 0; i < dim_size; i++) { dim_tmp[i] = in_dims[i]; } - + kDims = dim_size; memcpy(strides_in, strides_in_tmp.data(), kDims * sizeof(int)); memcpy(strides_out, strides_out_tmp.data(), kDims * sizeof(int)); memcpy(in_dim, dim_tmp.data(), kDims * sizeof(int)); + + cmp_res = get_mnk_for_broadcast_ops(in_dims, y_in_dims); + get_opt_type(); + buf_len = get_buf_len(); + } + + int get_buf_len() { + if (cmp_type == OptType::CanNotOptimize) { + return 256; + } + int max_buf_len = 512; + int buf_len = m / 16 * 16; + if (buf_len == 0) { + buf_len = m; + } + return std::min(max_buf_len, buf_len); } __device__ inline int operator()(int index_output) const { int index_src = 0; -#pragma unroll - for (int i = kDims - 1; i >= 0; --i) { - int tmp_index = (index_output / strides_out[i]); - index_output = index_output - tmp_index * strides_out[i]; - index_src += (tmp_index % in_dim[i]) * strides_in[i]; + + switch (cmp_type) { + int div, mod, tmp_index; + case OptType::MNK_M1K: + div = index_output / (m * n); + mod = index_output % (m * n) % m; + index_src = div * m + mod; + break; + case OptType::MNK_1N1: + // index_src = index_output / m % n; + index_src = index_output % (m * n) / m; + break; + case OptType::N_1: + index_src = 0; + break; + case OptType::MN_N: + index_src = index_output / m; + break; + case OptType::MN_M: + index_src = index_output % m; + break; + case OptType::CanNotOptimize: + for (int i = kDims - 1; i >= 0; --i) { + tmp_index = (index_output / strides_out[i]); + index_output = index_output - tmp_index * strides_out[i]; + index_src += (tmp_index % in_dim[i]) * strides_in[i]; + } + break; } return index_src; } + + void get_opt_type() { + if (dim_size_after_cmp == 1) { + if (dim_after_cmp[0] == 1 && y_dim_after_cmp[0] != 1) { // {1} op {n} + n = y_dim_after_cmp[0]; + cmp_type = OptType::N_1; + } else if (dim_after_cmp[0] != 1 && + y_dim_after_cmp[0] == 1) { // {n} op {1} + n = dim_after_cmp[0]; + cmp_type = OptType::N_1; + } else { + cmp_type = OptType::CanNotOptimize; // xshape == yshape + } + } + if (dim_size_after_cmp == 2) { + if (dim_after_cmp[0] == 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1) { // {n} op {m, n} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + cmp_type = OptType::MN_N; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] == 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1) { // {m} op {m, n} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + cmp_type = OptType::MN_M; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] == 1 && + y_dim_after_cmp[1] != 1) { // {m, n} op {n} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + cmp_type = OptType::MN_N; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] == 1) { // {m, n} op {m} + m = 
dim_after_cmp[0]; + n = dim_after_cmp[1]; + cmp_type = OptType::MN_M; + } else { + cmp_type = OptType::CanNotOptimize; + } + } + if (dim_size_after_cmp == 3) { + if (dim_after_cmp[0] == 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] == 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] != 1) { // {1, n, 1} op {m, n, k} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + k = y_dim_after_cmp[2]; + cmp_type = OptType::MNK_1N1; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] == 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] == 1) { // {m, n, k} op {1, n, 1} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + k = dim_after_cmp[2]; + cmp_type = OptType::MNK_1N1; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] == 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] != 1) { // {m, 1, k} op {m, n, k} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + k = y_dim_after_cmp[2]; + cmp_type = OptType::MNK_M1K; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] == 1 && + y_dim_after_cmp[2] != 1) { // {m, n, k} op {m, 1, k} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + k = dim_after_cmp[2]; + cmp_type = OptType::MNK_M1K; + } else { + cmp_type = OptType::CanNotOptimize; + } + } + } + + int get_mnk_for_broadcast_ops(const std::vector& xshape, + const std::vector& yshape) { + int idx = 0; + int cmp_x = 0; + int cmp_y = 0; + bool is_same = false; + + std::vector xshape_after_remove_ones = xshape; + std::vector yshape_after_remove_ones = yshape; + // first step: remove excess ones + std::vector::iterator x_iter = xshape_after_remove_ones.begin(); + std::vector::iterator y_iter = yshape_after_remove_ones.begin(); + for (; x_iter != xshape_after_remove_ones.end();) { + if (*x_iter == 1 && *y_iter == 1) { + x_iter = xshape_after_remove_ones.erase(x_iter); + y_iter = yshape_after_remove_ones.erase(y_iter); + } else { + x_iter++; + y_iter++; + } + } + // second step: compress dims + int after_cmp_idx = 0; + for (int i = 0; i < 3; i++) { + cmp_x = xshape_after_remove_ones[idx]; + cmp_y = yshape_after_remove_ones[idx]; + while ((idx + 1) < xshape_after_remove_ones.size()) { + is_same = case_is_same(judge_case(xshape_after_remove_ones[idx], + yshape_after_remove_ones[idx]), + judge_case(xshape_after_remove_ones[idx + 1], + yshape_after_remove_ones[idx + 1])); + if (is_same) { + cmp_x = cmp_x * xshape_after_remove_ones[idx + 1]; + cmp_y = cmp_y * yshape_after_remove_ones[idx + 1]; + idx++; + } else { + break; + } + } + idx = idx + 1; + dim_after_cmp[after_cmp_idx] = cmp_x; + y_dim_after_cmp[after_cmp_idx] = cmp_y; + after_cmp_idx++; + if (idx == xshape_after_remove_ones.size()) { + dim_size_after_cmp = after_cmp_idx; + return 0; + } + } + return -1; // can not compress dims + } }; #pragma pack() @@ -199,6 +412,14 @@ __device__ __inline__ void Init(T* dst, T init_data) { } } +template +__device__ __inline__ void Init(T* dst, T init_data, int read_lens) { +#pragma unroll + for (int i = 0; i < read_lens; i++) { + dst[i] = init_data; + } +} + /** * The difference from the above function is that * it supports different data types of inputs. 
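// Illustrative trace (not part of the patch) of the shape compression performed
// by get_mnk_for_broadcast_ops above, for the shape vectors x = {2, 3, 1, 100}
// and y = {2, 3, 1, 1} as passed to the BroadcastConfig constructor:
//   step 1: erase axes where both shapes are 1        -> x = {2, 3, 100}, y = {2, 3, 1}
//   step 2: merge adjacent axes with the same
//           judge_case result (equal / x==1 / y==1)   -> x = {6, 100},    y = {6, 1}
// get_opt_type then matches this against the "{m, n} op {m}" branch, i.e.
// cmp_type = OptType::MN_M with m = 6 and n = 100, so operator() resolves the
// source index with a single "index_output % m" instead of the per-dimension
// stride/divmod walk used in the CanNotOptimize fallback.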
@@ -251,6 +472,26 @@ __device__ __inline__ void ReadData(T* dst, } } +template +__device__ __inline__ void ReadData(T* dst, + const T _global_ptr_* src, + int num, + int read_lens) { + int thread_offset = core_id() * read_lens; + __local__ T in_temp[1]; + if (IsBoundary) { // core_num() * read_lens > num +#pragma unroll + for (int idx = 0; idx < read_lens; ++idx) { + if (idx + thread_offset < num) { + GM2LM(src + thread_offset + idx, in_temp, sizeof(T)); + dst[idx] = in_temp[0]; + } + } + } else { // core_num() * read_lens < num + GM2LM(src + thread_offset, dst, read_lens * sizeof(T)); + } +} + /** * @brief Read 1D data from global memory to register. The difference * from the above function is that it supports different data types of inputs. @@ -312,7 +553,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst, * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For xpu, * core_id() is used as the index. - * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than * NX x NY x core_num(), boundary judgment is required to avoid memory access @@ -328,16 +568,11 @@ __device__ __forceinline__ void ReadData(ArgsT* dst, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template +template __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, uint32_t block_offset, - details::BroadcastConfig config, + const details::BroadcastConfig& config, int total_num_output, int stride_nx, int stride_ny) { @@ -479,10 +714,32 @@ __device__ __forceinline__ void ReadDataReduce( * size: The current block needs to load size elements continuously. */ +template +__device__ void WriteData(T _global_ptr_* dst, + const T* src, + int num, + int read_lens) { + int thread_offset = core_id() * read_lens; + __local__ T in_temp[1]; + + if (IsBoundary) { // core_num() * read_lens > num +#pragma unroll + for (int idx = 0; idx < read_lens; ++idx) { + if (idx + thread_offset < num) { + in_temp[0] = src[idx]; + LM2GM(in_temp, dst + idx + thread_offset, sizeof(T)); + } + } + } else { // core_num() * read_lens < num + LM2GM(src, dst + thread_offset, read_lens * sizeof(T)); + } +} + template __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) { int thread_offset = core_id() * NX; __local__ T in_temp[1]; + if (IsBoundary) { // core_num() * NX > num #pragma unroll for (int idx = 0; idx < NX; ++idx) { @@ -621,6 +878,272 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { } } +/** + * @brief Read data from global memory to local memory with broadcast + * {m, 1, k}-> {m, n, k} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. 
+ */ +template +__device__ __inline__ void ReadDataBcM1kMnk( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + + int m_pos = index_base % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, dst, last_col * sizeof(T)); + int n_pos = index_output % (m * n) / m; + int next_part_index = 0; + if (n_pos != config.n - 1) { + next_part_index = index_base / m * m; + } else { + next_part_index = (index_base / m + 1) * m; + } + GM2LM(src + next_part_index, + dst + last_col, + (read_lens - last_col) * sizeof(T)); + } else { + GM2LM(src + index_base, dst, read_lens * sizeof(T)); + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {m, 1}-> {m, n} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBcM1Mn( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + + int m_pos = index_base % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, dst, last_col * sizeof(T)); + GM2LM(src, dst + last_col, (read_lens - last_col) * sizeof(T)); + } else { + GM2LM(src + index_base, dst, read_lens * sizeof(T)); + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1, n}-> {m, n} form. + * + * @template paraments + * T: Data type of register. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBc1NMn( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + T in_temp; + + int m_pos = index_output % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < last_col; i++) { + dst[i] = in_temp; + } + GM2LM(src + index_base + 1, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens - last_col; i++) { + dst[last_col + i] = in_temp; + } + } else { + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1, n, 1}-> {m, n, k} form. + * + * @template paraments + * T: Data type of register. 
+ * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBc1N1Mnk( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + T in_temp; + + int m_pos = index_output % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < last_col; i++) { + dst[i] = in_temp; + } + int n_pos = index_output % (m * n) / m; + int next_part_index = 0; + if (n_pos != n - 1) { + next_part_index = n_pos + 1; + } else { + next_part_index = 0; + } + GM2LM(src + next_part_index, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens - last_col; i++) { + dst[last_col + i] = in_temp; + } + } else { + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1}-> {n} form. + * + * @template paraments + * T: Data type of register. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBc1N(T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + T in_temp; + + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * form which can not compress. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * total_num_output: Total number of original output. + * read_lens: The number of data continuously loaded by each thread. 
+ */ +template +__device__ __inline__ void ReadDataBcCanNotCmp( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int total_num_output, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + T in_temp; + int cache_size = 256; + __local__ T src_temp[cache_size]; + GM2LM(src + index_base, src_temp, cache_size * sizeof(T)); + + for (int nx = 0; nx < read_lens; ++nx) { + index_output = thread_offset + nx; + if (IsBoundary) { + if (index_output >= total_num_output) { + break; + } + } + int index_src = config(index_output); + if (index_src >= index_base && index_src < index_base + cache_size) { + in_temp = src_temp[index_src - index_base]; + } else { + GM2LM(src + index_src, &in_temp, sizeof(T)); + } + dst[nx] = in_temp; + } +} + /** * @brief Read 1D data from global memory to register with broadcast form. * @@ -630,7 +1153,6 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For xpu, * core_id() is used as the index. - * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than * NX x NY x core_num(), boundary judgment is required to avoid memory access @@ -642,36 +1164,31 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { * block_offset: The data offset of this block, core_num() * blockIdx.x * NX; * config: Calculation configuration of broadcast. It is used to calculate the * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. * total_num_output: Total number of original output. 
*/ -template -__device__ __inline__ void ReadDataBc( - T* dst, - const T _global_ptr_* src, - uint32_t block_offset, - const details::BroadcastConfig& config, - int total_num_output) { - int thread_offset = block_offset + core_id() * NX; - int index_src = 0; +template +__device__ __inline__ void ReadDataBc(T* dst, + const T _global_ptr_* src, + uint32_t block_offset, + const details::BroadcastConfig& config, + int total_num_output, + int read_lens) { + int thread_offset = block_offset + core_id() * read_lens; - __local__ T in_temp; -#pragma unroll - for (int nx = 0; nx < NX; ++nx) { - int index_output = thread_offset + nx; - index_src = 0; - if (IsBoundary) { - if (index_output >= total_num_output) { - break; - } - } - index_src = config(index_output); - GM2LM(src + index_src, &in_temp, sizeof(T)); - dst[nx] = in_temp; + if (config.cmp_type == details::OptType::MNK_M1K) { + ReadDataBcM1kMnk(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::N_1) { + ReadDataBc1N(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MN_M) { + ReadDataBcM1Mn(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MN_N) { + ReadDataBc1NMn(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MNK_1N1) { + ReadDataBc1N1Mnk(dst, src, thread_offset, config, read_lens); + } else { + ReadDataBcCanNotCmp( + dst, src, thread_offset, config, total_num_output, read_lens); } } diff --git a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h index fdcbb5ec9cc8d..35cea6e692787 100644 --- a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h @@ -55,21 +55,21 @@ struct DivideFunctor { inline DivideFunctor() { n_inv = static_cast(1.0f); } explicit inline DivideFunctor(int n) - : n_inv(static_cast(((float)1.0) / (static_cast(n)))) {} + : n_inv(static_cast(1.0f / (static_cast(n)))) {} inline Ty operator()(const Tx& x) const { return static_cast(x * n_inv); } __device__ inline DivideFunctor() { n_inv = static_cast(1.0f); } __device__ inline DivideFunctor(int n) - : n_inv(static_cast(((float)1.0) / (static_cast(n)))) {} + : n_inv(static_cast(1.0f / (static_cast(n)))) {} __device__ inline Ty operator()(const Tx& x) const { return static_cast(x * n_inv); } __device__ inline void SetDiv(int n) { - n_inv = static_cast(((float)1.0) / (static_cast(n))); + n_inv = static_cast(1.0f / (static_cast(n))); } private: @@ -97,8 +97,7 @@ struct SquareFunctor { */ template struct MinFunctor { - inline T initial() { /*return static_cast(std::numeric_limits::max());*/ - } + inline T initial() { return static_cast(std::numeric_limits::max()); } __device__ T operator()(const T& a, const T& b) const { return (b < a) ? 
b : a; @@ -111,7 +110,7 @@ struct MinFunctor { template struct MaxFunctor { inline T initial() { - // return static_cast(std::numeric_limits::lowest()); + return static_cast(std::numeric_limits::lowest()); } __device__ T operator()(const T& a, const T& b) const { @@ -124,8 +123,7 @@ struct MaxFunctor { */ template struct AddFunctor { - inline T initial() { /*return static_cast(0.0f);*/ - } + inline T initial() { return static_cast(0.0f); } __device__ T operator()(const T a, const T b) const { return b + a; } }; @@ -135,8 +133,7 @@ struct AddFunctor { */ template struct MulFunctor { - inline T initial() { /*return static_cast(1.0f);*/ - } + inline T initial() { return static_cast(1.0f); } __device__ T operator()(const T& a, const T& b) const { return b * a; } }; @@ -146,8 +143,7 @@ struct MulFunctor { */ template struct LogicalOrFunctor { - inline T initial() { /*return static_cast(false);*/ - } + inline T initial() { return static_cast(false); } __device__ T operator()(const T& a, const T& b) const { return b || a; } }; diff --git a/paddle/phi/kernels/primitive/kernel_primitives.h b/paddle/phi/kernels/primitive/kernel_primitives.h index b5a1e88acc32b..f68a046ae077a 100644 --- a/paddle/phi/kernels/primitive/kernel_primitives.h +++ b/paddle/phi/kernels/primitive/kernel_primitives.h @@ -40,12 +40,15 @@ #define GRID_NUM_X cluster_num() #define GRID_NUM_Y 0 #define GRID_NUM_Z 0 - +#define VecSizeL 512 +#define VecSizeM 256 +#define VecSizeS 128 #else #define KPStream gpuStream_t #define KPDevice phi::GPUContext #define _ptr_ +#define __simd__ #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y @@ -63,6 +66,9 @@ #define GRID_NUM_Y gridDim.y #define GRID_NUM_Z gridDim.z +#define VecSizeL 4 +#define VecSizeM 2 +#define VecSizeS 1 #endif // include file diff --git a/paddle/phi/kernels/sparse/activation_grad_kernel.cc b/paddle/phi/kernels/sparse/activation_grad_kernel.cc deleted file mode 100644 index 9eca14e660939..0000000000000 --- a/paddle/phi/kernels/sparse/activation_grad_kernel.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/sparse/activation_grad_kernel.h" -#include "paddle/phi/kernels/activation_grad_kernel.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { -namespace sparse { - -template -void SparseReluGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const SparseCooTensor& out_grad, - SparseCooTensor* x_grad) { - DenseTensor non_zero_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor non_zero_elements = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &non_zero_indices); - phi::ReluGradKernel(dev_ctx, - x.non_zero_elements(), - out_grad.non_zero_elements(), - &non_zero_elements); - x_grad->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_relu_grad, - CPU, - ALL_LAYOUT, - phi::sparse::SparseReluGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_relu_grad, - GPU, - ALL_LAYOUT, - phi::sparse::SparseReluGradKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -#endif diff --git a/paddle/phi/kernels/sparse/activation_grad_kernel.h b/paddle/phi/kernels/sparse/activation_grad_kernel.h deleted file mode 100644 index aab4a3e5a590b..0000000000000 --- a/paddle/phi/kernels/sparse/activation_grad_kernel.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/sparse_coo_tensor.h" - -namespace phi { -namespace sparse { - -template -void SparseReluGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const SparseCooTensor& out_grad, - SparseCooTensor* x_grad); - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/activation_kernel.cc b/paddle/phi/kernels/sparse/activation_kernel.cc deleted file mode 100644 index a1a00897d33cf..0000000000000 --- a/paddle/phi/kernels/sparse/activation_kernel.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/sparse/activation_kernel.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { -namespace sparse { - -template -void SparseReluKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { - DenseTensor non_zero_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor non_zero_elements = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &non_zero_indices); - phi::ReluKernel( - dev_ctx, x.non_zero_elements(), &non_zero_elements); - out->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_relu, - CPU, - ALL_LAYOUT, - phi::sparse::SparseReluKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_relu, - GPU, - ALL_LAYOUT, - phi::sparse::SparseReluKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -#endif diff --git a/paddle/phi/kernels/sparse/activation_kernel.h b/paddle/phi/kernels/sparse/activation_kernel.h deleted file mode 100644 index 568c0aa8b2ecb..0000000000000 --- a/paddle/phi/kernels/sparse/activation_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/activation_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -namespace phi { -namespace sparse { - -template -void SparseReluKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out); - -template -SparseCooTensor SparseRelu(const Context& dev_ctx, const SparseCooTensor& x) { - DenseTensor indices, values; - SparseCooTensor coo(indices, values, x.dims()); - SparseReluKernel(dev_ctx, x, &coo); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 6120d6339a7eb..62a72a9dd4115 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/sparse/copy_kernel.h b/paddle/phi/kernels/sparse/copy_kernel.h index a43621a4dfeed..70e2aaef8a888 100644 --- a/paddle/phi/kernels/sparse/copy_kernel.h +++ b/paddle/phi/kernels/sparse/copy_kernel.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 69ac0417f763d..2301d31d7a6c2 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -171,24 +171,17 @@ void SparseCooToCsrKernel(const Context& dev_ctx, int batchs = x_dims.size() == 2 ? 1 : x_dims[0]; int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1]; - const auto place = dev_ctx.GetPlace(); - DenseTensorMeta crows_meta( - DataType::INT64, {batchs * (rows + 1)}, DataLayout::NCHW); - DenseTensorMeta cols_meta(DataType::INT64, {non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {non_zero_num}, x.non_zero_elements().layout()); - phi::DenseTensor non_zero_crows( - phi::make_intrusive(place), - std::move(crows_meta)); - phi::DenseTensor non_zero_cols( - phi::make_intrusive(place), - std::move(cols_meta)); - phi::DenseTensor non_zero_elements( - phi::make_intrusive(place), - std::move(values_meta)); - int64_t* csr_crows_data = non_zero_crows.mutable_data(place); - int64_t* csr_cols_data = non_zero_cols.mutable_data(place); - T* csr_values_data = non_zero_elements.mutable_data(place); + phi::DenseTensor non_zero_crows; + non_zero_crows.Resize({batchs * (rows + 1)}); + int64_t* csr_crows_data = dev_ctx.template Alloc(&non_zero_crows); + + phi::DenseTensor non_zero_cols; + non_zero_cols.Resize({non_zero_num}); + int64_t* csr_cols_data = dev_ctx.template Alloc(&non_zero_cols); + + phi::DenseTensor non_zero_elements; + non_zero_elements.Resize({non_zero_num}); + T* csr_values_data = dev_ctx.template Alloc(&non_zero_elements); const auto& coo_indices = x.non_zero_indices(); const auto& coo_values = x.non_zero_elements(); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 960d7eab26463..b208e70e04046 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -22,6 +22,8 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/fluid/platform/enforce.h" + namespace phi { namespace sparse { @@ -173,20 +175,12 @@ void DenseToSparseCooKernel(const Context& dev_ctx, const auto values_dims = phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num); - DenseTensorMeta indices_meta(DataType::INT64, - {sparse_dim, static_cast(non_zero_num)}, - DataLayout::NCHW); - DenseTensorMeta values_meta(x.meta().dtype, values_dims, x.meta().layout); - phi::DenseTensor indices( - phi::make_intrusive( - dev_ctx.GetPlace()), - std::move(indices_meta)); - phi::DenseTensor values( - phi::make_intrusive( - dev_ctx.GetPlace()), - std::move(values_meta)); - int64_t* indices_data = indices.mutable_data(place); - T* sparse_data = values.mutable_data(place); + phi::DenseTensor indices = phi::Empty( + dev_ctx, {sparse_dim, static_cast(non_zero_num)}); + int64_t* indices_data = indices.data(); + phi::DenseTensor values; + values.Resize(values_dims); + T* sparse_data = dev_ctx.template Alloc(&values); // 3. calc indices by indexs and get values by indexs config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); @@ -382,24 +376,13 @@ void SparseCooToCsrKernel(const Context& dev_ctx, int batchs = x_dims.size() == 2 ? 1 : x_dims[0]; int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1]; - const auto place = dev_ctx.GetPlace(); - DenseTensorMeta crows_meta( - DataType::INT64, {batchs * (rows + 1)}, DataLayout::NCHW); - DenseTensorMeta cols_meta(DataType::INT64, {non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {non_zero_num}, x.non_zero_elements().layout()); - phi::DenseTensor non_zero_crows( - phi::make_intrusive(place), - std::move(crows_meta)); - phi::DenseTensor non_zero_cols( - phi::make_intrusive(place), - std::move(cols_meta)); - phi::DenseTensor non_zero_elements( - phi::make_intrusive(place), - std::move(values_meta)); - int64_t* csr_crows_data = non_zero_crows.mutable_data(place); - int64_t* csr_cols_data = non_zero_cols.mutable_data(place); - T* csr_values_data = non_zero_elements.mutable_data(place); + phi::DenseTensor non_zero_crows = + phi::Empty(dev_ctx, {batchs * (rows + 1)}); + phi::DenseTensor non_zero_cols = phi::Empty(dev_ctx, {non_zero_num}); + phi::DenseTensor non_zero_elements = phi::Empty(dev_ctx, {non_zero_num}); + int64_t* csr_crows_data = non_zero_crows.data(); + int64_t* csr_cols_data = non_zero_cols.data(); + T* csr_values_data = non_zero_elements.data(); const auto& coo_indices = x.non_zero_indices(); const auto& coo_values = x.non_zero_elements(); @@ -416,10 +399,8 @@ void SparseCooToCsrKernel(const Context& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); if (batchs > 1) { DenseTensorMeta batchs_meta(DataType::INT64, {batchs}, DataLayout::NCHW); - phi::DenseTensor batchs_offset( - phi::make_intrusive(place), - std::move(batchs_meta)); - int64_t* batchs_offset_ptr = batchs_offset.mutable_data(place); + phi::DenseTensor batchs_offset = phi::Empty(dev_ctx, {batchs}); + int64_t* batchs_offset_ptr = batchs_offset.data(); GetBatchsOffset<< \ + void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& out_grad, \ + SparseCooTensor* x_grad) { \ + DenseTensor non_zero_indices = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_indices()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ + 
phi::Copy(dev_ctx, \ + x_or_out.non_zero_indices(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_indices); \ + phi::DenseKernelFunc(dev_ctx, \ + x_or_out.non_zero_elements(), \ + out_grad.non_zero_elements(), \ + &non_zero_elements); \ + x_grad->SetMember( \ + non_zero_indices, non_zero_elements, x_or_out.dims(), true); \ + } \ + \ + template \ + void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& out_grad, \ + SparseCsrTensor* out) { \ + DenseTensor non_zero_crows = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_crows()); \ + DenseTensor non_zero_cols = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_cols()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x_or_out.non_zero_crows(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_crows); \ + phi::Copy(dev_ctx, \ + x_or_out.non_zero_cols(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_cols); \ + phi::DenseKernelFunc(dev_ctx, \ + x_or_out.non_zero_elements(), \ + out_grad.non_zero_elements(), \ + &non_zero_elements); \ + out->SetMember( \ + non_zero_crows, non_zero_cols, non_zero_elements, x_or_out.dims()); \ + } \ + } \ + } + +#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } +#else +// This macro definition is empty when GPU is disabled +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) +#endif + +#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +#define DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(kernel_name, \ + DenseKernelFunc) \ + DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ + REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +// NOTE: the following code is to bypass the restriction of Paddle +// kernel registration mechanism. Do NOT refactor them unless you +// know what you are doing. +// If you want to implement any new kernel, please follow `sin_grad`, +// `tanh_grad` etc, do NOT follow the following `relu_grad`. 
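// [Illustration, not part of the patch] The sparse unary wrappers defined above
// work because every wrapped activation here satisfies f(0) == 0 (relu, sqrt,
// sin, tanh), so implicit zeros stay zero and the output keeps the input's
// sparsity pattern; the grad wrappers rely on the same assumption for x_grad.
// That is why the macros can simply copy the indices (or crows/cols) and run
// the dense kernel on non_zero_elements only. A minimal host-side sketch of the
// same idea, with hypothetical names (CooVectorF, SparseUnary):
#include <cstdint>
#include <vector>

struct CooVectorF {               // toy 1-D COO tensor: indices + stored values
  std::vector<int64_t> indices;
  std::vector<float> values;
};

// Apply a zero-preserving elementwise function to a sparse vector.
template <typename F>
CooVectorF SparseUnary(const CooVectorF& x, F f) {
  CooVectorF out;
  out.indices = x.indices;        // sparsity pattern is reused unchanged
  out.values.reserve(x.values.size());
  for (float v : x.values) out.values.push_back(f(v));
  return out;
}

int main() {
  CooVectorF x{{1, 4}, {-2.0f, 3.0f}};
  CooVectorF y = SparseUnary(x, [](float v) { return v > 0.0f ? v : 0.0f; });
  // y.indices == {1, 4}, y.values == {0.0f, 3.0f}; all other entries remain 0.
  return 0;
}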
+DEFINE_SPARSE_UNARY_GRAD_KERNEL(ReluGradKernel) + +PD_REGISTER_KERNEL(sparse_coo_relu_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooReluGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} +PD_REGISTER_KERNEL(sparse_csr_relu_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCsrReluGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sparse_coo_relu_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooReluGradKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_csr_relu_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCsrReluGradKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} +#endif + +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sin_grad, SinGradKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(tanh_grad, TanhGradKernel) diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.h b/paddle/phi/kernels/sparse/unary_grad_kernel.h new file mode 100644 index 0000000000000..24ea4fee1a4fd --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_grad_kernel.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +#define DECLARE_SPARSE_UNARY_GRAD_KERNEL(name) \ + template \ + void SparseCoo##name##GradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + const SparseCooTensor& out_grad, \ + SparseCooTensor* x_grad); \ + \ + template \ + void SparseCsr##name##GradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + const SparseCsrTensor& out_grad, \ + SparseCsrTensor* x_grad); + +namespace phi { +namespace sparse { + +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Relu) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sqrt) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sin) + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_kernel.cc b/paddle/phi/kernels/sparse/unary_kernel.cc new file mode 100644 index 0000000000000..e02d7757664fa --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_kernel.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#define DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ + namespace phi { \ + namespace sparse { \ + \ + template \ + void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + SparseCooTensor* out) { \ + DenseTensor non_zero_indices = \ + phi::EmptyLike(dev_ctx, x.non_zero_indices()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x.non_zero_indices(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_indices); \ + phi::DenseKernelFunc( \ + dev_ctx, x.non_zero_elements(), &non_zero_elements); \ + out->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); \ + } \ + \ + template \ + void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + SparseCsrTensor* out) { \ + DenseTensor non_zero_crows = \ + phi::EmptyLike(dev_ctx, x.non_zero_crows()); \ + DenseTensor non_zero_cols = \ + phi::EmptyLike(dev_ctx, x.non_zero_cols()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x.non_zero_crows(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_crows); \ + phi::Copy(dev_ctx, \ + x.non_zero_cols(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_cols); \ + phi::DenseKernelFunc( \ + dev_ctx, x.non_zero_elements(), &non_zero_elements); \ + out->SetMember( \ + non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); \ + } \ + } \ + } + +#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } +#else +// This macro definition is empty when GPU is disabled +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) +#endif + +#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +#define DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(kernel_name, 
DenseKernelFunc) \ + DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ + REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +// NOTE: the following code is to bypass the restriction of Paddle +// kernel registration mechanism. Do NOT refactor them unless you +// know what you are doing. +// If you want to implement any new kernel, please follow `sin`, +// `tanh` etc, do NOT follow `sqrt`. +DEFINE_SPARSE_UNARY_KERNEL(SqrtKernel) + +PD_REGISTER_KERNEL(sparse_coo_sqrt, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooSqrtKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} +PD_REGISTER_KERNEL(sparse_csr_sqrt, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCsrSqrtKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sparse_coo_sqrt, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooSqrtKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_csr_sqrt, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCsrSqrtKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +#endif + +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(sin, SinKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(tanh, TanhKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(relu, ReluKernel) diff --git a/paddle/phi/kernels/sparse/unary_kernel.h b/paddle/phi/kernels/sparse/unary_kernel.h new file mode 100644 index 0000000000000..4470173c143db --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#define DECLARE_SPARSE_UNARY_KERNEL(name) \ + template \ + void SparseCoo##name##Kernel( \ + const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out); \ + \ + template \ + void SparseCsr##name##Kernel( \ + const Context& dev_ctx, const SparseCsrTensor& x, SparseCsrTensor* out); + +namespace phi { +namespace sparse { + +DECLARE_SPARSE_UNARY_KERNEL(Relu) +DECLARE_SPARSE_UNARY_KERNEL(Sqrt) +DECLARE_SPARSE_UNARY_KERNEL(Sin) + +template +SparseCooTensor SparseRelu(const Context& dev_ctx, const SparseCooTensor& x) { + DenseTensor indices, values; + SparseCooTensor coo(indices, values, x.dims()); + SparseCooReluKernel(dev_ctx, x, &coo); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/squeeze_kernel.h b/paddle/phi/kernels/squeeze_kernel.h index 22254eacfcefc..bd8f508cbb1db 100644 --- a/paddle/phi/kernels/squeeze_kernel.h +++ b/paddle/phi/kernels/squeeze_kernel.h @@ -23,6 +23,6 @@ template void SqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& axes, - DenseTensor* xshape, - DenseTensor* out); + DenseTensor* out, + DenseTensor* xshape); } // namespace phi diff --git a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc index 156cea63f171c..41889f9cc5ed7 100644 --- a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc +++ b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/strings/strings_copy_kernel.h" #include "paddle/phi/core/kernel_registry.h" +#include "glog/logging.h" + namespace phi { namespace strings { diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.h b/paddle/phi/kernels/strings/strings_empty_kernel.h index 1add1963614d8..8a014f2a78c2c 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.h +++ b/paddle/phi/kernels/strings/strings_empty_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/infermeta/strings/nullary.h" diff --git a/paddle/phi/kernels/strings/strings_lower_upper_kernel.h b/paddle/phi/kernels/strings/strings_lower_upper_kernel.h index 97f530164528a..db6c267a8586d 100644 --- a/paddle/phi/kernels/strings/strings_lower_upper_kernel.h +++ b/paddle/phi/kernels/strings/strings_lower_upper_kernel.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/infermeta/strings/unary.h" #include "paddle/phi/kernels/strings/case_utils.h" diff --git a/paddle/phi/kernels/tril_indices_kernel.h b/paddle/phi/kernels/tril_indices_kernel.h new file mode 100644 index 0000000000000..1132a539ee6d1 --- /dev/null +++ b/paddle/phi/kernels/tril_indices_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilIndicesKernel(const Context& dev_ctx, + int rows, + int cols, + int offset, + DataType dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/unsqueeze_kernel.h b/paddle/phi/kernels/unsqueeze_kernel.h index d18bde1c2efab..4622a9b0a859c 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.h +++ b/paddle/phi/kernels/unsqueeze_kernel.h @@ -24,6 +24,6 @@ template void UnsqueezeKernel(const Context& dev_ctx, const DenseTensor& x, const IntArray& axes, - DenseTensor* xshape, - DenseTensor* out); + DenseTensor* out, + DenseTensor* xshape); } // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 157eaa279debb..ac0c3021a3273 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -67,6 +67,7 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log, "log", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log2, "log2", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log10, "log10", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log1p, "log1p", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Celu, "celu", "alpha"); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardSwish, "hard_swish", "threshold" comma "scale" comma @@ -181,6 +182,30 @@ KernelSignature LogDoubleGradOpArgumentMapping( "log_double_grad", {"X", "DOut", "DDX"}, {}, {"DX", "DDOut"}); } +KernelSignature SqrtDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "sqrt_double_grad", {"Out", "DX", "DDX"}, {}, {"DOut", "DDOut"}); +} + +KernelSignature RsqrtDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "rsqrt_double_grad", {"Out", "DX", "DDX"}, {}, {"DOut", "DDOut"}); +} + +KernelSignature CeluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "celu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"}); +} + +KernelSignature SquareDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "square_double_grad", {"X", "DOut", "DDX"}, {}, {"DX", "DDOut"}); +} + KernelSignature PowOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("FactorTensor")) { return KernelSignature("pow", {"X"}, {"FactorTensor"}, {"Out"}); @@ -209,6 +234,10 @@ PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); PD_REGISTER_BASE_KERNEL_NAME(log_grad_grad, log_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(sqrt_grad_grad, sqrt_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(rsqrt_grad_grad, rsqrt_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(celu_grad_grad, celu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(square_grad_grad, square_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -229,7 +258,11 @@ 
PD_REGISTER_ARG_MAPPING_FN(square_grad, phi::SquareGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reciprocal_grad, phi::ReciprocalGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(sqrt_grad, phi::SqrtGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sqrt_grad_grad, + phi::SqrtDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad, phi::RsqrtGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rsqrt_grad_grad, + phi::RsqrtDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(mish_grad, phi::MishGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(stanh_grad, phi::STanhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(softplus_grad, phi::SoftplusGradOpArgumentMapping); @@ -286,3 +319,8 @@ PD_REGISTER_ARG_MAPPING_FN(floor_grad, phi::FloorGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(ceil_grad, phi::CeilGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(pow_grad, phi::PowGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(pow, phi::PowOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(celu_grad, phi::CeluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(celu_grad_grad, + phi::CeluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(square_grad_grad, + phi::SquareDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc index 0b3cc3425df45..5e45bcf97ce0e 100644 --- a/paddle/phi/ops/compat/einsum_sig.cc +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -17,14 +17,15 @@ limitations under the License. */ namespace phi { KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out"}); + return KernelSignature( + "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache"}); } KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("einsum_grad", - {"Operands", {"Out@GRAD"}}, + {"Operands", "InnerCache", "Out@GRAD"}, {"equation"}, - {{"Operands@GRAD"}}); + {"Operands@GRAD"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 13a5a6fd4a449..17fb1858373d9 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -25,6 +25,11 @@ KernelSignature ElementwiseAddOpArgumentMapping( return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"}); } +KernelSignature ElementwiseGradAddOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grad_add", {"X", "Y"}, {}, {"Out"}); +} + KernelSignature ElementwiseSubOpArgumentMapping( const ArgumentMappingContext& ctx) { int axis = paddle::any_cast(ctx.Attr("axis")); @@ -95,6 +100,16 @@ KernelSignature ElementwiseFloorDivOpArgumentMapping( return KernelSignature("floor_divide_raw", {"X", "Y"}, {"axis"}, {"Out"}); } +KernelSignature ElementwiseHeavisideOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("elementwise_heaviside", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature( + "elementwise_heaviside_raw", {"X", "Y"}, {"axis"}, {"Out"}); +} + KernelSignature ElementwisePowOpArgumentMapping( const ArgumentMappingContext& ctx) { int axis = paddle::any_cast(ctx.Attr("axis")); @@ -208,6 +223,15 @@ KernelSignature ElementwiseMinGradOpArgumentMapping( return KernelSignature( "minimum_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } + +KernelSignature ElementwiseHeavisideGradOpArgumentMapping( + 
const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_heaviside_grad", + {"X", "Y", "Out@GRAD"}, + {"axis"}, + {"X@GRAD", "Y@GRAD"}); +} + KernelSignature ElementwisePowGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("elementwise_pow_grad", @@ -258,6 +282,8 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mod, phi::ElementwiseModOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_floordiv, phi::ElementwiseFloorDivOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside, + phi::ElementwiseHeavisideOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_pow, phi::ElementwisePowOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, @@ -292,5 +318,8 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_max_grad, phi::ElementwiseMaxGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_min_grad, phi::ElementwiseMinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside_grad, + phi::ElementwiseHeavisideGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_pow_grad, phi::ElementwisePowGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(grad_add, phi::ElementwiseGradAddOpArgumentMapping); diff --git a/paddle/phi/ops/compat/instance_norm_sig.cc b/paddle/phi/ops/compat/instance_norm_sig.cc new file mode 100644 index 0000000000000..2b490078512b1 --- /dev/null +++ b/paddle/phi/ops/compat/instance_norm_sig.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
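// [Illustration, not part of the patch] The elementwise_heaviside mappings
// registered above only route the operator to its kernels; the math itself is
// the usual two-argument Heaviside step (NumPy convention, which this op
// appears to follow): 0 for x < 0, y at x == 0, 1 for x > 0. A hypothetical
// scalar reference (not a Paddle API); broadcasting is handled elsewhere
// exactly as for the other elementwise ops:
#include <cassert>

double HeavisideRef(double x, double y) {
  if (x < 0.0) return 0.0;
  if (x > 0.0) return 1.0;
  return y;  // value used at exactly x == 0
}

int main() {
  assert(HeavisideRef(-2.0, 0.5) == 0.0);
  assert(HeavisideRef(0.0, 0.5) == 0.5);
  assert(HeavisideRef(3.0, 0.5) == 1.0);
  return 0;
}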
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature InstanceNormOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm", + {"X", "Scale", "Bias"}, + {"epsilon"}, + {"Y", "SavedMean", "SavedVariance"}); +} + +KernelSignature InstanceNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm_grad", + {"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"}, + {"epsilon"}, + {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +} +KernelSignature InstanceNormDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm_double_grad", + {"X", + "Scale", + "SavedMean", + "SavedVariance", + "DY", + "DDX", + "DDScale", + "DDBias"}, + {"epsilon"}, + {"DX", "DScale", "DDY"}); +} +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(instance_norm_grad_grad, + instance_norm_double_grad); +PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad, + phi::InstanceNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad_grad, + phi::InstanceNormDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/scale_sig.cc b/paddle/phi/ops/compat/scale_sig.cc index 95deb007d99d9..8061a1fbd610a 100644 --- a/paddle/phi/ops/compat/scale_sig.cc +++ b/paddle/phi/ops/compat/scale_sig.cc @@ -30,7 +30,7 @@ namespace phi { * The infrt declare like: * * def PDKEL_Reshape_to_CPU : Pat< - * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), // OpMaker arguements + * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), // OpMaker arguments * (PDKEL_ReshapeKernelAttr $x, fn($shape_attr)>; // Kernel arguments * def PDKEL_Reshape_to_CPU : Pat< * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc index c65d77df9808e..cd6d5fc7253df 100644 --- a/paddle/phi/ops/compat/squeeze_sig.cc +++ b/paddle/phi/ops/compat/squeeze_sig.cc @@ -18,7 +18,7 @@ namespace phi { KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("squeeze", {"X"}, {"axes"}, {"XShape", "Out"}); + return KernelSignature("squeeze", {"X"}, {"axes"}, {"Out", "XShape"}); } KernelSignature SqueezeGradOpArgumentMapping( diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc index c802c2684b282..aee83933e5b97 100644 --- a/paddle/phi/ops/compat/unsqueeze_sig.cc +++ b/paddle/phi/ops/compat/unsqueeze_sig.cc @@ -21,14 +21,14 @@ KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.InputSize("AxesTensorList") > 0) { VLOG(2) << "unsqueeze2 in AxesTensorList"; return KernelSignature( - "unsqueeze", {"X"}, {"AxesTensorList"}, {"XShape", "Out"}); + "unsqueeze", {"X"}, {"AxesTensorList"}, {"Out", "XShape"}); } else if (ctx.InputSize("AxesTensor") > 0) { VLOG(2) << "unsqueeze2 in AxesTensor"; return KernelSignature( - "unsqueeze", {"X"}, {"AxesTensor"}, {"XShape", "Out"}); + "unsqueeze", {"X"}, {"AxesTensor"}, {"Out", "XShape"}); } else { VLOG(2) << "unsqueeze2 in axes"; - return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"XShape", "Out"}); + return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"Out", "XShape"}); } } diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h index 190fef3d94657..16143fb11e0ff 100644 --- a/paddle/phi/tests/api/scale_api.h +++ b/paddle/phi/tests/api/scale_api.h @@ -19,7 +19,6 @@ #include 
"paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_registry.h" @@ -69,10 +68,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x, kernel_context.EmplaceBackAttr(bias); kernel_context.EmplaceBackAttr(bias_after_scale); - auto dense_out = std::make_shared( - phi::make_intrusive( - phi::TransToPhiPlace(kernel_backend)), - phi::DenseTensorMeta()); + auto dense_out = std::make_shared(); phi::MetaTensor meta_out(dense_out.get()); phi::UnchangedInferMeta(*dense_x, &meta_out); kernel_context.EmplaceBackOutput(dense_out.get()); @@ -236,10 +232,7 @@ Tensor scale_switch_case(const Tensor& x, auto dense_x = std::dynamic_pointer_cast(x.impl()); - auto dense_out = std::make_shared( - phi::make_intrusive( - phi::TransToPhiPlace(kernel_backend)), - phi::DenseTensorMeta()); + auto dense_out = std::make_shared(); phi::MetaTensor meta_out(dense_out.get()); phi::UnchangedInferMeta(*dense_x, &meta_out); diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc index e2c324a6775c8..0d4ec7bd4f592 100644 --- a/paddle/phi/tests/api/test_matmul_api.cc +++ b/paddle/phi/tests/api/test_matmul_api.cc @@ -179,8 +179,18 @@ TEST(API, matmul_double_grad) { auto dx_grad = paddle::experimental::full({3, 3}, 2.0); // 2. test API - const auto out = paddle::experimental::matmul_double_grad( - x, y, out_grad, dx_grad, {}, false, false); + std::vector> out( + 3, std::vector(1)); + paddle::experimental::matmul_double_grad(x, + y, + out_grad, + dx_grad, + {}, + false, + false, + &out[0][0], + &out[1][0], + &out[2][0]); // 3. 
check result ASSERT_EQ(out.size(), 3UL); diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index 7c4aa16425907..c00113389adb7 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -77,11 +77,11 @@ void TestConv3dBase(const std::vector& indices, kernel.size() * sizeof(T)); if (!std::is_same::value) { - auto outs = paddle::experimental::sparse::conv3d( + auto tensor_out = paddle::experimental::sparse::conv3d( x, weight, paddings, dilations, strides, 1, false); - auto out = std::dynamic_pointer_cast( - std::get<0>(outs).impl()); + auto out = + std::dynamic_pointer_cast(tensor_out.impl()); ASSERT_EQ(correct_out_dims.size(), out->dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { ASSERT_EQ(correct_out_dims[i], out->dims()[i]); diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index ca6d20045d171..150336a1ed694 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -2,6 +2,7 @@ cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) +cc_test(phi_test_int_array SRCS test_int_array.cc DEPS int_array api_int_array phi phi_api) if (WITH_GPU) nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) endif() diff --git a/paddle/phi/tests/common/test_int_array.cc b/paddle/phi/tests/common/test_int_array.cc new file mode 100644 index 0000000000000..b6c4f2b1ea8e3 --- /dev/null +++ b/paddle/phi/tests/common/test_int_array.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/api/include/api.h" + +#include "paddle/phi/api/include/context_pool.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/kernels/full_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" + +#include "gtest/gtest.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +#endif + +namespace phi { +namespace tests { + +TEST(IntArray, ConstructFromCPUDenseTensor) { + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + const auto* dev_ctx = + static_cast(pool.Get(CPUPlace())); + phi::DenseTensor shape = Full(*dev_ctx, {2}, 3); + phi::DenseTensor out = Full(*dev_ctx, shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromCPUDenseTensorVector) { + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + const auto* dev_ctx = + static_cast(pool.Get(CPUPlace())); + phi::DenseTensor shape0 = Full(*dev_ctx, {1}, 3); + phi::DenseTensor shape1 = Full(*dev_ctx, {1}, 3); + std::vector shape{shape0, shape1}; + phi::DenseTensor out = Full(*dev_ctx, shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromCPUTensor) { + auto shape = paddle::experimental::full({2}, 3, DataType::INT64); + auto out = paddle::experimental::full(shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromCPUTensorVector) { + auto shape0 = paddle::experimental::full({2}, 3, DataType::INT64); + auto shape1 = paddle::experimental::full({2}, 3, DataType::INT32); + + std::vector shape{shape0, shape0}; + auto out = paddle::experimental::full(shape, 1); + + std::vector shape_new{shape0, shape1}; + auto out1 = paddle::experimental::full(shape_new, 1); + + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); + + ASSERT_EQ(out1.dims().size(), 2); + ASSERT_EQ(out1.dims()[0], 3); + ASSERT_EQ(out1.dims()[1], 3); + ASSERT_EQ(out1.numel(), 9); +} + +TEST(IntArray, ThrowException) { + auto shape = paddle::experimental::full({2}, 3, DataType::FLOAT32); + auto create_int_array = [&shape]() -> paddle::experimental::IntArray { + paddle::experimental::IntArray int_array{shape}; + return int_array; + }; + ASSERT_ANY_THROW(create_int_array()); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(IntArray, ConstructFromGPUDenseTensor) { + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + const auto* dev_ctx = + static_cast(pool.Get(GPUPlace())); + phi::DenseTensor shape = Full(*dev_ctx, {2}, 3); + phi::DenseTensor out = Full(*dev_ctx, shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromGPUDenseTensorVector) { + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + const auto* dev_ctx = + static_cast(pool.Get(GPUPlace())); + phi::DenseTensor shape0 = Full(*dev_ctx, {1}, 3); + phi::DenseTensor shape1 = Full(*dev_ctx, {1}, 3); + std::vector shape{shape0, shape1}; + phi::DenseTensor out = Full(*dev_ctx, shape, 1); + 
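The `ConstructFrom*Tensor` tests above build an `IntArray` shape argument out of `DenseTensor`s; the same behaviour surfaces at the Python level through APIs such as `paddle.full`, whose `shape` may be a plain list of ints, a Tensor, or a list mixing ints and 1-element Tensors. A hedged dynamic-graph sketch, with the accepted shape forms assumed from the tests above and the `full` documentation:

```python
import paddle

# shape supplied as an int64 Tensor holding [3, 3]; full() converts it to an
# IntArray internally, mirroring the ConstructFrom*Tensor cases above.
shape = paddle.full([2], 3, dtype='int64')
out = paddle.full(shape, 1.0)
assert out.shape == [3, 3]

# A list mixing plain ints and 1-element Tensors mirrors the *TensorVector cases.
dim = paddle.full([1], 3, dtype='int32')
out2 = paddle.full([dim, 3], 1.0)
assert out2.shape == [3, 3]
```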
ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromGPUTensor) { + auto shape = paddle::experimental::full({2}, 3, DataType::INT64, GPUPlace()); + auto out = paddle::experimental::full(shape, 1); + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); +} + +TEST(IntArray, ConstructFromGPUTensorVector) { + auto shape0 = paddle::experimental::full({2}, 3, DataType::INT64, GPUPlace()); + auto shape1 = paddle::experimental::full({2}, 3, DataType::INT32, GPUPlace()); + + std::vector shape{shape0, shape0}; + auto out = paddle::experimental::full(shape, 1); + + std::vector shape_new{shape0, shape1}; + auto out1 = paddle::experimental::full(shape_new, 1); + + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 3); + ASSERT_EQ(out.dims()[1], 3); + ASSERT_EQ(out.numel(), 9); + + ASSERT_EQ(out1.dims().size(), 2); + ASSERT_EQ(out1.dims()[0], 3); + ASSERT_EQ(out1.dims()[1], 3); + ASSERT_EQ(out1.numel(), 9); +} +#endif + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index 2a5b8ec8fa000..abd77e2862410 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_context.h" @@ -264,10 +263,7 @@ TEST(CustomKernel, custom_kernel_dot) { kernel_context.EmplaceBackAttr(fake_attr_int64_vec); kernel_context.EmplaceBackAttr(fake_attr_int_vec); - auto dense_out = std::make_shared( - phi::make_intrusive( - phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); + auto dense_out = std::make_shared(); phi::MetaTensor meta_out(dense_out.get()); phi::DotInferMeta(*dense_x, *dense_y, &meta_out); diff --git a/paddle/phi/tests/core/test_dense_tensor.cc b/paddle/phi/tests/core/test_dense_tensor.cc index ddfa184df2c1e..42814317b9c83 100644 --- a/paddle/phi/tests/core/test_dense_tensor.cc +++ b/paddle/phi/tests/core/test_dense_tensor.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "gtest/gtest.h" +#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/tests/core/allocator.h" diff --git a/paddle/phi/tests/core/test_sparse_coo_tensor.cc b/paddle/phi/tests/core/test_sparse_coo_tensor.cc index 5d0e16b0528e7..5e7642bbfdcb0 100644 --- a/paddle/phi/tests/core/test_sparse_coo_tensor.cc +++ b/paddle/phi/tests/core/test_sparse_coo_tensor.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "gtest/gtest.h" +#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/tests/core/allocator.h" diff --git a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc index 43640da270aad..05781156cd1d6 100644 --- a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc @@ -24,9 +24,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/activation_grad_kernel.h" #include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/activation_grad_kernel.h" -#include "paddle/phi/kernels/sparse/activation_kernel.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" namespace phi { namespace tests { @@ -70,7 +70,7 @@ TEST(DEV_API, sparse_relu) { SparseCooTensor sparse_out_grad( sparse_coo.non_zero_indices(), dense_out, {3, 4}); - sparse::SparseReluGradKernel( + sparse::SparseCooReluGradKernel( dev_ctx_cpu, sparse_coo, sparse_out_grad, &sparse_grad_x); cmp = memcmp(dense_grad_x.data(), diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 36154b23f3f12..0f70f9a8f3564 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -88,7 +88,6 @@ set UPLOAD_TP_FILE=OFF rem ------initialize set git config------ git config --global core.longpaths true - rem ------initialize the python environment------ set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe set PATH=%PYTHON_ROOT%\Scripts;%PYTHON_ROOT%;%PATH% @@ -305,10 +304,14 @@ if %errorlevel% NEQ 0 exit /b 1 call :cmake || goto cmake_error call :build || goto build_error -call :test_inference || goto test_inference_error -call :test_inference_ut || goto test_inference_ut_error +call :test_inference +if %errorlevel% NEQ 0 set error_code=%errorlevel% +call :test_inference_ut +if %errorlevel% NEQ 0 set error_code=%errorlevel% + call :zip_cc_file || goto zip_cc_file_error call :zip_c_file || goto zip_c_file_error +if %error_code% NEQ 0 goto test_inference_error goto:success rem "Other configurations are added here" @@ -760,12 +763,15 @@ for /F %%i in ("%libsize%") do ( cd /d %work_dir%\paddle\fluid\inference\api\demo_ci %cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT% "%CUDA_TOOLKIT_ROOT_DIR%" + goto:eof :test_inference_error ::echo 1 > %cache_dir%\error_code.txt ::type %cache_dir%\error_code.txt -echo Testing fluid library for inference failed! +echo ========================================== +echo Testing inference library failed! +echo ========================================== exit /b 1 rem --------------------------------------------------------------------------------------------- diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 3a2c51fe72b20..2eda74b769c04 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -344,6 +344,7 @@ function abort(){ } function check_style() { + set +x trap 'abort' 0 set -e @@ -368,7 +369,7 @@ function check_style() { if ! pre-commit run --files $file_name ; then commit_files=off fi - done + done export PATH=${OLD_PATH} @@ -378,6 +379,7 @@ function check_style() { exit 4 fi trap : 0 + set -x } #================================================= @@ -981,9 +983,6 @@ function generate_upstream_develop_api_spec() { rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` - if [[ ${cmake_change} ]];then - rm -rf ${PADDLE_ROOT}/build/third_party - fi cd ${PADDLE_ROOT} git checkout . 
@@ -995,6 +994,9 @@ function generate_upstream_develop_api_spec() { if [ "$url_return" == '200' ];then mkdir -p ${PADDLE_ROOT}/build/python/dist && wget -q -P ${PADDLE_ROOT}/build/python/dist ${dev_url} else + if [[ ${cmake_change} ]];then + rm -rf ${PADDLE_ROOT}/build/third_party + fi cmake_gen $1 build $2 fi @@ -3063,6 +3065,7 @@ function main() { ;; build_and_check_gpu) set +e + set +x check_style_info=$(check_style) check_style_code=$? example_info_gpu="" @@ -3074,6 +3077,7 @@ function main() { example_info=$(exec_samplecode_test cpu) example_code=$? summary_check_problems $check_style_code $[${example_code_gpu} + ${example_code}] "$check_style_info" "${example_info_gpu}\n${example_info}" + set -x assert_api_spec_approvals ;; check_whl_size) diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 64c88a47b4393..7669c06b2c2b7 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -1,3 +1,5 @@ add_subdirectory(string) -cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) + cc_test(array_ref_test SRCS array_ref_test.cc DEPS gtest gflags) +cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) +cc_test(variant_test SRCS variant_test.cc DEPS gtest) diff --git a/paddle/utils/optional.h b/paddle/utils/optional.h index eec5f32be7226..2b5a657f4d42e 100644 --- a/paddle/utils/optional.h +++ b/paddle/utils/optional.h @@ -100,7 +100,11 @@ class reference_content { public: // structors ~reference_content() {} +// TODO(zhiqiu): remove it +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" reference_content(RefT r) : content_(r) {} +#pragma GCC diagnostic pop reference_content(const reference_content& operand) : content_(operand.content_) {} diff --git a/paddle/utils/variant.h b/paddle/utils/variant.h index 50bdc4287e21a..4348abc9cbff0 100644 --- a/paddle/utils/variant.h +++ b/paddle/utils/variant.h @@ -2199,6 +2199,18 @@ class impl : public copy_assignment> { } } + inline const std::type_info &type() const { + return visitation::alt::visit_alt_at( + this->index(), +#ifdef MPARK_GENERIC_LAMBDAS + [](auto &alt) -> const std::type_info & { return typeid(alt.value); } +#else + typer {} +#endif + , + *this); + } + private: #ifndef MPARK_GENERIC_LAMBDAS struct swapper { @@ -2208,6 +2220,13 @@ class impl : public copy_assignment> { swap(this_alt.value, that_alt.value); } }; + + struct typer { + template + inline const std::type_info &operator()(Alt &alt) const { + return typeid(alt.value); + } + }; #endif inline constexpr bool move_nothrow() const { @@ -2432,6 +2451,8 @@ class variant { impl_.swap(that.impl_); } + inline const std::type_info &type() noexcept { return impl_.type(); } + private: detail::impl impl_; diff --git a/paddle/utils/variant_test.cc b/paddle/utils/variant_test.cc new file mode 100644 index 0000000000000..e690269d801c1 --- /dev/null +++ b/paddle/utils/variant_test.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/utils/variant.h" +#include "gtest/gtest.h" +#include "paddle/phi/core/enforce.h" + +TEST(interface_test, type) { + using phi::enforce::demangle; + + paddle::variant var; + + var = true; + EXPECT_EQ(demangle(var.type().name()), "bool"); + + var = 0; + EXPECT_EQ(demangle(var.type().name()), "int"); + + var = 0.f; + EXPECT_EQ(demangle(var.type().name()), "float"); +} diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index fe5f2c25ca551..fdcd560658146 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -14,6 +14,8 @@ elseif(WITH_ASCEND_CL) SET(PACKAGE_NAME "paddlepaddle-npu") elseif(WITH_XPU) SET(PACKAGE_NAME "paddlepaddle-xpu") +elseif(WITH_IPU) + SET(PACKAGE_NAME "paddlepaddle-ipu") else() SET(PACKAGE_NAME "paddlepaddle") endif() diff --git a/python/paddle/README.rst b/python/paddle/README.rst index e779f1264c451..2d48ee4b26caf 100644 --- a/python/paddle/README.rst +++ b/python/paddle/README.rst @@ -88,7 +88,7 @@ If you want to install paddlepaddle-gpu with cuda version of 9.0 ,10.0 ,10.1 ,or After the installation is complete, you can use `python` or `python3` to enter the Python interpreter and then use `import paddle.fluid` and `fluid.install_check.run_check()` -If `Your Paddle Fluid is installed succesfully!` appears, to verify that the installation was successful. +If `Your Paddle Fluid is installed successfully!` appears, to verify that the installation was successful. diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 8ef007a1a1bef..c667e2e8ca5ba 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -105,6 +105,7 @@ from .tensor.creation import assign # noqa: F401 from .tensor.creation import complex # noqa: F401 from .tensor.creation import clone # noqa: F401 +from .tensor.creation import tril_indices #noqa: F401 from .tensor.linalg import matmul # noqa: F401 from .tensor.linalg import dot # noqa: F401 from .tensor.linalg import norm # noqa: F401 @@ -270,6 +271,7 @@ from .tensor.math import fmin # noqa: F401 from .tensor.math import inner # noqa: F401 from .tensor.math import outer # noqa: F401 +from .tensor.math import heaviside # noqa: F401 from .tensor.math import frac # noqa: F401 from .tensor.random import bernoulli # noqa: F401 @@ -637,4 +639,6 @@ 'renorm', 'take_along_axis', 'put_along_axis', + 'heaviside', + 'tril_indices', ] diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 5132f23079f1f..96a94d898467f 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -54,25 +54,25 @@ def auto_cast(enable=True, with paddle.amp.auto_cast(): conv = conv2d(data) - print(conv.dtype) # FP16 + print(conv.dtype) # paddle.float32 with paddle.amp.auto_cast(enable=False): conv = conv2d(data) - print(conv.dtype) # FP32 + print(conv.dtype) # paddle.float32 with paddle.amp.auto_cast(custom_black_list={'conv2d'}): conv = conv2d(data) - print(conv.dtype) # FP32 + print(conv.dtype) # paddle.float32 a = paddle.rand([2,3]) b = paddle.rand([2,3]) with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}): c = a + b - print(c.dtype) # FP16 + print(c.dtype) # paddle.float32 with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'): d = a + b - print(d.dtype) # FP16 + print(d.dtype) # paddle.float32 """ return amp_guard(enable, custom_white_list, custom_black_list, level, dtype) diff --git a/python/paddle/autograd/primreg.py 
b/python/paddle/autograd/primreg.py deleted file mode 100644 index cffb4bc050b4b..0000000000000 --- a/python/paddle/autograd/primreg.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools - - -class Registry(object): - """ A general registry object. """ - __slots__ = ['name', 'tab'] - - def __init__(self, name): - self.name = name - self.tab = {} - - def register(self, name, value): - assert name not in self.tab - self.tab[name] = value - - def lookup(self, name): - assert name in self.tab, f'No registry entry is found with name: {name}' - return self.tab[name] - - -_primop_fn = Registry('primop_fn') -_orig2prim = Registry('orig2prim') -_prim2orig = Registry('prim2orig') -_primop_jvp = Registry('primop_jvp') -_primop_transpose = Registry('primop_transpose') -_primop_position_argnames = Registry('primop_position_argnames') - - -def REGISTER_FN(op_type, *position_argnames): - """Decorator for registering the Python function for a primitive op.""" - - assert isinstance(op_type, str) - - _primop_position_argnames.register(op_type, position_argnames) - - def wrapper(f): - _primop_fn.register(op_type, f) - return f - - return wrapper diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index b33dc1aaeb086..8cb4f5f765611 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -224,7 +224,7 @@ def max_memory_allocated(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_peak("Allocated", device_id) + return core.device_memory_stat_peak_value("Allocated", device_id) def max_memory_reserved(device=None): @@ -255,7 +255,7 @@ def max_memory_reserved(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_peak("Reserved", device_id) + return core.device_memory_stat_peak_value("Reserved", device_id) def memory_allocated(device=None): @@ -290,7 +290,7 @@ def memory_allocated(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_current("Allocated", device_id) + return core.device_memory_stat_current_value("Allocated", device_id) def memory_reserved(device=None): @@ -321,7 +321,7 @@ def memory_reserved(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." 
) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_current("Reserved", device_id) + return core.device_memory_stat_current_value("Reserved", device_id) def _set_current_stream(stream): diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 8c286c02015bf..31bdc4cc650af 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -1250,3 +1250,70 @@ def complete_update_annotation(self, serial_main_program=None): self._dist_context.set_op_dist_attr_for_program( op, op_dist_attr) continue + + def complete_prim_annotation(self, serial_main_program=None): + """ + fill default data parallel annotation for program with primitive operators. + + Arguments: + serial_main_program: partial annotated serial_main_program. + Returns: + serial_main_program: completed annotated serial_main_program. + """ + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context.serial_main_program = serial_main_program + + import time + + start_time = time.time() + self._dist_context._is_initialized = True + + start_time = time.time() + self._dist_context._init_dist_attr_for_program() + + start_time = time.time() + self._init_global_mesh_for_program() + + # Do the validation check and amend some completion + start_time = time.time() + self._dist_context.amend_dist_attr_for_program() + self._dist_context.validate_dist_attr_for_program() + + def _init_global_mesh_for_program(self): + # Copy the dist tensors and dist ops annotated by users from the default context + # global mesh + from paddle.distributed.auto_parallel.process_group import get_world_process_group + world_ranks = get_world_process_group().ranks + + for block in self._dist_context._serial_main_program.blocks: + for tensor in block.vars.values(): + # Copy the distributed tensors in the default context + dist_tensor = self._dist_context.get_dist_tensor_for_program( + tensor) + assert dist_tensor is not None + dist_tensor.dist_attr.process_mesh = world_ranks + for op in block.ops: + # Copy the distributed operators in the default context + dist_op = self._dist_context.get_dist_op_for_program(op) + assert dist_op is not None + dist_op.dist_attr.process_mesh = world_ranks + + # Find the most compatible implemenetations from the distributed operator + op_dist_impls = find_best_compatible_distributed_operator_impl( + dist_op, fwd=True) + if op_dist_impls is not None: + backup_op_dist_attr = copy.deepcopy(dist_op.dist_attr) + for op_dist_impl in op_dist_impls: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + dist_op.dist_attr.impl_type = "default" + else: + dist_op.dist_attr.impl_type = op_dist_impl.type + # op_dist_attr.impl_type = op_dist_impl.type + dist_op.dist_attr.impl_idx = op_dist_impl.idx + break + else: + dist_op.dist_attr = backup_op_dist_attr diff --git a/python/paddle/distributed/auto_parallel/cost/__init__.py b/python/paddle/distributed/auto_parallel/cost/__init__.py index 7bc8a81b79f8e..ea6b3bc5b7e76 100644 --- a/python/paddle/distributed/auto_parallel/cost/__init__.py +++ b/python/paddle/distributed/auto_parallel/cost/__init__.py @@ -12,9 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import OP_COST_FACTORY +from .base_cost import 
_g_op_cost_factory from .base_cost import Cost -from .comm_op_cost import AllreduceSumCost -from .comp_op_cost import MatmulV2OpCost +from .base_cost import CommContext +from .base_cost import build_comm_desc from .tensor_cost import TensorCost from .estimate_cost import CostEstimator + +from .comp_op_cost import MatmulV2OpCost + +from .comm_op_cost import SendOpCost +from .comm_op_cost import RecvOpCost +from .comm_op_cost import IdentityOpCost +from .comm_op_cost import BroadcastOpCost +from .comm_op_cost import AllgatherOpCost +from .comm_op_cost import AllreduceSumOpCost diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py index c4ebd836129e2..763f78c510615 100644 --- a/python/paddle/distributed/auto_parallel/cost/base_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/base_cost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,29 @@ # limitations under the License from collections import OrderedDict +from functools import reduce + import paddle +from ..cluster import LinkType +from ..process_group import get_process_group + COMM_OP_TYPE = [ - "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum" + "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum", + "c_identity" ] NON_COMP_TYPE = ["while"] + COMM_OP_TYPE -OP_COST_FACTORY = {} +_g_op_cost_factory = {} + + +def build_comm_desc(op_type, group_ranks, dtype, shape, attrs=None): + desc = {} + desc["op"] = op_type + desc["group_ranks"] = group_ranks + desc["inputs"] = {"X": [(dtype, shape)]} + if attrs is not None: + desc["attrs"] = attrs + return desc def _parse_op_to_desc(op, dist_context=None): @@ -126,66 +142,137 @@ class CommContext: _instance = None _has_instance = False - def __init__(self, cluster): - if CommContext._has_instance: - return - self.cluster = cluster - self._alpha_base_ring = 8.4 - self._alpha_base_tree = 0 - self._alpha_inter = None - self._alpha_intra - self._beta = {} - def __new__(cls, *args, **kwargs): if cls._instance is None: - cls._instance = super().__new__(cls, *args, **kwargs) + cls._instance = super().__new__(cls) _has_instance = True return cls._instance - @property - def alpha_inter(self): - if self._alpha_inter is None: - if cluster.alpha.inter == "NVL": - self._alpha_inter = 3.4 - elif cluster.alpha.inter == "PHB": - self._alpha_inter = 5.7 - return self._alpha_inter - - @property - def alpha_intra(self): - if self._alpha_intra is None: - if cluster.alpha.intra == "NVL": - self._alpha_intra = 28 - elif cluster.alpha.intra == "PHB": - self._alpha_intra = 28 - return self._alpha_intra - - @property - def alpha_base_ring(self): - return self._alpha_base_ring - - @property - def alpha_base_tree(self): - return self._alpha_base_tree - - def get_beta(self, ranks): + def __init__(self, cluster): + if CommContext._has_instance: + return + self.beta = {} + self.hops = {} + assert cluster is not None + self.cluster = cluster + # if cluster has no info about those vars, it will be set by default + self.base_ring = None + self.base_tree = None + # self.base_inter_ring = None + # self.base_inter_tree = None + self.intra_ring = None + self.intra_tree = None + self.inter_ring = None + self.inter_tree = None + self.switch = None + self._post_init() + + def 
_post_init(self): + alpha_latency = self.cluster.alpha_latency + if alpha_latency is None: + # set default + self.base_ring = 8.4 + self.base_tree = 0. + # NVL in default + self.intra_ring = 3.4 + self.intra_tree = 28 + # NET in default + self.inter_ring = 9.6 + self.inter_tree = 28 + self.switch = 10.0 + else: + base_ring = alpha_latency.base_ring + self.base_ring = base_ring if base_ring is not None else 8.4 + + base_tree = alpha_latency.base_tree + self.base_tree = base_tree if base_tree is not None else 0. + + intra_ring = alpha_latency.intra_ring + if intra_ring == LinkType.NVL: + self.intra_ring = 3.4 + elif intra_ring == LinkType.PHB: + self.intra_ring = 5.7 + elif intra_ring is not None: + self.intra_ring = intra_ring + else: + # NVL Default + self.intra_ring = 3.4 + + intra_tree = alpha_latency.intra_tree + if intra_tree == LinkType.NVL: + self.intra_tree = 28 + elif intra_tree == LinkType.PHB: + self.intra_tree = 28 + elif intra_tree is not None: + self.intra_tree = intra_tree + else: + # NVL Default + self.intra_tree = 28 + + inter_ring = alpha_latency.inter_ring + if inter_ring == LinkType.NET: + self.inter_ring = 9.6 + elif inter_ring is not None: + self.inter_ring = inter_ring + else: + # NET Default + self.inter_ring = 9.6 + + inter_tree = alpha_latency.inter_tree + if inter_tree == LinkType.NET: + self.inter_tree = 28 + elif inter_tree is not None: + self.inter_tree = inter_tree + else: + # NET Default + self.inter_tree = 28 + + switch = alpha_latency.switch + self.switch = switch if switch is not None else 10 + + assert self.base_ring is not None + assert self.base_tree is not None + assert self.intra_ring is not None + assert self.intra_tree is not None + assert self.inter_ring is not None + assert self.inter_tree is not None + assert self.switch is not None + + def get_max_beta(self, ranks): + # NOTE: Get beta by ring, even in the case of tree such as tree broadcast + ranks = self.cluster.convert_rank_to_device_id(ranks) key = ','.join(map(str, sorted(ranks))) max_beta = None - if key in self._beta.keys: - max_beta = self._beta[key] + if key in self.beta: + max_beta = self.beta[key] else: for i in range(len(ranks)): for j in range(i + 1, len(ranks)): - if min_beta == None: - min_beta = cluster.get_beta(ranks[i], ranks[j]) + forward_order_beta = self.cluster.get_beta(ranks[i], + ranks[j]) + backward_order_beta = self.cluster.get_beta(ranks[j], + ranks[i]) + beta = forward_order_beta if forward_order_beta > backward_order_beta else backward_order_beta + if max_beta == None: + max_beta = beta else: - beta = cluster.get_beta(ranks[i], ranks[j]) if beta > max_beta: max_beta = beta - self._beta[key] = max_beta + self.beta[key] = max_beta return max_beta + def get_hops(self, ranks): + key = ','.join(map(str, sorted(ranks))) + hops = 0 + for i in range(len(ranks)): + for j in range(i + 1, len(ranks)): + hop = self.cluster.get_hop(ranks[i], ranks[j]) + hops += hop + self.hops[key] = hops + + return hops + class Cost: def __init__(self, time=0, memory=0, flops=0): @@ -198,11 +285,13 @@ def _check_time(self, val): def _check_memory(self, val): assert isinstance( - val, int) and val >= 0, "Memory must be int and greater than 0." + val, + int) and val >= 0, "Memory must be int and greater than equal to 0." def _check_flops(self, val): assert isinstance( - val, int) and val >= 0, "FLOPs must be int and greater than 0." + val, + int) and val >= 0, "FLOPs must be int and greater than equal to 0." 
@property def time(self): @@ -250,11 +339,9 @@ def __sub__(self, rhs): class OpCost: def __init__(self, op=None, op_desc=None): - assert (op is not None and op_desc is None) or (op is None and - op_desc is not None) self._op = op self._op_desc = op_desc - self._cost = self.calc_cost() + self._cost = None @property def op(self): @@ -264,6 +351,18 @@ def op(self): def op_desc(self): return self._op_desc + @property + def time(self): + return self.cost.time + + @property + def memory(self): + return self.cost.memory + + @property + def flops(self): + return self.cost.flops + @property def cost(self): return self._cost @@ -284,6 +383,40 @@ def calc_cost(self): cost = Cost(time, memory, flops) return cost + def __add__(self, rhs): + assert isinstance(rhs, (OpCost, Cost)) + time = 0 + memory = 0 + flops = 0 + if isinstance(rhs, OpCost): + time = self.cost.time + rhs.cost.time + memory = self.cost.memory + rhs.cost.memory + flops = self.cost.flops + rhs.cost.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + elif isinstance(rhs, Cost): + time = self.time + rhs.time + memory = self.memory + rhs.memory + flops = self.flops + rhs.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + return Cost(time, memory, flops) + + def __sub__(self, rhs): + assert isinstance(rhs, (OpCost, Cost)) + time = 0 + memory = 0 + flops = 0 + if isinstance(rhs, OpCost): + time = self.cost.time - rhs.cost.time + memory = self.cost.memory - rhs.cost.memory + flops = self.cost.flops - rhs.cost.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + elif isinstance(rhs, Cost): + time = self.time - rhs.time + memory = self.memory - rhs.memory + flops = self.flops - rhs.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + return Cost(time, memory, flops) + class CommOpCost(OpCost): OP_TYPE = "COMM" @@ -292,11 +425,83 @@ def __init__(self, op=None, op_desc=None, comm_context=None): super(CommOpCost, self).__init__(op=op, op_desc=op_desc) self._check_comm_op_type() self._comm_context = comm_context + self._group_ranks = None + self._comm_count = None + self._hops = None + self._rank_count = len(self.group_ranks) + self._machine_count = None + self._cost = self.calc_cost() @property def comm_context(self): return self._comm_context + @property + def comm_count(self): + if self._comm_count is None: + dtype = None + shape = None + if self.op is not None: + vars = self.op.block.vars + # NOTE: The tensor communicated input_name is "X" in default. 
Otherwise, this function should be overrided + var_name = self.op.input("X")[0] + var = vars[var_name] + dtype = var.dtype + shape = var.shape + elif self.op_desc is not None: + dtype = self.op_desc["inputs"]["X"][0][0] + shape = self.op_desc["inputs"]["X"][0][1] + + factor = None + if dtype == paddle.float32 or dtype == paddle.int32: + factor = 4 + elif dtype == paddle.int64: + factor = 8 + elif dtype == paddle.uint8: + factor = 1 + elif dtype == paddle.float16: + factor = 2 + else: + raise TypeError("This dtype {} is not supported now".format( + dtype)) + comm_count = reduce(lambda x, y: x * y, shape) * factor + self._comm_count = comm_count + + return self._comm_count + + @property + def rank_count(self): + return self._rank_count + + @property + def machine_count(self): + if self._machine_count is None: + cluster = self._comm_context.cluster + self._machine_count = cluster.get_involved_machine_count( + self.group_ranks) + return self._machine_count + + @property + def hops(self): + if self._hops is None: + self._hops = self.comm_context.get_hops(self.group_ranks) + return self._hops + + @property + def group_ranks(self): + if self._group_ranks is None: + if self.op_desc is not None: + self._group_ranks = self.op_desc["group_ranks"] + elif self.op is not None: + ring_id = op.attrs("ring_id") + process_group = get_process_group(ring_id) + if process_group is None: + raise ValueError( + "There not exists process group whose ring_id is {}.". + format(ring_id)) + self._group_ranks = process_group.ranks + return self._group_ranks + @classmethod def _check_comm_op_type(cls): if cls.OP_TYPE != "COMM": @@ -311,6 +516,7 @@ class CompOpCost(OpCost): def __init__(self, op=None, op_desc=None, cluster=None): super(CompOpCost, self).__init__(op=op, op_desc=op_desc) self._check_comp_op_type() + self._cost = self.calc_cost() self.cluster = cluster @classmethod @@ -325,18 +531,22 @@ def register_op_cost(cls): op_type = cls.OP_TYPE def register(op_type): - OP_COST_FACTORY[op_type] = cls + global _g_op_cost_factory + _g_op_cost_factory[op_type] = cls - return register(op_type) + register(op_type) + return cls -def calc_time_from_model(op=None, desc=None, cluster=None, comm_context=None): +def calc_time_by_modeling(op=None, desc=None, cluster=None): op_type = op.type if op is not None else desc["op"] if op_type in COMM_OP_TYPE: - op_cost = OP_COST_FACTORY[op_type](op=op, - op_desc=desc, - comm_context=comm_context) + op_cost = _g_op_cost_factory[op_type](op=op, + op_desc=desc, + comm_context=CommContext(cluster)) elif op_type not in NON_COMP_TYPE: - op_cost = OP_COST_FACTORY[op_type](op=op, op_desc=desc, cluster=cluster) + op_cost = _g_op_cost_factory[op_type](op=op, + op_desc=desc, + cluster=cluster) time = op_cost.calc_time() return time diff --git a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py index 359f6b6e7862c..a32fdf1824e62 100644 --- a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
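The hunk below fills in alpha-beta latency models for the collective ops, driven by the coefficients that CommContext initialises above (base/intra/inter latencies for ring and tree, plus a per-hop switch penalty) and by comm_count, the message size in bytes (numel times the dtype width: 4 for float32/int32, 8 for int64, 2 for float16, 1 for uint8). As a hedged restatement of the two all-reduce formulas, pulled out into standalone functions whose inputs are assumed to be supplied by CommContext and the cluster:

```python
import math

# Restates AllreduceSumOpCost.calc_time_ring / calc_time_tree from the hunk
# below; comm_bytes, beta and hops are provided by CommContext in base_cost.py.
def allreduce_ring_time(comm_bytes, ranks, machines, hops,
                        base_ring, intra_ring, inter_ring, switch, beta):
    alpha = base_ring
    alpha += 2 * (ranks - machines) * intra_ring
    alpha += 2 * (machines - 1) * (inter_ring + hops * switch)
    # classic ring all-reduce: startup latency + 2*(p-1)/p * message * beta
    return alpha + 2 * (ranks - 1) / ranks * comm_bytes * beta


def allreduce_tree_time(comm_bytes, ranks, machines, hops,
                        base_tree, intra_tree, inter_tree, switch, beta):
    alpha = base_tree
    alpha += 2 * (ranks / machines - 1) * intra_tree
    alpha += math.log2(machines) * (inter_tree + hops * switch)
    return alpha + 2 * comm_bytes * beta
```

calc_time then selects the ring variant when all ranks sit on a single machine and the tree variant when the group crosses machines.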
@@ -12,17 +12,149 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import register_op_cost, CommOpCost, OP_COST_FACTORY +import math + +from .base_cost import register_op_cost, CommOpCost, _g_op_cost_factory @register_op_cost -class AllreduceSumCost(CommOpCost): +class AllreduceSumOpCost(CommOpCost): OP_TYPE = "c_allreduce_sum" def __init__(self, op=None, op_desc=None, comm_context=None): - super(OP_COST_FACTORY["c_allreduce_sum"], self).__init__( + super(AllreduceSumOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + # use tree if cross machine and use ring if in a single machine + time = None + cluster = self.comm_context.cluster + if not cluster.cross_machine(self.group_ranks): + time = self.calc_time_ring() + else: + time = self.calc_time_tree() + + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + alpha += 2 * ( + self.rank_count - self.machine_count) * self.comm_context.intra_ring + alpha += 2 * (self.machine_count - 1) * ( + self.comm_context.inter_ring + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + 2 * (self.rank_count - 1 + ) / self.rank_count * self.comm_count * beta + + return time + + def calc_time_tree(self): + alpha = self.comm_context.base_tree + alpha += 2 * (self.rank_count / self.machine_count - 1 + ) * self.comm_context.intra_tree + alpha += math.log2(self.machine_count) * ( + self.comm_context.inter_tree + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + + time = alpha + 2 * self.comm_count * beta + + return time + + +@register_op_cost +class AllgatherOpCost(CommOpCost): + OP_TYPE = "c_allgather" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(AllgatherOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + time = self.calc_time_ring() + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + alpha += ( + self.rank_count - self.machine_count) * self.comm_context.intra_ring + alpha += (self.machine_count - 1) * ( + self.comm_context.inter_ring + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + (self.rank_count - 1 + ) / self.rank_count * self.comm_count * beta + return time + + +@register_op_cost +class BroadcastOpCost(CommOpCost): + OP_TYPE = "c_broadcast" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(BroadcastOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + time = self.calc_time_ring() + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + + return time + + +@register_op_cost +class IdentityOpCost(CommOpCost): + OP_TYPE = "c_identity" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(IdentityOpCost, self).__init__( op=op, op_desc=op_desc, comm_context=comm_context) def calc_time(self): - # NOTE: The actual formula will be filled in the future. 
return 0 + + +@register_op_cost +class RecvOpCost(CommOpCost): + OP_TYPE = "recv_v2" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(RecvOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + return time + + +@register_op_cost +class SendOpCost(CommOpCost): + OP_TYPE = "send_v2" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(SendOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + + return time diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index c4d88cb25dc1e..28d2e2d5a3088 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,628 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import Cost, register_op_cost, CompOpCost, OP_COST_FACTORY +from .base_cost import Cost, register_op_cost, CompOpCost, _g_op_cost_factory + + +@register_op_cost +class AssignOpCost(CompOpCost): + OP_TYPE = "assign" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(AssignOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class AssignValueOpCost(CompOpCost): + OP_TYPE = "assign_value" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(AssignValueOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class BeamSearchOpCost(CompOpCost): + OP_TYPE = "beam_search" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(BeamSearchOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class 
BeamSearchDecodeOpCost(CompOpCost): + OP_TYPE = "beam_search_decode" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(BeamSearchDecodeOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class CastOpCost(CompOpCost): + OP_TYPE = "cast" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(CastOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ConcatOpCost(CompOpCost): + OP_TYPE = "concat" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ConcatOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseAddOpCost(CompOpCost): + OP_TYPE = "elementwise_add" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseAddOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseAddGradOpCost(CompOpCost): + OP_TYPE = "elementwise_add_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseAddGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseDivOpCost(CompOpCost): + OP_TYPE = "elementwise_div" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseDivOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseDivGradOpCost(CompOpCost): + OP_TYPE = "elementwise_div_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseDivGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be 
filled in the future + return 0 + + +@register_op_cost +class ElementwiseMulOpCost(CompOpCost): + OP_TYPE = "elementwise_mul" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseMulOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseMulGradOpCost(CompOpCost): + OP_TYPE = "elementwise_mul_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseMulGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseSubOpCost(CompOpCost): + OP_TYPE = "elementwise_sub" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseSubOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class EmbeddingOpCost(CompOpCost): + OP_TYPE = "c_embedding" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(EmbeddingOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class EmbeddingGradOpCost(CompOpCost): + OP_TYPE = "c_embedding_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(EmbeddingGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class FillConstantOpCost(CompOpCost): + OP_TYPE = "fill_constant" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(FillConstantOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class FillConstantBatchSizeLikeOpCost(CompOpCost): + OP_TYPE = "fill_constant_batch_size_like" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(FillConstantBatchSizeLikeOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The 
actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class FillConstantBatchSizeLikeGradOpCost(CompOpCost): + OP_TYPE = "fill_constant_batch_size_like_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(FillConstantBatchSizeLikeGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class GatherOpCost(CompOpCost): + OP_TYPE = "gather" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(GatherOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class GeluOpCost(CompOpCost): + OP_TYPE = "gelu" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(GeluOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class GeluGradOpCost(CompOpCost): + OP_TYPE = "gelu_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(GeluGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class GreaterEqualOpCost(CompOpCost): + OP_TYPE = "greater_equal" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(GreaterEqualOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class IncrementOpCost(CompOpCost): + OP_TYPE = "increment" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(IncrementOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class IsEmptyOpCost(CompOpCost): + OP_TYPE = "is_empty" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(IsEmptyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + 
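These per-op computation cost classes are placeholders for now (their calc_time and calc_flops return 0), but the registration machinery is already live: @register_op_cost stores each class in _g_op_cost_factory keyed by its OP_TYPE, and calc_time_by_modeling in base_cost.py looks the class up from an op or op_desc. A hedged sketch of that lookup, assuming the modules touched by this change are importable:

```python
from paddle.distributed.auto_parallel.cost import _g_op_cost_factory

# Importing the cost package pulls in comm_op_cost.py and comp_op_cost.py,
# so every @register_op_cost class is reachable by its OP_TYPE string.
print(_g_op_cost_factory["c_allreduce_sum"].__name__)  # AllreduceSumOpCost
print(_g_op_cost_factory["is_empty"].__name__)         # IsEmptyOpCost
print(_g_op_cost_factory["layer_norm"].__name__)       # LayerNormOpCost
```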
+@register_op_cost +class LayerNormOpCost(CompOpCost): + OP_TYPE = "layer_norm" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LayerNormOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class LayerNormGradOpCost(CompOpCost): + OP_TYPE = "layer_norm_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LayerNormGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class LessThanOpCost(CompOpCost): + OP_TYPE = "less_than" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LessThanOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class LogicalNotOpCost(CompOpCost): + OP_TYPE = "logical_not" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LogicalNotOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class LogicalAndOpCost(CompOpCost): + OP_TYPE = "logical_and" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LogicalAndOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class LodResetOpCost(CompOpCost): + OP_TYPE = "lod_reset" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LodResetOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class LogOpCost(CompOpCost): + OP_TYPE = "log" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LogOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class 
LookupTableV2OpCost(CompOpCost): + OP_TYPE = "lookup_table_v2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LookupTableV2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class LookupTableV2GradOpCost(CompOpCost): + OP_TYPE = "lookup_table_v2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(LookupTableV2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MatmulOpCost(CompOpCost): + OP_TYPE = "matmul" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MatmulOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MatmulGradOpCost(CompOpCost): + OP_TYPE = "matmul_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MatmulGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 @register_op_cost @@ -20,10 +641,10 @@ class MatmulV2OpCost(CompOpCost): OP_TYPE = "matmul_v2" def __init__(self, op=None, op_desc=None, cluster=None): - super(OP_COST_FACTORY["matmul_v2"], self).__init__( + super(MatmulV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function needs to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index 857f141f30b1f..6fa5b756c75c3 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -485,10 +485,10 @@ def __str__(self): self.process_mesh) for arg_name, tensor_dist_attr in self.inputs_dist_attrs.items(): - str += "\n\t\t{}'s: {},".format(arg_name, tensor_dist_attr) + str += "\n\t\t{}'s (input): {},".format(arg_name, tensor_dist_attr) for arg_name, tensor_dist_attr in self.outputs_dist_attrs.items(): - str += "\n\t\t{}'s: {},".format(arg_name, tensor_dist_attr) + str += "\n\t\t{}'s (output): {},".format(arg_name, tensor_dist_attr) str += "\n\t\timpl type: {}, ".format(self._impl_type) str += "impl idx: {}".format(self._impl_idx) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py 
b/python/paddle/distributed/auto_parallel/dist_context.py index 5082ac987f456..7299f84504bf3 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -55,10 +55,10 @@ class DistributedContext: def __init__(self, serial_main_prog=None, serial_startup_prog=None, - dist_main_progs=None, - dist_startup_progs=None, - serial_loss=None, serial_optimizer=None, + serial_loss=None, + feed_vars=None, + fetch_vars=None, strategy=None): # Data members related to original programs (unchanged) self._original_serial_main_program = serial_main_prog @@ -75,8 +75,10 @@ def __init__(self, # Data members related to programs (changed) self._serial_main_program = None self._serial_startup_program = None - self._serial_loss = None - self._serial_optimizer = None + self._serial_loss = serial_loss + self._serial_optimizer = serial_optimizer + self._serial_feed_vars = feed_vars + self._serial_fetch_vars = fetch_vars # Data members related to the program self._dist_tensors_for_program = {} @@ -92,12 +94,8 @@ def __init__(self, # Data members related to the distributed programs # Distributed programs - self._dist_main_programs = dist_main_progs - if not self._dist_main_programs: - self._dist_main_programs = {} - self._dist_startup_programs = dist_startup_progs - if not self._dist_startup_programs: - self._dist_startup_programs = {} + self._dist_main_programs = {} + self._dist_startup_programs = {} # Distributed Strategy self._strategy = strategy @@ -117,6 +115,9 @@ def __init__(self, self._is_initialized = False + # flag whether scale gradient with dp size + self._gradient_scale = True + @property def serial_main_program(self): return self._serial_main_program @@ -132,34 +133,26 @@ def serial_main_program(self, program): def serial_startup_program(self): return self._serial_startup_program - # @serial_startup_program.setter - # def serial_startup_program(self, serial_startup_program): - # self._serial_startup_program = serial_startup_program - @property def serial_loss(self): return self._serial_loss - # @serial_loss.setter - # def serial_loss(self, serial_loss): - # self._serial_loss = serial_loss - @property def serial_optimizer(self): return self._serial_optimizer - # @serial_optimizer.setter - # def serial_optimizer(self, serial_optimizer): - # self._serial_optimizer = serial_optimizer + @property + def serial_feed_vars(self): + return self._serial_feed_vars + + @property + def serial_fetch_vars(self): + return self._serial_fetch_vars @property def strategy(self): return self._strategy - # @strategy.setter - # def strategy(self, strategy): - # self._strategy = strategy - @property def serial_graph(self): return self._serial_graph @@ -197,6 +190,14 @@ def has_annotation(self): return len(self._dist_tensors_for_program) or len( self._dist_ops_for_program) + @property + def gradient_scale(self): + return self._gradient_scale + + @gradient_scale.setter + def gradient_scale(self, gs): + self._gradient_scale = gs + def initialize(self): if not self._is_initialized: self._serial_main_program = self._original_serial_main_program.clone( @@ -678,7 +679,7 @@ def validate_dist_attr_for_program(self): dist_op.serial_op.type) if (dist_op is not None) and (not dist_op.validate_dist_attr()): assert False, "Operator {} has a wrong distributed attributes {}.".format( - dist_op.serial_op.type, dist_tensor.dist_attr) + dist_op.serial_op.type, dist_op.dist_attr) return True def __deepcopy__(self, memo): diff --git 
a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index cc08bc1a901b7..aa315db5292de 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -34,7 +34,7 @@ def __init__(self, self.data_parallel_world_size = data_parallel_world_size self.data_parallel_rank = data_parallel_rank self.drop_lost = drop_last - if data_parallel_world_size is not None: + if data_parallel_world_size is not None and batch_size is not None: assert batch_size % data_parallel_world_size == 0 @abc.abstractmethod @@ -56,12 +56,12 @@ def __init__(self, steps_per_epoch=None, data_parallel_world_size=None, data_parallel_rank=None, - drop_last=False, - sample_generator=True): + drop_last=False): self.feed_list = feed_list self.places = places self.steps_per_epoch = steps_per_epoch - self._sample_generator = sample_generator + self.dp_world_size = 1 if data_parallel_world_size is None else data_parallel_world_size + self.dp_rank = 0 if data_parallel_rank is None else data_parallel_rank super(NonIterableGeneratorLoader, self).__init__( dataset, batch_size, epochs, data_parallel_world_size, @@ -85,7 +85,10 @@ def _infer_steps(self): if self.steps_per_epoch is not None: return self.steps_per_epoch try: - steps_per_epoch = len(self.dataset) // self.batch_size + if self.batch_size is None: + steps_per_epoch = len(self.dataset) + else: + steps_per_epoch = len(self.dataset) // self.batch_size except: raise ValueError( "Pleace set `steps_per_epoch` or implement `__len__` methond in dataset class." @@ -102,17 +105,28 @@ def sample_data_generator(): for idx in range(len(data)): batch_data[idx].append(data[idx]) if (step + 1) % self.batch_size == 0: - yield batch_data + partial_data = [] + for d in batch_data: + array = np.array(d) + partial_data.append( + np.split(array, self.dp_world_size)[self.dp_rank]) + yield partial_data[:len(self.feed_list)] batch_data = None def batch_data_generator(): for data in self.dataset: data = flatten(data) - yield data + partial_data = [] + for d in data: + assert d.shape[0] % self.dp_world_size == 0, \ + "Please padding dataset with data parallel size" + partial_data.append( + np.split(d, self.dp_world_size)[self.dp_rank]) + yield partial_data[:len(self.feed_list)] dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=self.feed_list, capacity=70, iterable=False) - if self._sample_generator: + if self.batch_size is not None: dataloader.set_batch_generator(sample_data_generator, self.places) else: dataloader.set_batch_generator(batch_data_generator, self.places) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index ea6aeb513ffb9..c38953ca9e64d 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -34,12 +34,9 @@ from paddle.distributed.utils import get_logger from paddle.distributed.passes import new_pass, PassContext -from .mapper import mapping from .cluster import Cluster -from .reshard import Resharder -from .planner import Planner -from .completion import Completer -from .partitioner import Partitioner +from .planner_v2 import Planner +from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator from .dist_saver import DistributedSaver from .dist_loader import NonIterableGeneratorLoader @@ -48,8 +45,6 @@ from .process_group import get_all_process_groups, get_world_process_group from .dist_context 
import DistributedContext, get_default_distributed_context -paddle.enable_static() - class Engine: def __init__(self, @@ -79,13 +74,13 @@ def __init__(self, self._dist_main_progs = defaultdict(dict) # dist main programs self._dist_startup_progs = defaultdict(dict) # dist startup programs self._dist_contexts = {} - self._pass_contexts = {} self._feed_vars = {} self._fetch_vars = {} def prepare(self, optimizer=None, loss=None, + gradient_scale=True, metrics=None, mode='train', all_ranks=False): @@ -94,10 +89,28 @@ def prepare(self, self._loss = loss self._metrics = to_list(metrics) self._mode = mode - self._build(mode) # build forward program - self._plan(mode) # completion & planner - self._parallel(mode, all_ranks) # parallel - self._initialize(mode) # init comm and startup program + self._gradient_scale = gradient_scale + # Build forward program + self._build(mode) + # Do the planning process + planner = Planner(mode, self._dist_contexts[mode]) + planner.plan() + # Parallelize program based on the planner's results + # For now, the completer has to be passed to the planner, + # because we may use it to complete the annotation of the backwarkward and update. + parallelizer = Parallelizer(mode, planner.completer, + self._dist_contexts[mode]) + if not all_ranks: + parallelizer.parallel(self._cur_rank) + else: + parallelizer.parallel_all() + # Get the distributed main programs and startup programs + self._dist_main_progs[mode] = self._dist_contexts[ + mode].dist_main_programs + self._dist_startup_progs[mode] = self._dist_contexts[ + mode].dist_startup_programs + # Init comm and startup program + self._initialize(mode) def _build(self, mode): serial_main_prog = self._serial_main_progs.get(mode, None) @@ -133,34 +146,10 @@ def _build(self, mode): self._serial_main_progs[mode] = serial_main_prog self._serial_startup_progs[mode] = serial_startup_prog self._dist_contexts[mode] = DistributedContext( - serial_main_prog, serial_startup_prog, self._dist_main_progs[mode], - self._dist_startup_progs[mode]) - self._pass_contexts[mode] = PassContext() - - def _plan(self, mode): - - # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need - # dependency of backward-forward ops in forward completition. 
- defualt_ctx = get_default_distributed_context() - self._dist_contexts[mode]._dist_op_context = defualt_ctx.dist_op_context - - # Complete the distributed annotation - serial_main_prog = self._serial_main_progs[mode] - self._completer = Completer(self._dist_contexts[mode]) - self._completer.complete_forward_annotation(serial_main_prog) - # TODO: add auto planner process - # parse forward sub block - self._dist_contexts[mode].block_state.parse_forward_blocks( - serial_main_prog) - - def _parallel(self, mode, all_ranks=False): - if not all_ranks: - self._parallel_program(mode, self._cur_rank) - else: - world_process_group = get_world_process_group() - all_ranks = world_process_group.ranks - for rank in all_ranks: - self._parallel_program(mode, rank) + self._serial_main_progs[mode], self._serial_startup_progs[mode], + self._optimizer, losses, self._feed_vars[mode], + self._fetch_vars[mode], self.strategy) + self._dist_contexts[mode].gradient_scale = self._gradient_scale def _initialize(self, mode): if self._nranks > 1: @@ -189,145 +178,20 @@ def _initialize(self, mode): prune_startup_prog = dist_startup_prog._prune(uninitialized) self._executor.run(prune_startup_prog) - def _parallel_program(self, mode, rank): - serial_main_program = self._serial_main_progs[mode] - serial_startup_program = self._serial_startup_progs[mode] - dist_context = self._dist_contexts[mode] - if mode == "train" and self._optimizer: - # Generate backward - serial_loss = self._fetch_vars[mode]["loss"][0] - params_grads = self._generate_backward( - serial_main_program, serial_startup_program, serial_loss) - # Apply pre optimization passes - self._apply_pre_optimization(serial_main_program, - serial_startup_program, serial_loss, - params_grads) - # Do logical partition - partitioner = Partitioner(dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - serial_main_program, serial_startup_program, params_grads) - # Generate optimizer - self._generate_optimizer(dist_main_prog, dist_startup_prog, - dist_params_grads) - # Do reshard process - set_grad_var_shape(dist_main_prog, dist_context) - make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - dist_context, dist_params_grads) - resharder.reshard() - # Apply post optimization passes - self._apply_post_optimization(dist_main_prog, dist_startup_prog, - rank, dist_params_grads) - else: - # Apply pre optimization passes - self._apply_pre_optimization(serial_main_program, - serial_startup_program, None, None) - # Do logical partition - partitioner = Partitioner(dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - serial_main_program, serial_startup_program, []) - # Do reshard process - make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - dist_context, [], 1) - resharder.reshard() - - # clone program for test - if mode != 'train': - dist_main_prog = dist_main_prog.clone(for_test=True) - dist_startup_prog = dist_startup_prog.clone(for_test=True) - - self._dist_main_progs[mode][rank] = dist_main_prog - self._dist_startup_progs[mode][rank] = dist_startup_prog - - def _generate_backward(self, main_program, startup_program, loss): - with program_guard(main_program, startup_program): - params_grads = append_backward( - loss, - distop_context=self._dist_contexts[self.mode].dist_op_context) - 
self._completer.complete_backward_annotation(main_program) - self._dist_contexts[self.mode].block_state.parse_backward_blocks( - main_program) - return params_grads - - def _generate_optimizer(self, main_program, startup_program, params_grads): - with program_guard(main_program, startup_program): - optimizer_ops = copy.deepcopy(self._optimizer).apply_gradients( - params_grads) - self._completer.complete_update_annotation(main_program) - return optimizer_ops - - def _apply_pre_optimization(self, main_program, startup_program, loss, - params_grads): - - # apply amp pass - if self.strategy.amp: - config = copy.deepcopy(self.strategy.amp_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - config["loss"] = loss - config["input_data"] = self._feed_vars[self.mode][ - "inputs"] + self._feed_vars[self.mode]["labels"] - if config["use_pure_fp16"]: - config["base_opt"] = self._optimizer - auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) - auto_parallel_fp16_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - else: - auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) - auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_contexts[self.mode]) - - # apply recompute pass - if self.strategy.recompute: - config = copy.deepcopy(self.strategy.recompute_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["no_grad_set"] = None - config["loss"] = loss - auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", - config) - auto_parallel_recompute_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - - def _apply_post_optimization(self, main_program, startup_program, rank, - params_grads): - if self.strategy.sharding: - config = copy.deepcopy(self.strategy.sharding_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - config["global_rank"] = rank - auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", - config) - auto_parallel_sharding_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - - if self.strategy.gradient_merge: - config = copy.deepcopy(self.strategy.gradient_merge_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - auto_parallel_gradient_merge_pass = new_pass( - "auto_parallel_gradient_merge_pass", config) - auto_parallel_gradient_merge_pass.apply( - [main_program], [startup_program], - self._pass_contexts[self.mode]) - def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=None, use_program_cache=False, - return_numpy=True, - sample_generator=True): + return_numpy=True): # TODO: callbacks # TODO: evaluate after training self.mode = 'train' - assert self.mode in self._dist_main_progs, "train model is not ready, please call `engine.prepare(mode='train')` first." - train_dataloader = self._create_dataloader( - train_data, batch_size, epochs, steps_per_epoch, sample_generator) + assert self.mode in self._dist_main_progs, \ + "train model is not ready, please call `engine.prepare(mode='train')` first." 
+ train_dataloader = self._create_dataloader(train_data, batch_size, + epochs, steps_per_epoch) outputs = [] for epoch in range(epochs): @@ -346,12 +210,11 @@ def evaluate(self, eval_data, batch_size=1, use_program_cache=False, - return_numpy=True, - sample_generator=True): + return_numpy=True): self.mode = 'eval' - assert self.mode in self._dist_main_progs, "eval model is not ready, please call `engine.prepare(mode='eval')` first." - eval_dataloader = self._create_dataloader( - eval_data, batch_size, sample_generator=sample_generator) + assert self.mode in self._dist_main_progs, \ + "eval model is not ready, please call `engine.prepare(mode='eval')` first." + eval_dataloader = self._create_dataloader(eval_data, batch_size) outputs = [] for step, data in enumerate(eval_dataloader): @@ -365,12 +228,11 @@ def predict(self, test_data, batch_size=1, use_program_cache=False, - return_numpy=True, - sample_generator=True): + return_numpy=True): self.mode = 'predict' - assert self.mode in self._dist_main_progs, "predict model is not ready, please call `engine.prepare(mode='predict')` first." - test_dataloader = self._create_dataloader( - test_data, batch_size, sample_generator=sample_generator) + assert self.mode in self._dist_main_progs, \ + "predict model is not ready, please call `engine.prepare(mode='predict')` first." + test_dataloader = self._create_dataloader(test_data, batch_size) outputs = [] for step, data in enumerate(test_dataloader): @@ -441,21 +303,30 @@ def _create_dataloader(self, dataset, batch_size, epochs=1, - steps_per_epoch=None, - sample_generator=True): - feed_list = self._feed_vars[self.mode]["inputs"] + self._feed_vars[ - self.mode]["labels"] + steps_per_epoch=None): dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank] dist_context = self._dist_contexts[self.mode] dist_main_block = dist_main_prog.global_block() - serial_main_prog = self._serial_main_progs[self.mode] - serial_main_block = serial_main_prog.global_block() + + # get feed_list from dist_program + inputs_var = self._feed_vars[self.mode]["inputs"] + labels_var = self._feed_vars[self.mode]["labels"] + feed_list = [] + for var in inputs_var + labels_var: + if var.name in dist_main_block.vars: + feed_list.append(dist_main_block.vars[var.name]) + dp_world_size, dp_rank = self._get_data_parallel_info(feed_list[0], + dist_context) + + # remove the first three ops if multi run fit/evaluate/predict op_size = len(dist_main_block.ops) if dist_main_block.ops[0].type == 'create_py_reader': op_size -= 3 for _ in range(3): dist_main_block._remove_op(0, sync=False) + + # insert read op at the end of program places = paddle.static.cuda_places() with fluid.program_guard(dist_main_prog, dist_startup_prog): dataloader = NonIterableGeneratorLoader( @@ -465,7 +336,10 @@ def _create_dataloader(self, batch_size, epochs, steps_per_epoch, - sample_generator=sample_generator) + data_parallel_world_size=dp_world_size, + data_parallel_rank=dp_rank) + + # move read op from the end of program to the start of program new_op_size = len(dist_main_block.ops) for _ in range(new_op_size - 1, op_size - 1, -1): op = dist_main_block.ops[new_op_size - 1] @@ -474,17 +348,6 @@ def _create_dataloader(self, new_op = Operator( dist_main_block, new_op_desc, type=new_op_desc.type()) dist_main_block.ops.insert(0, new_op) - for in_name in new_op.input_arg_names: - if "lod_tensor_blocking_queue" in in_name: - continue - if in_name not in dist_main_block.vars: - in_var = 
serial_main_block._var_recursive(in_name) - dist_main_block._clone_variable(in_var, in_var.persistable) - for out_name in new_op.output_arg_names: - if out_name not in dist_main_block.vars: - out_var = serial_main_block._var_recursive(out_name) - dist_main_block._clone_variable(out_var, - out_var.persistable) dist_op = DistributedOperator(new_op) dist_context.add_dist_op_for_program(dist_op) for _ in range(new_op_size - op_size): @@ -524,6 +387,29 @@ def _set_data_parallel(self, var): return var + def _get_data_parallel_info(self, var, dist_context): + # get data parallel world size and current data parallel rank + from .utils import _get_comm_group, _get_corresponding_rank + + tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) + process_mesh = tensor_dist_attr.process_mesh + dims_mapping = tensor_dist_attr.dims_mapping + + if self._cur_rank not in process_mesh.processes: + rank_id = _get_corresponding_rank(dist_context, process_mesh, + self._cur_rank) + else: + rank_id = self._cur_rank + + batch_size_axis = dims_mapping[0] + if batch_size_axis > -1 and process_mesh.topology[batch_size_axis] > 1: + group_ranks = _get_comm_group(process_mesh.processes, + process_mesh.topology, + batch_size_axis, rank_id) + return len(group_ranks), group_ranks.index(rank_id) + + return None, None + def save(self, path, training=True, mode=None): if not mode: mode = self.mode diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 3f06b34b53ed9..3ff474697205e 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -32,3 +32,4 @@ from . import dist_slice from . import dist_fused_feedforward from . import dist_fused_attention +from . 
import dist_reduce_p diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 5d43c56827274..441eb88a9f1ee 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -24,9 +24,10 @@ def is_elementwise_op(op_type): - for eltwise_op in _g_elementwise_ops: - if eltwise_op in op_type: - return True + if op_type in _g_elementwise_ops: + return True + if "elementwise" in op_type: + return True return False diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 0696b728d161b..6d9b48ea1e87c 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -18,7 +18,7 @@ from .common import register_distributed_operator_impl, is_parameter_related from ..utils import is_dim_shard from ..utils import is_dim_replicate -from ..utils import is_valid_list_index +from ..utils import is_valid_list_index, is_prim_op from ..utils import compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping @@ -35,6 +35,55 @@ __op_not_need_param_init__ = ["while", "cond"] +def prim_operator_data_parallel_functor(ctx, src_op): + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + + var_name = src_op.output_arg_names[0] + if var_name in ctx.grads_params: + assert var_name not in ctx.synced_gradient, "in primtive mode, grad is already {} synced".format( + var_name) + ctx.synced_gradient.add(var_name) + sync_group = new_process_group(ctx.data_parallel_group) + + allreduce_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [var_name]}, + outputs={'Out': [var_name]}, + attrs={ + 'ring_id': sync_group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Backward + }) + + param = ctx.grads_params[var_name] + startup_block = dist_op_context.startup_block + new_op = startup_block.append_op( + type='c_broadcast', + inputs={'X': [param]}, + outputs={'Out': [param]}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + + grad_var = main_block.var(var_name) + dims_mapping = ctx.get_tensor_dist_attr_for_program( + grad_var).dims_mapping + dist_attr = ctx.get_op_dist_attr_for_program(src_op) + process_mesh = dist_attr.process_mesh + op_attr = OperatorDistributedAttribute() + op_attr.process_mesh = process_mesh + op_attr.set_output_dims_mapping(grad_var.name, dims_mapping) + op_attr.set_input_dims_mapping(grad_var.name, dims_mapping) + ctx.set_op_dist_attr_for_program(allreduce_op, op_attr) + + return + + class DistributedDefault(DistributedOperatorImplContainer): def __init__(self, op_type): super(DistributedDefault, self).__init__(op_type) @@ -201,10 +250,8 @@ def update_dims_mapping(self, dist_op): changed = False op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr - # The following statement will be replaced by a more elegent way - if op_desc.type() == "shape" \ - or op_desc.type() == "slice" \ - or op_desc.type() == "while": + + if op_desc.type() == "while": return False input_names = op_desc.input_names() @@ -273,6 +320,8 @@ def update_dims_mapping(self, dist_op): )[0]) if input_tensor.is_parameter: continue + if op_desc.type() in 
["shape", "slice"]: + continue serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue @@ -292,7 +341,6 @@ def update_dims_mapping(self, dist_op): @staticmethod def forward(ctx, *args, **kwargs): - dist_op_context = ctx.dist_op_context main_block = dist_op_context.work_block startup_block = dist_op_context.startup_block @@ -315,7 +363,7 @@ def forward(ctx, *args, **kwargs): output_name) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): @@ -323,7 +371,12 @@ def forward(ctx, *args, **kwargs): for output_name in src_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) - main_block._sync_with_cpp() + # data parallel synchronization for primtive operators + from paddle.incubate.autograd import prim_enabled + if prim_enabled(): + assert is_prim_op(src_op) + prim_operator_data_parallel_functor(ctx, src_op) + return # param initialization sync if src_op.type in __op_not_need_param_init__: @@ -373,8 +426,6 @@ def forward(ctx, *args, **kwargs): op_attr.set_input_dims_mapping(param.name, dims_mapping) ctx.set_op_dist_attr_for_program(new_op, op_attr) - startup_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): @@ -457,6 +508,7 @@ def backward(ctx, *args, **kwargs): if len(allreduce_vars) > 0: for varname in allreduce_vars: + added_ops = [] grad_var = main_block.var(varname) allreduce_op = main_block.append_op( @@ -468,20 +520,23 @@ def backward(ctx, *args, **kwargs): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Backward }) + added_ops.append(allreduce_op) - scale_op = main_block.append_op( - type='scale', - inputs={'X': grad_var}, - outputs={'Out': grad_var}, - attrs={ - 'scale': 1.0 / dp_degree, - OP_ROLE_KEY: OpRole.Backward - }) + if ctx.gradient_scale: + scale_op = main_block.append_op( + type='scale', + inputs={'X': grad_var}, + outputs={'Out': grad_var}, + attrs={ + 'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward + }) + added_ops.append(scale_op) dims_mapping = ctx.get_tensor_dist_attr_for_program( grad_var).dims_mapping process_mesh = dist_attr.process_mesh - for op in [allreduce_op, scale_op]: + for op in added_ops: op_attr = OperatorDistributedAttribute() op_attr.process_mesh = process_mesh op_attr.set_output_dims_mapping(grad_var.name, diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py index aac7f16b6909b..78589afc498ee 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py @@ -80,12 +80,20 @@ def is_output_compatible(self, dist_op): op_dist_attr = dist_op.dist_attr dims_mapping_list = [] output_arg_names = op_desc.output_arg_names() + max_dims_mapping_len = -1 for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if max_dims_mapping_len < len(dims_mapping): + max_dims_mapping_len = len(dims_mapping) dims_mapping_list.append(dims_mapping) - if compute_compatible_dims_mapping(dims_mapping_list) is None: - return False + for idx in range(max_dims_mapping_len): + dim_mappings = [] + for dims_mapping in dims_mapping_list: + if idx < len(dims_mapping): + dim_mappings.append(dims_mapping[-(idx + 1)]) + if compute_compatible_dim_mapping(dim_mappings) 
is None: + return False return True def is_auto_compatible(self, dist_op): @@ -94,19 +102,26 @@ def is_auto_compatible(self, dist_op): return False op_dist_attr = dist_op.dist_attr dims_mapping_list = [] + input_arg_names = op_desc.input_arg_names() - max_dims_mapping_len = -1 + input_max_dims_mapping_len = -1 for arg_name in input_arg_names: dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if max_dims_mapping_len < len(dims_mapping): - max_dims_mapping_len = len(dims_mapping) + if input_max_dims_mapping_len < len(dims_mapping): + input_max_dims_mapping_len = len(dims_mapping) dims_mapping_list.append(dims_mapping) + output_arg_names = op_desc.output_arg_names() + output_max_dims_mapping_len = -1 for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - assert len(dims_mapping) == max_dims_mapping_len + if output_max_dims_mapping_len < len(dims_mapping): + output_max_dims_mapping_len = len(dims_mapping) dims_mapping_list.append(dims_mapping) + assert input_max_dims_mapping_len == output_max_dims_mapping_len + max_dims_mapping_len = input_max_dims_mapping_len + for idx in range(max_dims_mapping_len): dim_mappings = [] for dims_mapping in dims_mapping_list: @@ -121,35 +136,58 @@ def update_dims_mapping(self, dist_op): changed = False op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr + dims_mapping_list = [] + input_arg_names = op_desc.input_arg_names() input_dims_mapping_dict = {} input_dims_mapping_lens = {} - max_dims_mapping_len = -1 + input_max_dims_mapping_len = -1 for arg_name in input_arg_names: dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if max_dims_mapping_len < len(dims_mapping): - max_dims_mapping_len = len(dims_mapping) + if input_max_dims_mapping_len < len(dims_mapping): + input_max_dims_mapping_len = len(dims_mapping) input_dims_mapping_dict[arg_name] = dims_mapping input_dims_mapping_lens[arg_name] = len(dims_mapping) - - dims_mapping_list = [] for arg_name in input_arg_names: - if input_dims_mapping_lens[arg_name] < max_dims_mapping_len: - new_dims_mapping = [-1 for _ in range(max_dims_mapping_len)] + if input_dims_mapping_lens[arg_name] < input_max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(input_max_dims_mapping_len) + ] for i in range(input_dims_mapping_lens[arg_name]): - new_idx = (max_dims_mapping_len - + new_idx = (input_max_dims_mapping_len - input_dims_mapping_lens[arg_name]) + i new_dims_mapping[new_idx] = input_dims_mapping_dict[ arg_name][i] dims_mapping_list.append(new_dims_mapping) else: dims_mapping_list.append(input_dims_mapping_dict[arg_name]) + output_arg_names = op_desc.output_arg_names() + output_dims_mapping_dict = {} + output_dims_mapping_lens = {} + output_max_dims_mapping_len = -1 for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - assert len(dims_mapping) == max_dims_mapping_len - dims_mapping_list.append(dims_mapping) + if output_max_dims_mapping_len < len(dims_mapping): + output_max_dims_mapping_len = len(dims_mapping) + output_dims_mapping_dict[arg_name] = dims_mapping + output_dims_mapping_lens[arg_name] = len(dims_mapping) + for arg_name in output_arg_names: + if output_dims_mapping_lens[arg_name] < output_max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(output_max_dims_mapping_len) + ] + for i in range(output_dims_mapping_lens[arg_name]): + new_idx = (output_max_dims_mapping_len - + output_dims_mapping_lens[arg_name]) + i + new_dims_mapping[new_idx] = output_dims_mapping_dict[ + 
arg_name][i] + dims_mapping_list.append(new_dims_mapping) + else: + dims_mapping_list.append(output_dims_mapping_dict[arg_name]) + assert input_max_dims_mapping_len == output_max_dims_mapping_len + max_dims_mapping_len = input_max_dims_mapping_len compatible_dims_mapping = compute_compatible_dims_mapping( dims_mapping_list) if compatible_dims_mapping is None: @@ -175,11 +213,24 @@ def update_dims_mapping(self, dist_op): changed = True for arg_name in output_arg_names: - dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if compatible_dims_mapping != dims_mapping: - op_dist_attr.set_output_dims_mapping(arg_name, - compatible_dims_mapping) - changed = True + if output_dims_mapping_lens[arg_name] < max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(output_dims_mapping_lens[arg_name]) + ] + for i in range(output_dims_mapping_lens[arg_name]): + new_idx = (max_dims_mapping_len - + output_dims_mapping_lens[arg_name]) + i + new_dims_mapping[i] = compatible_dims_mapping[new_idx] + if new_dims_mapping != output_dims_mapping_dict[arg_name]: + op_dist_attr.set_output_dims_mapping(arg_name, + new_dims_mapping) + changed = True + else: + if compatible_dims_mapping != output_dims_mapping_dict[ + arg_name]: + op_dist_attr.set_output_dims_mapping( + arg_name, compatible_dims_mapping) + changed = True return changed diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py new file mode 100644 index 0000000000000..755dcab4be34f --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl, is_parameter_related +from ..utils import is_dim_shard +from ..utils import is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from ..utils import set_dist_op_desc_original_id +from ..dist_attribute import OperatorDistributedAttribute +from paddle.fluid import core, unique_name +from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import Program, Parameter, Variable, program_guard +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY +from ..process_group import new_process_group +from ..utils import _get_comm_group, _get_corresponding_rank + + +class DistributedReducePrimtive(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedReducePrimtive, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedReducePrimtive("reduce_p")) + + +# Batch Dimension Reduce Primitive +class DistributedReducePrimtiveImpl0(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedReducePrimtiveImpl0, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + + return len(op_desc.input_arg_names()) == 1 + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + outputs = op_desc.output_arg_names() + + if len(outputs) != 1: + return False + + output_name = outputs[0] + output_var = dist_op.serial_op.block.var(output_name) + if output_var.shape != (1, ): + return False + + return True + + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + + return self.is_input_compatible(dist_op) and self.is_output_compatible( + dist_op) + + def update_dims_mapping(self, dist_op): + changed = False + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + # replicate op in dist program + dist_op_desc = main_block.append_op(type='nop').desc + dist_op_desc.copy_from(src_op.desc) + set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) + for 
input_name in src_op.desc.input_names(): + dist_op_desc.set_input(input_name, kwargs[input_name]) + for output_name in src_op.desc.output_names(): + dist_op_desc.set_output(output_name, kwargs[output_name]) + + # batch dimension synchronization + var_name = src_op.output_arg_names[0] + sync_group = new_process_group(ctx.data_parallel_group) + allreduce_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [var_name]}, + outputs={'Out': [var_name]}, + attrs={ + 'ring_id': sync_group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + + # dist attr + var = main_block.var(var_name) + tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(var) + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + new_op_attr = OperatorDistributedAttribute() + new_op_attr.process_mesh = op_dist_attr.process_mesh + new_op_attr.set_output_dims_mapping(var.name, + tensor_dist_attr.dims_mapping) + new_op_attr.set_input_dims_mapping(var.name, + tensor_dist_attr.dims_mapping) + ctx.set_op_dist_attr_for_program(allreduce_op, new_op_attr) + + @staticmethod + def backward(ctx, *args, **kwargs): + raise RuntimeError( + "primitive operator does NOT have backward function, op type: {}". + format(str(op.type))) + + +register_distributed_operator_impl( + "reduce_p", DistributedReducePrimtiveImpl0("batch_dimension_reduce_p")) diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py new file mode 100644 index 0000000000000..6a94bbd3130b9 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
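+
+# This new module factors the per-rank parallelization steps out of the old
+# Engine._parallel_program path: generate the backward pass, apply the
+# pre-optimization passes (AMP/FP16, recompute), partition the serial program,
+# generate the optimizer ops, reshard, and finally apply the post-optimization
+# passes (sharding, gradient merge). The resulting per-rank programs are
+# stored on the DistributedContext.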
+ +import copy +from collections import defaultdict + +from paddle.fluid import program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.passes import new_pass + +from .reshard import Resharder +from .partitioner import Partitioner +from .dist_op import DistributedOperator +from .dist_saver import DistributedSaver +from .dist_loader import NonIterableGeneratorLoader +from .utils import make_data_unshard, set_grad_var_shape +from .utils import print_program_with_dist_attr, to_list +from .process_group import get_all_process_groups, get_world_process_group +from .dist_context import DistributedContext, get_default_distributed_context + + +class Parallelizer: + def __init__(self, mode, completer, dist_context): + self._mode = mode + self._completer = completer + self._dist_context = dist_context + self._dist_context.initialize() + self._pass_context = self._dist_context.pass_context + self._strategy = self._dist_context.strategy + + def parallel_all(self): + world_process_group = get_world_process_group() + all_ranks = world_process_group.ranks + for rank in all_ranks: + self.parallel(rank) + + def parallel(self, rank): + serial_main_program = self._dist_context.serial_main_program + serial_startup_program = self._dist_context.serial_startup_program + serial_optimizer = self._dist_context.serial_optimizer + if self._mode == "train" and serial_optimizer: + # Generate backward + serial_loss = self._dist_context.serial_fetch_vars["loss"][0] + params_grads = self._generate_backward( + serial_main_program, serial_startup_program, serial_loss) + # Apply pre optimization passes + self._apply_pre_optimization(serial_main_program, + serial_startup_program, serial_loss, + serial_optimizer, params_grads) + # Do logical partition + partitioner = Partitioner(self._dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + serial_main_program, serial_startup_program, params_grads) + # Generate optimizer + self._generate_optimizer(dist_main_prog, dist_startup_prog, + serial_optimizer, dist_params_grads) + # Do reshard process + set_grad_var_shape(dist_main_prog, self._dist_context) + resharder = Resharder(dist_main_prog, dist_startup_prog, rank, + self._dist_context, dist_params_grads) + resharder.reshard() + # Apply post optimization passes + self._apply_post_optimization(dist_main_prog, dist_startup_prog, + rank, dist_params_grads) + else: + # Apply pre optimization passes + self._apply_pre_optimization( + serial_main_program, serial_startup_program, None, None, None) + # Do logical partition + partitioner = Partitioner(self._dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + serial_main_program, serial_startup_program, []) + # Do reshard process + resharder = Resharder(dist_main_prog, dist_startup_prog, rank, + self._dist_context, [], 1) + resharder.reshard() + + # Clone program for test + if self._mode != 'train': + dist_main_prog = dist_main_prog.clone(for_test=True) + dist_startup_prog = dist_startup_prog.clone(for_test=True) + + # Store the distributed programs for further usages + self._dist_context.dist_main_programs[rank] = dist_main_prog + self._dist_context.dist_startup_programs[rank] = dist_startup_prog + + def _generate_backward(self, main_program, startup_program, loss): + with program_guard(main_program, startup_program): + params_grads = append_backward( + loss, distop_context=self._dist_context.dist_op_context) + 
self._completer.complete_backward_annotation(main_program) + self._dist_context.block_state.parse_backward_blocks(main_program) + return params_grads + + def _generate_optimizer(self, main_program, startup_program, optimizer, + params_grads): + with program_guard(main_program, startup_program): + optimizer_ops = copy.deepcopy(optimizer).apply_gradients( + params_grads) + self._completer.complete_update_annotation(main_program) + return optimizer_ops + + def _apply_pre_optimization(self, main_program, startup_program, loss, + optimizer, params_grads): + if self._strategy is None: + return + # apply amp pass + if self._strategy.amp: + config = copy.deepcopy(self._strategy.amp_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["loss"] = loss + config["input_data"] = self._dist_context.serial_feed_vars["inputs"] \ + + self._dist_context.serial_feed_vars["labels"] + if config["use_pure_fp16"]: + config["base_opt"] = optimizer + auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) + auto_parallel_fp16_pass.apply( + [main_program], [startup_program], self._pass_context) + else: + auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) + auto_parallel_amp_pass.apply([main_program], [startup_program], + self._pass_context) + + # apply recompute pass + if self._strategy.recompute: + config = copy.deepcopy(self._strategy.recompute_configs) + config["dist_context"] = self._dist_context + config["no_grad_set"] = None + config["loss"] = loss + auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", + config) + auto_parallel_recompute_pass.apply( + [main_program], [startup_program], self._dist_context) + + def _apply_post_optimization(self, main_program, startup_program, rank, + params_grads): + if self._strategy is None: + return + if self._strategy.sharding: + config = copy.deepcopy(self._strategy.sharding_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["global_rank"] = rank + auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", + config) + auto_parallel_sharding_pass.apply( + [main_program], [startup_program], self._dist_context) + + if self._strategy.gradient_merge: + config = copy.deepcopy(self._strategy.gradient_merge_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + auto_parallel_gradient_merge_pass = new_pass( + "auto_parallel_gradient_merge_pass", config) + auto_parallel_gradient_merge_pass.apply( + [main_program], [startup_program], self._dist_context) diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index fe091cd08b72b..91a31dd1b922e 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -263,6 +263,11 @@ def partition_block(self, ref_block, target_block): dist_op_backward_impl.backward( self._dist_context, **kinputs, **koutputs, **{"grad_var_to_var": grad_var_to_var}) + elif int(op.attr('op_role')) == 2: + kinputs, koutputs = dist_op_context.prepare_context(op) + dist_op_impl = get_distributed_operator_impl_container( + "default").get_impl(0) + dist_op_impl.backward(self._dist_context, **kinputs, **koutputs) else: raise NotImplementedError( "partitioner only support forward op and backward op, but got {}". 
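For reference, the new Planner and Parallelizer are driven from Engine.prepare roughly as follows. This is a condensed sketch of the calls shown in the engine diff above, with dist_ctx, mode, and cur_rank standing in for the engine's own attributes (the variable names are illustrative, not from the patch):

# Condensed sketch of the prepare-time flow; variable names are illustrative.
planner = Planner(mode, dist_ctx)
planner.plan()                                   # forward completion + forward block parsing

parallelizer = Parallelizer(mode, planner.completer, dist_ctx)
parallelizer.parallel(cur_rank)                  # backward, partition, optimizer, reshard, passes

# The per-rank programs are then published on the DistributedContext.
dist_main_prog = dist_ctx.dist_main_programs[cur_rank]
dist_startup_prog = dist_ctx.dist_startup_programs[cur_rank]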
diff --git a/python/paddle/distributed/auto_parallel/planner.py b/python/paddle/distributed/auto_parallel/planner.py index 73df0da10339e..b97c09bd59da8 100755 --- a/python/paddle/distributed/auto_parallel/planner.py +++ b/python/paddle/distributed/auto_parallel/planner.py @@ -35,7 +35,6 @@ from .dist_context import DistributedContext, DistributedOperatorContext from .dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute -paddle.enable_static() paddle.seed(123) random.seed(123) np.random.seed(123) diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py new file mode 100755 index 0000000000000..7db17e98d07ee --- /dev/null +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .completion import Completer +from .dist_context import get_default_distributed_context +from .utils import print_program_with_dist_attr + + +class Planner: + def __init__(self, mode, dist_context): + self._mode = mode + self._dist_context = dist_context + + # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need + # dependency of backward-forward ops in forward completion. + default_ctx = get_default_distributed_context() + self._dist_context._dist_op_context = default_ctx.dist_op_context + self._dist_context.initialize() + + self._completer = Completer(self._dist_context) + + @property + def completer(self): + return self._completer + + def plan(self): + self._completer.complete_forward_annotation() + # parse forward sub block + self._dist_context.block_state.parse_forward_blocks( + self._dist_context.serial_main_program) + # TODO: add the auto searcher diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 471448b031dde..d1b6e57ddc123 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -156,6 +156,6 @@ def __str__(self): # Note that Process group 0 is reserved for representing all ranks. -# At the begining, group 0 is empty and new ranks will be added automatically. +# At the beginning, group 0 is empty and new ranks will be added automatically. 
_g_process_group_map = {} _g_process_group_map[0] = ProcessGroup(0, []) diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 3df4ef91122a7..7481ec736f09e 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -513,6 +513,8 @@ def remove_no_need_vars(auto_parallel_main_prog, dist_params_grads): idx += 1 for var in remove_vars: + if block.vars[var].is_data: + continue block._remove_var(var) @staticmethod diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index ac07b49f45c3b..fbe3a43a7917a 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1101,6 +1101,10 @@ def is_loss_op(op): int(op.all_attrs()[OP_ROLE_KEY]) == (int(core.op_proto_and_checker_maker.OpRole.Forward) | int(core.op_proto_and_checker_maker.OpRole.Loss)) +def is_prim_op(op): + return op.type.endswith("_p") + + def get_loss_op(block): loss_ops = [] for op in block.ops: @@ -1118,6 +1122,9 @@ def set_var_dist_attr(dist_context, var, dims_mapping, process_mesh, **kwargs): tensor_dist_attr.dims_mapping = dims_mapping # TODO get global mesh group tensor_dist_attr.process_mesh = process_mesh + if "mark_annotated" in kwargs and kwargs["mark_annotated"]: + tensor_dist_attr.mark_annotated("dims_mapping") + tensor_dist_attr.mark_annotated("process_mesh") dist_context.set_tensor_dist_attr_for_program(var, tensor_dist_attr) return tensor_dist_attr diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index e33a3dba669ab..5f481bd0dca41 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -226,9 +226,15 @@ def _new_process_group_impl(backend, world_size, group_name, pg_options, - group_id=0): + group_id=0, + src_rank=None, + dst_rank=None): pg = None genv = _get_global_env() + if backend != 'heter': + assert src_rank is None and dst_rank is None, ( + "src_rank and dst_rank " + "can only be set for heter backend.") assert backend in _valid_backend_list, "Unsupported backend: %s." % backend if backend == "gloo": place = core.CPUPlace() @@ -269,7 +275,9 @@ def _new_process_group_impl(backend, gloo_rank=cluster_id, gloo_size=len(cluster_size), with_switch=True, - switch_endpoint=switch_ep) + switch_endpoint=switch_ep, + src_rank=src_rank, + dst_rank=dst_rank) return pg @@ -322,6 +330,17 @@ def barrier(group=None): attrs={'ring_id': ring_id}) +# _custom_gid provides a way for users to +# set the group id, which is usually useful +# to be compatible with the static mode. 
+_custom_gid = None + + +def _set_custom_gid(gid): + global _custom_gid + _custom_gid = gid + + def new_group(ranks=None, backend=None): """ @@ -345,12 +364,13 @@ def new_group(ranks=None, backend=None): paddle.distributed.all_reduce(tindata, group=gp, use_calc_stream=False) """ + global _custom_gid global _group_map if in_dygraph_mode(): global _default_group_name - gid = _new_ring_id() + gid = _custom_gid if _custom_gid else _new_ring_id() group_name = _default_group_name + str(gid) - if ranks is None or len(ranks) > 1: + if backend != 'heter' and (ranks is None or len(ranks) > 1): global_group = _get_default_group() global_rank = global_group.rank global_ranks = global_group.ranks @@ -362,8 +382,10 @@ def new_group(ranks=None, backend=None): "equal to that of the default global group.") size = len(ranks) ranks = sorted(ranks) - if size > 1 and global_rank in ranks: - rank = ranks.index(global_rank) + if backend == 'heter' or (size > 1 and global_rank in ranks): + rank = 0 if backend == 'heter' else ranks.index(global_rank) + src_rank = ranks[0] if backend == 'heter' else None + dst_rank = ranks[1] if backend == 'heter' else None pg = _new_process_group_impl( backend, _default_store, @@ -371,7 +393,9 @@ def new_group(ranks=None, backend=None): size, group_name, pg_options=None, - group_id=gid) + group_id=gid, + src_rank=src_rank, + dst_rank=dst_rank) else: rank = -1 pg = None @@ -379,6 +403,11 @@ def new_group(ranks=None, backend=None): _group_map_by_name[group_name] = group _group_map[gid] = group + # TODO(shenliang03): This is a temporary solution to solve the problem of + # hang caused by tcp + tmp = paddle.to_tensor([1], dtype="int32") + paddle.distributed.all_reduce(tmp, group=group, use_calc_stream=True) + paddle.distributed.wait(tmp) return group if not backend: diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 9d20e432d8961..414edb9b66d8d 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -534,7 +534,7 @@ def fleet_desc_configs(self, configs): support_sparse_accessor_class = [ 'DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', - 'DownpourDoubleUnitAccessor' + 'DownpourDoubleUnitAccessor', 'DownpourCtrDymfAccessor' ] from google.protobuf.descriptor import FieldDescriptor table_param = self.strategy.downpour_table_param @@ -616,6 +616,8 @@ def set_sparse_table_config(table_data, config): if accessor_class.find("Double") >= 0: table_data.accessor.accessor_class = 'CtrDoubleAccessor' + elif accessor_class.find("Dymf") >= 0: + table_data.accessor.accessor_class = 'CtrDymfAccessor' else: table_data.accessor.accessor_class = 'CtrCommonAccessor' @@ -1168,9 +1170,9 @@ def sharding_configs(self): dp_degree(int, optional): specific the number of data parallelism group; when dp_degree >= 2, it will introduce dp_degree ways data parallelism as the outer parallelsim for the inner parallelsim. User is responsible to ensure global_world_size = mp_degree * sharding_degree * pp_degree * dp_degree. Default is 1. - mp_degree(int, optional): [Hybrid parallelism ONLY] specific the the number of gpus within each megatron parallelism group; and megatron parallelism will turn be off if mp_degree=1. Default is 1. 
+ mp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each megatron parallelism group; and megatron parallelism will turn be off if mp_degree=1. Default is 1. - pp_degree(int, optional): [Hybrid parallelism ONLY] specific the the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. + pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. This configuration will affect the communication speed of Hybrid parallelism training depeneded on network topology. this strategy is experimental by now.. Default is False. @@ -1485,7 +1487,7 @@ def localsgd_configs(self): **Notes**: k_steps(int) The local steps for training before parameter synchronization. Default 1. - begin_step(int) The step of begining training by localsgd. Default 1. + begin_step(int) The step of beginning training by localsgd. Default 1. Examples: @@ -1544,7 +1546,7 @@ def adaptive_localsgd_configs(self): init_k_steps(int) The initial steps for training before adaptive localsgd. Then, the adaptive localsgd method will modify init_k_steps automatically. Default 1. - begin_step(int) The step of begining training by adaptive localsgd. Default 1. + begin_step(int) The step of beginning training by adaptive localsgd. Default 1. Examples: diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index c5a9df50589cc..343cca7f4f0d3 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -556,7 +556,7 @@ def launch(): - ``--selected_mlus``: mlus aliases, recommend to use ``--mlus``. - - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` - ``training_script_args``: The args of training_script. 
e.g., ``--lr=0.1`` diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py index 8f1a4de86de0d..3a52041dc7e2c 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py @@ -1372,7 +1372,7 @@ def _apply(self): max_v = self.op.attr("max") seed = self.op.attr("seed") dtype = self.op.attr("dtype") - assert max_v > min_v, "assert max_v > min_v, but recieved " + \ + assert max_v > min_v, "assert max_v > min_v, but received " + \ "as max_v={}, min_v={} ".format(max_v, min_v) tensor1 = self._create_ge_tensor([len(shape)], 2, shape) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index b7edf5830025d..d487f35324df9 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -170,7 +170,7 @@ def minimize(self, result = self._inner_optimizer.minimize(loss, startup_program, parameters, no_grad_set) - # sync parameters accross sharding ranks + # sync parameters across sharding ranks self._sharding_sync_parameters() return result @@ -181,7 +181,7 @@ def step(self): # actually updating self._inner_optimizer.step() - # sync parameters accross sharding ranks + # sync parameters across sharding ranks self._sharding_sync_parameters() # TODO is it a good way to make _grad_clip a property diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index c4d42f90615fc..90440ff9d0ea9 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -138,9 +138,16 @@ def _get_hybrid_degree(self): if pp_degree > 1: assert strategy.pipeline is True - assert global_world_size == mp_degree * sharding_degree * pp_degree * dp_degree, \ - "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( - global_world_size, mp_degree, sharding_degree, pp_degree, dp_degree) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + assert pp_degree == 2, ("For manually set pipeline, only " + "pp_degree = 2 is supported.") + assert global_world_size == mp_degree * sharding_degree * dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], dp_degree [{}].".format( + global_world_size, mp_degree, sharding_degree, dp_degree) + else: + assert global_world_size == mp_degree * sharding_degree * pp_degree * dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( + global_world_size, mp_degree, sharding_degree, pp_degree, dp_degree) # FIXME (JZ-LIANG) deprecated hybrid_dp if sharding_configs["hybrid_dp"]: @@ -268,7 +275,11 @@ def _inner_opt_minimize(self, loss, startup_program, parameter_list, if self.pp_degree > 1: startup_program = startup_program._pipeline_opt['startup_program'] print("pp_rank:", self.pp_rank) - main_program = program_list[self.pp_rank] + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + main_program = program_list[int( + os.getenv("PADDLE_MANUAL_PIPELINE_STAGE"))] + else: + main_program = 
program_list[self.pp_rank] with open("main_%d" % self.role_maker._worker_index(), 'w') as f: f.writelines(str(main_program)) main_block = main_program.global_block() @@ -633,14 +644,15 @@ def _init_pair_comm(self, pair, ring_id): self.pp_group_endpoints[pair[1]], ] pp_rank = 0 if self.pp_rank == pair[0] else 1 - self._collective_helper._init_communicator( - self._startup_program, - self.current_endpoint, - pp_group_endpoints, - pp_rank, - ring_id, - False, - sync=False) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None) is None: + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + pp_group_endpoints, + pp_rank, + ring_id, + False, + sync=False) def _init_npu_pipeline_comm(self, startup_block): # NOTE(wangxi): some bug with hccl, must set pp_degree be even number @@ -714,14 +726,15 @@ def _init_npu_pipeline_comm(self, startup_block): def _init_pipeline_comm(self, startup_block): # TODO (JZ-LIANG) to unify pp_rank_ and pp_rank - self._collective_helper._init_communicator( - self._startup_program, - self.current_endpoint, - self.pp_group_endpoints, - self.pp_rank, - self.pp_ring_id, - False, - sync=False) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None) is None: + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + self.pp_group_endpoints, + self.pp_rank, + self.pp_ring_id, + False, + sync=False) if core.is_compiled_with_npu(): self._init_npu_pipeline_comm(startup_block) @@ -1387,17 +1400,27 @@ def _build_groups(self): # NOTE (JZ-LIANG) support outter-pure-dp to scale the throughput in 3D parallelism # e.g. mp-sharding-pp-dp # sharding-hybrid-dp as one senario of outter-pure-dp - assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format( - self.mp_degree, self.sharding_degree, self.pp_degree, - self.dp_degree, self.global_word_size) + local_pp_degree = self.pp_degree + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + assert self.pp_degree == 2, ("For manually set pipeline, only " + "pp_degree = 2 is supported.") + assert self.global_word_size == self.mp_degree * self.sharding_degree * self.dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], dp_degree [{}].".format( + self.global_word_size, self.mp_degree, self.sharding_degree, self.dp_degree) + local_pp_degree = 1 + else: + assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format( + self.mp_degree, self.sharding_degree, self.pp_degree, + self.dp_degree, self.global_word_size) if self.dp_degree > 1: self.dp_ring_id = 2 - self.dp_rank = self.global_rank // (self.sharding_degree * - self.mp_degree * self.pp_degree) + self.dp_rank = self.global_rank // ( + self.sharding_degree * self.mp_degree * local_pp_degree) dp_first_rank_idx = self.global_rank % ( - self.sharding_degree * self.mp_degree * self.pp_degree) - dp_offset = (self.sharding_degree * self.mp_degree * self.pp_degree) + self.sharding_degree * self.mp_degree * local_pp_degree) + dp_offset = (self.sharding_degree * self.mp_degree * + local_pp_degree) self.dp_group_endpoints = [] for i in range(self.dp_degree): self.dp_group_endpoints.append(self.global_endpoints[ diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py 
b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index b6698a200e945..de36f8503a651 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -18,6 +18,7 @@ import numpy as np from paddle import _C_ops import paddle.fluid.core as core +from paddle.fluid.framework import _in_legacy_dygraph, _non_static_mode, in_dygraph_mode _hcg = None _use_cache = False @@ -148,9 +149,15 @@ def set_send_message(self, tensor): def _is_valid_send_recv_partial(tensor, mp_degree): - tensor_numel = np.prod(tensor.shape) - assert tensor_numel != 0, "can't send/recv zero element" - return mp_degree > 1 and tensor_numel % mp_degree == 0 + + if _in_legacy_dygraph(): + tensor_numel = np.prod(tensor.shape) + assert tensor_numel != 0, "can't send/recv zero element" + return mp_degree > 1 and tensor_numel % mp_degree == 0 + elif in_dygraph_mode(): + # TODO(shenliang03) support mp+pp optimizer in future. + # (partial_send/partial_recv/partial_allgather_) + return False def send_partial(tensor, diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 59bcf50ffb798..6c8badd64e161 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -19,7 +19,7 @@ from paddle import _C_ops from paddle.autograd import PyLayer, EagerPyLayer from paddle.fluid import framework -from ...utils.recompute import check_recompute_necessary, detach_variable +from ...utils.recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker from ..parallel_layers.random import get_rng_state_tracker from paddle.fluid.framework import in_dygraph_mode @@ -151,20 +151,6 @@ def _merge_activation(tensor): return _all_gather(tensor, group=mp_group) -@contextlib.contextmanager -def _swith_rng_state_tracker(rng_state, tracker): - orig_cuda_rng_state = paddle.get_cuda_rng_state() - orig_cuda_rng_tracker = get_rng_state_tracker().get_states_tracker() - - paddle.set_cuda_rng_state(rng_state) - get_rng_state_tracker().set_states_tracker(tracker) - try: - yield - finally: - paddle.set_cuda_rng_state(orig_cuda_rng_state) - get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker) - - class _HPEagerRecomputeFunction(EagerPyLayer): """ Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: @@ -261,8 +247,8 @@ def backward(ctx, *args): tracer._has_grad = True # need restore auto_cast state as well as w/b list - with _swith_rng_state_tracker(ctx.fwd_cuda_rng_state, - ctx.fwd_cuda_rng_state_tracker): + with swith_rng_state_tracker(ctx.fwd_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, @@ -393,8 +379,8 @@ def backward(ctx, *args): tracer._has_grad = True # need restore auto_cast state as well as w/b list - with _swith_rng_state_tracker(ctx.fwd_cuda_rng_state, - ctx.fwd_cuda_rng_state_tracker): + with swith_rng_state_tracker(ctx.fwd_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 049d3ffa3694f..e44b5d2515d83 100644 --- 
a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -205,7 +205,7 @@ def _clear_gradients(self): for param in list(self._unslice_params): param.clear_gradient(False) tmp_var = param.cuda(DEV_ID) - param._clear_data() + if tmp_var.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_var = paddle.cast(tmp_var, Type.fp16.value) @@ -272,6 +272,8 @@ def _handle_unslice_params(self): master_tensor = paddle.cast(param, Type.fp32.value) master_tensor.name = param.name self._optim._master_weights[param.name] = master_tensor + if self._offload: + param.master_weight = paddle.cast(param, Type.fp32.value).cpu() param2dtype[param.name] = param.dtype p_align = self._param2align(param) self._unslice_params2align[param.name] = p_align @@ -369,7 +371,6 @@ def _param_storage(self, param, buffer_size): tmp_var.get_tensor().set(param_cpu.get_tensor(), core.CPUPlace()) del tmp_var param.get_tensor()._set_dims(param_shape) - param._clear_data() # Current rank param_storage if self._offload: @@ -379,6 +380,9 @@ def _param_storage(self, param, buffer_size): value=tmp_tensor, place=core.CPUPlace(), name="slice@" + param.name) + with device_guard(): + param.master_weight = paddle.cast(param.fw_storage, + Type.fp32.value) else: param.fw_storage = core.eager.Tensor( value=buffer._slice(start, end), name="slice@" + param.name) @@ -389,6 +393,7 @@ def _param_storage(self, param, buffer_size): master_tensor = paddle.cast(param.fw_storage, Type.fp32.value) master_tensor.name = param.name self._optim._master_weights[param.fw_storage.name] = master_tensor + param._clear_data() def _register_forward_hooks(self, layer): """ @@ -480,9 +485,8 @@ def _update_params(self): collective.all_reduce(tensor=grad_storage.buffer, group=self._group) if self._offload: for param in list(self._unslice_params): - tmp_var = _device2cpu(param, convert_dtype=True) - tmp_var._share_buffer_to(param) - del tmp_var + param._clear_data() + param.master_weight._share_buffer_to(param) for grad_storage in self._grad_storages.values(): for p in grad_storage._params: @@ -568,7 +572,8 @@ def allreduce_(*_): del self._task_flow.full_param[param.name] if self._offload: - param.fw_storage = _device2cpu(param.fw_storage, True) + param.fw_storage._clear_data() + param.master_weight._share_buffer_to(param.fw_storage) return allreduce_ @@ -856,6 +861,7 @@ def _PartitionParam(param): if not hasattr(param, "fw_storage"): setattr(param, "fw_storage", None) setattr(param, "bw_storage", None) + setattr(param, "master_weight", None) setattr(param, "status", "all") setattr(param, "use_count", 0) return param @@ -864,6 +870,7 @@ def _PartitionParam(param): def _UnsliceParam(param): if not hasattr(param, "unslice"): setattr(param, "unslice", True) + setattr(param, "master_weight", None) return param diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index f96273cc84caf..7bb1517f12169 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -199,7 +199,7 @@ def _clear_gradients(self): param.clear_gradient(False) param._gradient_set_empty(False) tmp_var = param.cuda(DEV_ID) - param._clear() + if tmp_var.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_var = paddle.cast(tmp_var, 
Type.fp16.value) @@ -220,19 +220,14 @@ def _update_params_slice(self): self._optim._param_groups = slice_params + list( self._unslice_params) else: - params_name_list = list(map(lambda p: p.name, update_list)) - fw_storage_name_list = list( - map(lambda p: p.fw_storage.name, update_list)) for param_group in self._optim._param_groups: p_group = [] for p in param_group['params']: - if p.name in params_name_list: + if hasattr(p, "fw_storage"): p_group.append(p.fw_storage) - elif p.name in fw_storage_name_list: - p_group.append(update_list[fw_storage_name_list.index( - p.name)].fw_storage) - elif p in self._unslice_params: + else: p_group.append(p) + param_group['params'] = p_group def forward(self, *inputs, **kwargs): @@ -268,6 +263,8 @@ def _handle_unslice_params(self): if param.dtype == Type.fp16.value and not self._offload: self._optim._master_weights[param.name] = paddle.cast( param, Type.fp32.value) + if self._offload: + param.master_weight = paddle.cast(param, Type.fp32.value).cpu() param2dtype[param.name] = param.dtype p_align = self._param2align(param) self._unslice_params2align[param.name] = p_align @@ -335,11 +332,12 @@ def _add_manage_info(trainable_param): self._param2buffer[param.name].append( (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) - # 3.Flatten layer params and release other rank buffer - self._param_storage(param, buffer_size) # Record param's dtype param2dtype[param.name] = param.dtype + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + def _param_storage(self, param, buffer_size): """ This is a function to simplify the handling of parameter InternalStorages. @@ -365,13 +363,15 @@ def _param_storage(self, param, buffer_size): tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(), core.CPUPlace()) param.value().get_tensor()._set_dims(param_shape) - param._clear() # Current rank param_storage if self._offload: param.fw_storage = core.VarBase( buffer._slice(start, end), core.CPUPlace(), "slice@" + param.name) + with device_guard(device="cpu"): + param.master_weight = paddle.cast(param.fw_storage, + Type.fp32.value) else: param.fw_storage = core.VarBase( buffer._slice(start, end), "slice@" + param.name) @@ -381,6 +381,7 @@ def _param_storage(self, param, buffer_size): if param.dtype == Type.fp16.value and not self._offload: self._optim._master_weights[param.fw_storage.name] = paddle.cast( param.fw_storage, Type.fp32.value) + param._clear() def _register_forward_hooks(self, layer): """ @@ -482,9 +483,8 @@ def _update_params(self): if self._offload: for param in list(self._unslice_params): - tmp_var = _device2cpu(param, convert_dtype=True) - tmp_var._share_buffer_to(param) - tmp_var._clear() + param._clear() + param.master_weight._share_buffer_to(param) for grad_storage in self._grad_storages.values(): for p in grad_storage._params: @@ -553,8 +553,9 @@ def allreduce_(*_): cpu_grad = _device2cpu( core.VarBase(full_grad._slice(start, end)) .detach().clone(), True) - param.bw_storage = paddle.add(param.bw_storage, - cpu_grad) + with device_guard(device="cpu"): + param.bw_storage = paddle.add(param.bw_storage, + cpu_grad) else: # param.bw_storage.add_( # core.VarBase(full_grad._slice(start, end)) @@ -581,7 +582,8 @@ def allreduce_(*_): tmp_var._clear() if self._offload: - param.fw_storage = _device2cpu(param.fw_storage, True) + param.fw_storage._clear() + param.master_weight._share_buffer_to(param.fw_storage) return allreduce_ @@ -869,6 +871,7 @@ def _PartitionParam(param): if not hasattr(param, "fw_storage"): 
setattr(param, "fw_storage", None) setattr(param, "bw_storage", None) + setattr(param, "master_weight", None) setattr(param, "status", "all") setattr(param, "use_count", 0) return param @@ -877,6 +880,7 @@ def _PartitionParam(param): def _UnsliceParam(param): if not hasattr(param, "unslice"): setattr(param, "unslice", True) + setattr(param, "master_weight", None) return param diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index 1285e1f3323ff..d0b5c915e11cd 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -162,29 +162,36 @@ def sharding_reduce_gradients(parameter_list, hcg): sharding_nrank = hcg.get_sharding_parallel_group().nranks for param in parameter_list: if param.trainable and (param._grad_ivar() is not None): - - g_var = param._grad_ivar() - - # need use trace_op to allreduce - # paddle.distributed.all_reduce( - # g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True) - paddle.fluid.framework._dygraph_tracer().trace_op( - type="c_allreduce_sum", - inputs={'X': g_var}, - outputs={'Out': g_var}, - attrs={ - 'ring_id': hcg.get_sharding_parallel_group().id, - 'use_calc_stream': True - }) - - # grad / sharding_rank - div_factor = paddle.to_tensor(sharding_nrank, dtype=g_var.dtype) - paddle.fluid.framework._dygraph_tracer().trace_op( - type="elementwise_div", - inputs={'X': g_var, - 'Y': div_factor}, - outputs={'Out': g_var}, - attrs={'axis': -1}) + if in_dygraph_mode(): + param.grad.scale_(1.0 / sharding_nrank) + paddle.distributed.all_reduce( + param.grad, + group=hcg.get_sharding_parallel_group(), + use_calc_stream=True) + + elif _in_legacy_dygraph(): + g_var = param._grad_ivar() + # need use trace_op to allreduce + # paddle.distributed.all_reduce( + # g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="c_allreduce_sum", + inputs={'X': g_var}, + outputs={'Out': g_var}, + attrs={ + 'ring_id': hcg.get_sharding_parallel_group().id, + 'use_calc_stream': True + }) + + # grad / sharding_rank + div_factor = paddle.to_tensor( + sharding_nrank, dtype=g_var.dtype) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="elementwise_div", + inputs={'X': g_var, + 'Y': div_factor}, + outputs={'Out': g_var}, + attrs={'axis': -1}) def broadcast_sharding_parameters(model, hcg): diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index c767be77d8384..b8d1c881a08f9 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -53,18 +53,24 @@ def check_recompute_necessary(inputs): @contextlib.contextmanager -def swith_rng_state(rng_state): +def swith_rng_state_tracker(rng_state, tracker): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker orig_cuda_rng_state = paddle.get_cuda_rng_state() + orig_cuda_rng_tracker = get_rng_state_tracker().get_states_tracker() + paddle.set_cuda_rng_state(rng_state) + get_rng_state_tracker().set_states_tracker(tracker) try: yield finally: paddle.set_cuda_rng_state(orig_cuda_rng_state) + get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker) class EagerRecomputeFunction(EagerPyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): + from 
paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker if framework._dygraph_tracer()._has_grad: check_recompute_necessary(args) @@ -98,6 +104,8 @@ def forward(ctx, run_function, preserve_rng_state, *args): "Recompute with RNG perserve is not support current device: {}.". format(cur_device)) ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( + ).get_states_tracker() # TODO support AMP tracer = framework._dygraph_tracer() @@ -126,6 +134,7 @@ def forward(ctx, run_function, preserve_rng_state, *args): @staticmethod def backward(ctx, *args): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker with paddle.fluid.dygraph.guard(): # TODO need to check the recompute calling is vaild or not @@ -143,7 +152,8 @@ def backward(ctx, *args): # NOTE support AMP # need restore auto_cast state as well as w/b list if ctx.preserve_rng_state: - with swith_rng_state(ctx.fw_cuda_rng_state): + with swith_rng_state_tracker(ctx.fw_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, @@ -199,6 +209,7 @@ def backward(ctx, *args): class RecomputeFunction(PyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker if framework._dygraph_tracer()._has_grad: check_recompute_necessary(args) @@ -232,6 +243,8 @@ def forward(ctx, run_function, preserve_rng_state, *args): "Recompute with RNG perserve is not support current device: {}.". format(cur_device)) ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( + ).get_states_tracker() # TODO support AMP tracer = framework._dygraph_tracer() @@ -260,6 +273,7 @@ def forward(ctx, run_function, preserve_rng_state, *args): @staticmethod def backward(ctx, *args): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker with paddle.fluid.dygraph.guard(): # TODO need to check the recompute calling is vaild or not @@ -277,7 +291,8 @@ def backward(ctx, *args): # NOTE support AMP # need restore auto_cast state as well as w/b list if ctx.preserve_rng_state: - with swith_rng_state(ctx.fw_cuda_rng_state): + with swith_rng_state_tracker(ctx.fw_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): with paddle.amp.auto_cast( enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index 08c8f0835c5e1..fbea5d0db869e 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -17,6 +17,7 @@ from .node import Node from .status import Status from .args_envs import parse_args, fetch_envs, env_args_mapping +import six import logging @@ -39,6 +40,12 @@ def __init__(self, enable_plugin=True): if enable_plugin: self._enable_plugin() + def print(self): + self.logger.info("----------- Configuration ----------------------") + for arg, value in sorted(six.iteritems(vars(self.args))): + self.logger.info("%s: %s" % (arg, value)) + self.logger.info("--------------------------------------------------") + def is_legacy_mode(self): if self.args.legacy: return True diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py 
index b624281e44db3..ea8bf3d597a79 100644 --- a/python/paddle/distributed/launch/context/args_envs.py +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -85,7 +85,7 @@ def parse_args(): base_group.add_argument( "--run_mode", type=str, - default="collective", + default=None, help="run mode of the job, collective/ps/ps-heter") base_group.add_argument( @@ -125,7 +125,7 @@ def parse_args(): ps_group.add_argument( "--gloo_port", type=int, default=6767, help="gloo http port") ps_group.add_argument( - "--with_gloo", type=str, default="0", help="use gloo or not") + "--with_gloo", type=str, default="1", help="use gloo or not") # parameter elastic mode elastic_group = parser.add_argument_group("Elastic Parameters") diff --git a/python/paddle/distributed/launch/controllers/__init__.py b/python/paddle/distributed/launch/controllers/__init__.py index 706131300f0d8..f1c6ea5399a46 100644 --- a/python/paddle/distributed/launch/controllers/__init__.py +++ b/python/paddle/distributed/launch/controllers/__init__.py @@ -29,4 +29,5 @@ def init(ctx): for c in _controllers: if c.enable(ctx): + ctx.print() return c(ctx) diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index 3763bac041451..5225fd6e81ff1 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .controller import Controller +from .controller import Controller, ControleMode import json import os @@ -23,8 +23,10 @@ class CollectiveController(Controller): @classmethod def enable(cls, ctx): + # collective is the default mode if ctx: ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.run_mode = ControleMode.COLLECTIVE return True else: return False @@ -85,6 +87,7 @@ def build_pod(self): "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas), "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset), "PADDLE_LOCAL_RANK": "{}".format(i), + "PADDLE_NNODES": "{}".format(self.job.replicas), ## compatible env "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints), "PADDLE_CURRENT_ENDPOINT": endpoints[i], @@ -106,6 +109,7 @@ class CollectiveElasticController(CollectiveController): def enable(cls, ctx): if ctx.args.master and ctx.args.master.startswith("etcd://"): ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.run_mode = ControleMode.COLLECTIVE return True else: return False diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 69b2237f0ba7d..f069bfbcd3501 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -21,6 +21,7 @@ from paddle.distributed.launch.job.container import Container from .master import Master +from .watcher import Watcher import time @@ -39,6 +40,8 @@ def __init__(self, ctx): self.ctx = ctx self.master = Master.factory(self.ctx) + self.watcher = Watcher(self.ctx) + self.job = Job(nnodes=self.ctx.args.nnodes, mode=self.ctx.args.run_mode, jid=self.ctx.args.job_id) @@ -114,6 +117,9 @@ def watch(self) -> bool: def stop(self, sigint=None): self.ctx.logger.debug("Controller stop") + + self.watcher.stop() + self.master.stop() self.pod.stop(sigint) diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py 
index 43eda4cdffa24..742fea9e16de7 100644 --- a/python/paddle/distributed/launch/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -276,10 +276,20 @@ def fetch_peer_alive(self): return peer_alive def wait_peer_ready(self, replicas_min, replicas_max, timeout): + timeout = timeout if timeout > 1 else 3 + end = time.time() + timeout + np_pre = len(self.fetch_peer_alive()) while not self.ctx.status.is_done() and time.time() < end: - if len(self.fetch_peer_alive()) == replicas_max: + np = len(self.fetch_peer_alive()) + if np == replicas_max: + # maximum replicas reached, return immediately return (True, replicas_max) + elif np != np_pre: + # replicas are changing, reset timeout + end = time.time() + timeout + np_pre = np + time.sleep(0.2) else: time.sleep(0.5) diff --git a/python/paddle/distributed/launch/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py index 6504f1240ee09..037bd313bbc03 100644 --- a/python/paddle/distributed/launch/controllers/ps.py +++ b/python/paddle/distributed/launch/controllers/ps.py @@ -171,6 +171,7 @@ def _build_pod_with_master(self): for i in range(server_num): e = { + "PADDLE_NNODES": "{}".format(self.job.replicas), "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), "PADDLE_PORT": @@ -186,6 +187,7 @@ def _build_pod_with_master(self): for i in range(trainer_num): e = { + "PADDLE_NNODES": "{}".format(self.job.replicas), "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), "PADDLE_PORT": diff --git a/python/paddle/distributed/launch/controllers/watcher.py b/python/paddle/distributed/launch/controllers/watcher.py new file mode 100644 index 0000000000000..4d49b924f1e81 --- /dev/null +++ b/python/paddle/distributed/launch/controllers/watcher.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
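# The Watcher defined in this new file runs a daemon thread that periodically
# samples GPU state through the nvsmi helpers and appends CSV rows to
# <job_id>.gpu.log until ctx.status.is_done() turns true; stop() simply joins
# the thread. Below is a condensed, framework-free sketch of the same
# poll-and-append pattern; sample_fn, is_done and the log path are
# hypothetical stand-ins for the nvsmi query and the launch context.
import os
import time
from threading import Thread


class ToyWatcher:
    def __init__(self, sample_fn, log_path, interval=10, is_done=lambda: False):
        self.sample_fn = sample_fn
        self.interval = interval
        self.is_done = is_done
        os.makedirs(os.path.dirname(log_path) or ".", exist_ok=True)
        self.fd = open(log_path, "w")
        self.proc = Thread(target=self._watch)
        self.proc.daemon = True  # never keep the launcher process alive
        self.proc.start()

    def _watch(self):
        self.fd.write("timestamp,value\n")  # CSV header, like the util_key row
        while not self.is_done():
            self.fd.write("{},{}\n".format(time.time(), self.sample_fn()))
            self.fd.flush()
            time.sleep(self.interval)
        self.fd.close()

    def stop(self):
        # the loop exits through is_done(); stop() only waits for it
        self.proc.join()


if __name__ == "__main__":
    import itertools
    ticks = itertools.count()
    w = ToyWatcher(lambda: 42, "toy.gpu.log", interval=0.1,
                   is_done=lambda: next(ticks) > 5)
    time.sleep(1)
    w.stop()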
+ +from ..utils.nvsmi import get_gpu_process, get_gpu_util, get_gpu_info +import time +import os + +from threading import Thread + + +class Watcher(object): + def __init__(self, ctx): + self.ctx = ctx + + self.interval = 10 + + self.gpu_util = [] + + # gpu log file + self.gpus = self.ctx.args.devices or self.ctx.node.device.labels + if len(self.gpus) > 0: + fn = os.path.join(self.ctx.args.log_dir, + "{}.gpu.log".format(self.ctx.args.job_id)) + os.makedirs(os.path.dirname(fn), exist_ok=True) + self.gpu_fd = open(fn, 'w') + else: + return + + # start + self.proc = Thread(target=self.watch) + self.proc.daemon = True + self.proc.start() + + def watch(self): + if not len(self.gpus) > 0: + return + + self._print_gpu_info() + + util_key = "index,utilization_gpu,memory_total,memory_used,memory_free,timestamp" + self.gpu_fd.write(util_key) + self.gpu_fd.write('\n') + + while not self.ctx.status.is_done(): + self._save_gpu_log(util_key) + time.sleep(self.interval) + + if hasattr(self, "gpu_fd"): + self.gpu_fd.close() + + def _print_gpu_info(self): + try: + info_key = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode" + self.gpu_fd.write(info_key) + self.gpu_fd.write('\n') + for line in get_gpu_info(self.gpus): + self.gpu_fd.write(line.str(info_key)) + self.gpu_fd.write('\n') + self.gpu_fd.write('\n') + + process_key = "pid,process_name,gpu_uuid,gpu_name,used_memory" + self.gpu_fd.write(process_key) + self.gpu_fd.write('\n') + for line in get_gpu_process(self.gpus): + self.gpu_fd.write(line.str(process_key)) + self.gpu_fd.write('\n') + self.gpu_fd.write('\n') + + self.gpu_fd.flush() + except: + self.ctx.log.error("save gpu info failed") + + def _save_gpu_log(self, util_key): + try: + for line in get_gpu_util(self.gpus): + self.gpu_fd.write(line.str(util_key)) + self.gpu_fd.write('\n') + self.gpu_fd.flush() + except: + self.ctx.log.error("save gpu log failed") + + def stop(self): + if hasattr(self, "proc"): + self.proc.join() diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index 400a447260252..b2c87e737c82d 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -54,7 +54,7 @@ def launch(): - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. - - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` - ``training_script_args``: The args of training_script. 
e.g., ``--lr=0.1`` diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py index 35a44ed942c20..13c09b4c27c26 100644 --- a/python/paddle/distributed/launch/plugins/__init__.py +++ b/python/paddle/distributed/launch/plugins/__init__.py @@ -17,6 +17,7 @@ __all__ = [] +# print configuration after args are well filled in controller init def log(ctx): ctx.logger.info("----------- Configuration ----------------------") for arg, value in sorted(six.iteritems(vars(ctx.args))): @@ -59,4 +60,4 @@ def rewrite_host_ip(ctx): ctx.node.ip = ctx.args.host -enabled_plugins = [collective_compatible, rewrite_host_ip, process_args, log] +enabled_plugins = [collective_compatible, rewrite_host_ip, process_args] diff --git a/python/paddle/distributed/launch/utils/nvsmi.py b/python/paddle/distributed/launch/utils/nvsmi.py new file mode 100644 index 0000000000000..82a23189ac6af --- /dev/null +++ b/python/paddle/distributed/launch/utils/nvsmi.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess +import shlex +import os +import json +import shutil + + +class Info(object): + def __repr__(self): + return str(self.__dict__) + + def json(self): + return json.dumps(self.__dict__) + + def dict(self): + return self.__dict__ + + def str(self, keys=None): + if keys is None: + keys = self.__dict__.keys() + + if isinstance(keys, str): + keys = keys.split(',') + + values = [str(self.__dict__.get(k, '')) for k in keys] + return ",".join(values) + + +def query_smi(query=None, query_type="gpu", index=None, dtype=None): + """ + query_type: gpu/compute + """ + + if not has_nvidia_smi(): + return [] + + cmd = ["nvidia-smi", "--format=csv,noheader,nounits"] + if isinstance(query, list) and query_type == "gpu": + cmd.extend(["--query-gpu={}".format(",".join(query))]) + elif isinstance(query, list) and query_type.startswith("compute"): + cmd.extend(["--query-compute-apps={}".format(",".join(query))]) + else: + return + + if isinstance(index, list) and len(index) > 0: + cmd.extend(["--id={}".format(",".join(index))]) + if not isinstance(dtype, list) or len(dtype) != len(query): + dtype = [str] * len(query) + + output = subprocess.check_output(cmd, timeout=3) + lines = output.decode("utf-8").split(os.linesep) + ret = [] + for line in lines: + if not line: + continue + info = Info() + for k, v, d in zip(query, line.split(", "), dtype): + setattr(info, k.replace(".", "_"), d(v)) + ret.append(info) + return ret + + +def get_gpu_info(index=None): + q = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode".split( + ",") + d = [int, str, str, str, str, str, str] + index = index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, dtype=d) + + +def get_gpu_util(index=None): + q = "index,utilization.gpu,memory.total,memory.used,memory.free,timestamp".split( + ",") + d = [int, int, int, int, int, str] + index = 
index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, dtype=d) + + +def get_gpu_process(index=None): + q = "pid,process_name,gpu_uuid,gpu_name,used_memory".split(",") + d = [int, str, str, str, int] + index = index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, query_type="compute", dtype=d) + + +def has_nvidia_smi(): + return shutil.which("nvidia-smi") + + +if __name__ == '__main__': + print(get_gpu_info(0)) + print(get_gpu_util(0)) + print(get_gpu_process(0)) + + u = get_gpu_util() + for i in u: + print(i.str()) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 53d35a251c8c8..8cd6c4647dce4 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -233,8 +233,13 @@ def train(): master_addr, master_port = endpoints.split(":") master_port = int(master_port) is_master = rank == 0 - default_store = core.TCPStore(master_addr, master_port, is_master, - world_size) + stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900")) + default_store = core.TCPStore( + master_addr, + master_port, + is_master, + world_size, + stop_check_timeout=stop_check_timeout) _set_default_store(default_store) pg = _new_process_group_impl( backend, diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 69c3eef7e3771..9dda310e5c022 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -306,7 +306,7 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, in_var_dist_attr = consume_op_attr.get_input_dist_attr( in_var.name) assert in_var_dist_attr is not None - # truely insert cast op + # truly insert cast op if cast_var is None or cast_var.dtype != dst_dtype: # NOTE we make the cast op and var's dist attr as the op that consume the # cast var instead of the op which generates the var diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py index 4a4e5ecbbb495..72525255b7eaa 100644 --- a/python/paddle/distributed/passes/cpp_pass.py +++ b/python/paddle/distributed/passes/cpp_pass.py @@ -13,6 +13,7 @@ # limitations under the License. 
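# The BuildCINNPass added to cpp_pass.py just below temporarily overrides the
# global flags FLAGS_allow_cinn_ops / FLAGS_deny_cinn_ops in core.globals()
# while the C++ pass runs, and restores the previous values in a finally block
# so the flags never leak. A generic sketch of that save-override-restore
# guard, written as a context manager over a toy GLOBALS dict (not Paddle's
# core.globals()):
from contextlib import contextmanager

GLOBALS = {"FLAGS_allow_cinn_ops": "", "FLAGS_deny_cinn_ops": ""}


@contextmanager
def override_flags(**new_values):
    old = {k: GLOBALS[k] for k in new_values}
    try:
        GLOBALS.update(new_values)
        yield
    finally:
        GLOBALS.update(old)  # restored even if the wrapped pass raises


if __name__ == "__main__":
    with override_flags(FLAGS_allow_cinn_ops="relu;elementwise_add"):
        print(GLOBALS["FLAGS_allow_cinn_ops"])  # relu;elementwise_add
    print(GLOBALS["FLAGS_allow_cinn_ops"])      # "" again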
from .pass_base import PassType, CPPPassWrapper, register_pass +from paddle.fluid.framework import core, _apply_pass as _apply_cpp_pass @register_pass("fuse_elewise_add_act") @@ -93,3 +94,35 @@ def cpp_name(self): def _type(self): return PassType.CALC_OPT + + +@register_pass("build_cinn") +class BuildCINNPass(CPPPassWrapper): + def __init__(self): + super(BuildCINNPass, self).__init__() + self.set_attr("allow_ops", []) + self.set_attr("deny_ops", []) + + @property + def cpp_name(self): + return "build_cinn_pass" + + def _type(self): + return PassType.CALC_OPT + + def _apply_single_impl(self, main_program, startup_program, context): + allow_ops = ";".join(self.get_attr("allow_ops")) + deny_ops = ";".join(self.get_attr("deny_ops")) + + assert 'FLAGS_allow_cinn_ops' in core.globals( + ), "PaddlePaddle is not compiled with CINN support" + old_allow_ops = core.globals()['FLAGS_allow_cinn_ops'] + old_deny_ops = core.globals()['FLAGS_deny_cinn_ops'] + try: + core.globals()['FLAGS_allow_cinn_ops'] = allow_ops + core.globals()['FLAGS_deny_cinn_ops'] = deny_ops + _apply_cpp_pass(main_program, startup_program, self.cpp_name, {}, + self.cpp_attr_types) + finally: + core.globals()['FLAGS_allow_cinn_ops'] = old_allow_ops + core.globals()['FLAGS_deny_cinn_ops'] = old_deny_ops diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 76e617c7dafcf..6112a9a1f45b6 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -375,12 +375,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if attrs['use_ps_gpu']: _program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -614,15 +614,24 @@ def _check_conflict(self, other_pass): return True def _add_push_box_sparse_op(self, program): + insert_index = -1 + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "lookup_table_grad": + insert_index = idx for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(set()), []) for op_desc in grad_op_desc: - new_op_desc = program.global_block().desc.append_op() + new_op_desc = program.global_block().desc._insert_op( + insert_index + 1) new_op_desc.copy_from(op_desc) new_op_desc._set_attr(op_role_attr_name, backward) + new_op = paddle.fluid.framework.Operator(program.global_block(), + new_op_desc) + program.global_block().ops.insert(insert_index + 1, new_op) + program.global_block()._sync_with_cpp() def _remove_optimizer_var(self, program): embedding_w = {} @@ -670,7 +679,7 @@ def _remove_lookup_table_grad_op_and_var(self, program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index c6df7559a22e8..888d517116a15 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1013,12 +1013,13 @@ def 
sync_strategy_envs(): if self.context['ps_mode'] == DistributedMode.GEO: self._communicator.init_params(init_params) else: - if role_id == 0: - self._init_all_params(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + if role_id == 0: + self._init_all_params(scopes, send_ctx, dense_map) fleet.util.barrier() - - self._pull_all_dense(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + self._pull_all_dense(scopes, send_ctx, dense_map) fleet.util.barrier() if self.context['ps_mode'] == DistributedMode.GEO: diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index e7edc6fd859a6..7acfd6cfe19f5 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -748,7 +748,7 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops): def union_forward_gradient_op(program_block_ops_list): """ before analyzing the input & output of each block in program_block_list, we should - union the forward op and corresponding gradient op to elimincate the uneccessary variable + union the forward op and corresponding gradient op to elimincate the unnecessary variable transmit """ """ diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 4adb19830522b..66545a8a249ba 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -74,7 +74,7 @@ def _py_supported_check(): def _options_valid_check(options): # `print_config` keeped as a debug options, not show to users supported_options = [ - 'start_method', 'ips', 'gpus', 'xpus', 'print_config', 'backend' + 'start_method', 'ips', 'gpus', 'xpus', 'mlus', 'print_config', 'backend' ] deprecated_options = [ 'selected_devices', 'started_port', 'cluster_node_ips', 'node_ip', @@ -99,6 +99,8 @@ def _get_default_nprocs(): return core.get_cuda_device_count() elif 'xpu' in device: return core.get_xpu_device_count() + elif 'mlu' in device: + return core.get_mlu_device_count() elif 'cpu' in device: return multiprocessing.cpu_count() else: @@ -113,6 +115,8 @@ def _get_default_backend(): return 'nccl' elif 'xpu' in device: return 'bkcl' + elif 'mlu' in device: + return 'cncl' elif 'cpu' in device: return 'gloo' else: @@ -232,6 +236,40 @@ def _get_subprocess_env_list(nprocs, options): raise ValueError("The selected xpu card %s cannot found in " "XPU_VISIBLE_DEVICES (%s)." % (card_id, ",".join(env_devices_list))) + elif options['backend'] == 'cncl': + args.selected_devices = options.get('mlus', None) + if args.selected_devices is None: + args.selected_devices = options.get('selected_devices', None) + env_devices = os.getenv("MLU_VISIBLE_DEVICES", None) + if env_devices is None or env_devices == "": + env_devices_list = [ + str(x) for x in six.moves.range(core.get_mlu_device_count()) + ] + else: + env_devices_list = env_devices.split(',') + if args.selected_devices is None: + if len(env_devices_list) < nprocs: + raise RuntimeError( + "the number of visible devices(%d) is less than the number " + "of spawn processes(%d), please ensure that the correct " + "`nprocs` argument is passed or the environment variable " + "`MLU_VISIBLE_DEVICES` is correctly configured." 
% + (len(env_devices_list), nprocs)) + args.selected_devices = ",".join( + [str(env_devices_list[x]) for x in range(0, nprocs)]) + else: + selected_device_list = args.selected_devices.split(',') + if len(selected_device_list) != nprocs: + raise ValueError( + "The number of selected devices(%s) is not equal to " + "the number of spawn processes(%d), please ensure that the " + "correct `nprocs` and `mlus` arguments are passed." % + (len(selected_device_list), nprocs)) + for card_id in selected_device_list: + if card_id not in env_devices_list: + raise ValueError("The selected mlu card %s cannot found in " + "MLU_VISIBLE_DEVICES (%s)." % + (card_id, ",".join(env_devices_list))) elif options['backend'] == 'gloo': # TODO check gpu / xpu flag must not exist warnings.warn( @@ -303,6 +341,8 @@ def _set_trainer_env(env_dict, backend): set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']}) elif backend == 'bkcl': set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']}) + elif backend == 'cncl': + set_flags({'FLAGS_selected_mlus': env_dict['FLAGS_selected_mlus']}) else: #NOTE(xiongkun) why not raise Error ? # So far, we added support for CPU parallel, and will be applied when paddle is not @@ -396,9 +436,9 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): Start multiple processes with ``spawn`` method for parallel training. .. note:: - ``spawn`` now only supports GPU or XPU collective mode. The collective mode - of GPU and XPU cannot be started at the same time, so the option `gpus` and - `xpus` cannot be configured at the same time. + ``spawn`` now only supports GPU or XPU or MLU collective mode. The collective mode + of GPU and XPU and MLU cannot be started at the same time, so the option `gpus` and + `xpus` and 'mlus' cannot be configured at the same time. Args: func (function): The target function is called by spawned process. @@ -425,7 +465,9 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): selected gpus, such as "0,1,2,3". Default: None; (3) xpus (string): The training process will run on the selected xpus, such as "0,1,2,3". Default: None; - (4) ips (string): Paddle cluster nodes ips, such as + (4) mlus (string): The training process will run on the + selected mlus, such as "0,1,2,3". Default: None; + (5) ips (string): Paddle cluster nodes ips, such as "192.168.0.16,192.168.0.17". Default: "127.0.0.1" . Returns: @@ -457,7 +499,7 @@ def train(print_result=False): # 2. 
create data parallel layer & optimizer layer = LinearNet() - dp_layer = paddle.DataParallel(layer, process_group=process_group) + dp_layer = paddle.DataParallel(layer, group = process_group) loss_fn = nn.MSELoss() adam = opt.Adam( diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index de7359bcd7337..30cd63ed80ea7 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -686,6 +686,15 @@ def _prepare_trainer_env(cluster, trainer, backend=None): "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } + elif backend == 'cncl': + proc_env = { + "FLAGS_selected_mlus": + "%s" % ",".join([str(g) for g in trainer.gpus]), + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + } elif backend == 'gloo': # NOTE (xiongkun) default fall back into cpu only proc_env = { diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index bc53c130286aa..145ecc83cfc26 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -32,6 +32,7 @@ from collections.abc import Sequence except: from collections import Sequence + __all__ = [ 'append_backward', 'gradients', @@ -2113,6 +2114,11 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): check_type(target_gradients, 'target_gradients', ( framework.Variable, list, tuple, type(None)), 'paddle.static.gradients') + from ..incubate.autograd.primx import _gradients + from ..incubate.autograd.utils import prim_enabled + if prim_enabled(): + return _gradients(targets, inputs, target_gradients) + outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) return _as_list(outs) diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 9d9fbd39a5767..e543bc1e17b2c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -426,6 +426,7 @@ def _optimize_fp32_graph(self, graph): graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass') graph = self._apply_pass(graph, 'conv_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass') + graph = self._apply_pass(graph, 'conv_affine_channel_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_eltwiseadd_bn_fuse_pass') @@ -667,4 +668,5 @@ def _quantize_fp32_graph(self, graph): graph, 'cpu_quantize_pass', ['quant_var_scales', 'data_layout'], [self._var_quant_scales, self._get_data_layout(graph)]) graph = self._apply_pass(graph, 'cpu_quantize_squash_pass') + graph = self._apply_pass(graph, 'int8_scale_calculation_mkldnn_pass') return graph diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index 629529ff1b965..56d77f77b5083 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -405,7 +405,7 @@ def test_post_training_abs_max_mobilenetv1(self): is_full_quantize = False 
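The spawn.py hunks above add an `mlus` option and a `cncl` backend alongside the existing `gpus`/`xpus` handling. A minimal launch sketch under the assumption of a Paddle build with MLU support and devices exposed through `MLU_VISIBLE_DEVICES` (the `train` function here is a hypothetical stand-in, not part of the patch):

import paddle
import paddle.distributed as dist

def train(print_result=False):
    # init_parallel_env reads FLAGS_selected_mlus and PADDLE_TRAINER_* set up by spawn
    dist.init_parallel_env()
    if print_result:
        print("worker rank:", dist.get_rank())

if __name__ == '__main__':
    # run two workers pinned to MLU cards 0 and 1, mirroring the existing gpus/xpus options
    dist.spawn(train, args=(True,), nprocs=2, mlus='0,1')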
is_use_cache_file = False is_optimize_model = False - # The accuracy diff of post-traing quantization (abs_max) maybe bigger + # The accuracy diff of post-training quantization (abs_max) maybe bigger diff_threshold = 0.05 self.run_test(model, algo, round_type, data_urls, data_md5s, quantizable_op_type, is_full_quantize, is_use_cache_file, diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 30439ad736d26..c366af7237d1b 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -20,12 +20,13 @@ import copy import numpy as np import paddle +from paddle.fluid.framework import dygraph_only from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid import core from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning -from paddle.fluid import core OpRole = core.op_proto_and_checker_maker.OpRole OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() @@ -35,45 +36,90 @@ ] -def set_excluded_layers(main_program, param_names): +def set_excluded_layers(param_names, main_program=None): r""" Set parameter name of layers which would not be pruned as sparse weights. Args: + param_names (list of string): A list contains names of parameters. main_program (Program, optional): Program with model definition and its parameters. - param_names (list): A list contains names of parameters. + If None is given, then it would be set as `paddle.static.default_main_program(). + Default is None. Examples: - .. code-block:: python - - import paddle - from paddle.static import sparsity - - paddle.enable_static() - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="need_sparse_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="need_dense_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["need_dense_fc"]) - - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = paddle.static.amp.decorate(optimizer ) - # Calling sparsity.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - optimizer = sparsity.decorate(optimizer) - optimizer.minimize(loss, startup_program) + 1. Usage of Dynamic Graph + + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Need to set excluded layers before calling decorate + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + + optimizer = paddle.incubate.asp.decorate(optimizer) + + 2. Usage of Static Graph + + .. code-block:: python + + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + # Setup exluded layers out from ASP workflow. + # Please note, excluded_layers must be set before calling optimizer.minimize(). + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.static.amp.decorate(optimizer ) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ + if main_program is None: + main_program = paddle.static.default_main_program() ASPHelper.set_excluded_layers( - main_program=main_program, param_names=param_names) + param_names=param_names, main_program=main_program) def reset_excluded_layers(main_program=None): @@ -83,153 +129,310 @@ def reset_excluded_layers(main_program=None): Args: main_program (Program, optional): Program with model definition and its parameters. - Examples: - .. code-block:: python + If None is given, then this function would reset all excluded_layers. + Default is None. + Examples: + 1. Usage of Dynamic Graph - import paddle - from paddle.static import sparsity + .. 
code-block:: python - paddle.enable_static() + import paddle - main_program = paddle.static.Program() - startup_program = paddle.static.Program() + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="my_first_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="my_second_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["my_second_fc"]) - # Now the weights of "my_second_fc" would not be included in Automatic SParsity's workflow. + my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Need to set excluded layers before calling decorate + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + # Please note, reset_excluded_layers also must be called before calling sparsity.decorate(). + paddle.incubate.asp.reset_excluded_layers() + + optimizer = paddle.incubate.asp.decorate(optimizer) + + 2. Usage of Static Graph + + .. code-block:: python - # Reset excluded_layers, all FC layers would be included into Automatic SParsity's workflow. - # Please note, reset_excluded_layers also must be called before calling `optimizer.minimize()`. - sparsity.reset_excluded_layers(main_program) + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + # Setup exluded layers out from ASP workflow. + # Please note, excluded_layers must be set before calling optimizer.minimize(). + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + # Please note, reset_excluded_layers also must be called before calling optimizer.minimize(). 
+ paddle.incubate.asp.reset_excluded_layers(main_program) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.static.amp.decorate(optimizer ) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ ASPHelper.reset_excluded_layers(main_program=main_program) def decorate(optimizer): r""" - Wrap the given optimizer as a OptimizerWithSparsityGuarantee, - which would insert necessary ops for ASP workflows when calling minimize() + Wrap the given optimizer as an OptimizerWithSparsityGuarantee. + If running in dynamic graph mode, ASP creates mask variables for supported parameters. + Else, in static graph mode, ASP creates mask variables and inserts necessary ops + when calling minimize() Args: optimizer (Optimizer): A Optimizer used for training. Returns: OptimizerWithSparsityGuarantee: A wrapper for ASP to decorate `minimize` function of the given optimizer. Examples: - .. code-block:: python + 1. Usage of Dynamic Graph - import paddle - from paddle.static import sparsity + .. code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) - main_program = paddle.static.Program() - startup_program = paddle.static.Program() + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction - paddle.enable_static() + my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None) - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + # will apply necessary masking operations for ASP workflow. + # In dynamic graph mode, ASP would create related mask variables during decoration. + optimizer = paddle.incubate.asp.decorate(optimizer) - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = sparsity.decorate(optimizer) - # if do sparse training with Fleet, please replace above decorate with: - # strategy = paddle.distributed.fleet.DistributedStrategy() - # strategy.asp = True - # optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + 2. Usage of Static Graph - optimizer.minimize(loss, startup_program) + ..
code-block:: python + + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + # In static graph mode, ASP creates related mask variables + # during minimize(). + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ return ASPHelper.decorate(optimizer) -def prune_model(main_program=None, - n=2, - m=4, - mask_algo='mask_1d', - with_mask=True): +def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): r""" - Pruning parameters of supported layers in :attr:`main_program` via + Pruning parameters of supported layers in :attr:`model` via specified mask generation function given by :attr:`mask_algo`. This function supports both training and inference controlled by :attr:`with_mask`. If :attr:`with_mask` is True, it would also prune parameter related ASP mask Variables, else only prunes parameters. - *Note*: If parameters are supported and in FP16, please set :attr:`n`=2, :attr:`m`=4, - if they in FP32, then :attr:`n`=1, :attr:`m`=2` to further enable Sparse Tensor Core acceleration. - - *Note*: If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` + *Note*: (Static graph mode) If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` and initialization (`exe.run(startup_program`)) before (For successfully obtain mask Variable). Typically set `with_mask` as true for training (have called `OptimizerWithSparsityGuarantee.minimize`) and false for - inference only. To obtain OptimizerWithSparsityGuarantee, please see `sparsity.decoreate()`. + inference only. To obtain OptimizerWithSparsityGuarantee, please see `paddle.incubate.asp.decorate()`. Args: - main_program (Program, optional): Program with model definition and its parameters. Default is `paddle.static.default_main_program() - n (int): n of `n:m` sparse pattern. - m (int): m of `n:m` sparse pattern. + model (Program|nn.Layer): Program with model definition and its parameters, or an object of `paddle.nn.Layer`. + n (int, optional): n of `n:m` sparse pattern. Default is 2. + m (int, optional): m of `n:m` sparse pattern. Default is 4. mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True.
Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. Examples: - .. code-block:: python - - import paddle - from paddle.static import sparsity - - paddle.enable_static() - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="need_sparse_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="need_dense_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["need_dense_fc"]) + 1. Usage of Dynamic Graph - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = paddle.static.amp.decorate(optimizer ) - # Calling sparsity.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - optimizer = sparsity.decorate(optimizer) - optimizer.minimize(loss, startup_program) + .. code-block:: python - device = paddle.device.get_device() - place = paddle.set_device(device) + import paddle + import numpy as np + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + my_layer = MyLayer() + loss_fn = paddle.nn.MSELoss(reduction='mean') + + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + # will apply necessary masking operations for ASP workflow. + # In dynamic graph mode, ASP would create related mask variables during decoration. + optimizer = paddle.incubate.asp.decorate(optimizer) + + # Must call paddle.incubate.asp.decorate() first before calling paddle.incubate.asp.prune_model() + paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + + for i in range(10): + imgs = paddle.to_tensor( + np.random.randn(64, 3, 32, 32), + dtype='float32', stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint(10, size=(64, 1)), + dtype='float32', stop_gradient=False) + output = my_layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + 2. Usage of Static Graph - exe = paddle.static.Executor(place) - exe.run(startup_program) + .. 
code-block:: python - # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` - sparsity.prune_model(main_program, mask_algo='mask_2d_best') + import paddle + import numpy as np + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 32, 32]) + label = paddle.static.data(name='label', shape=[None, 1]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + # In static graph mode, ASP creates related mask variables + # during minimize(). + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) + + device = paddle.device.get_device() + place = paddle.set_device(device) + + exe = paddle.static.Executor(place) + exe.run(startup_program) + + # Must call exe.run(startup_program) first before calling paddle.asp.prune_model() + paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + # it also be accepted to call + # paddle.incubate.asp.prune_model(main_program, mask_algo='mask_2d_best') + + for i in range(10): + imgs = np.random.randn(64, 3, 32, 32).astype('float32') + labels = np.random.randint(10, size=(64, 1)).astype('float32') + exe.run(main_program, feed={'data':imgs, 'label':labels}) """ - if main_program is not None and hasattr( - main_program, - "distributed_info_") and main_program.distributed_info_[ - "sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda(): - gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) - place = paddle.CUDAPlace(gpu_id) - else: - device = paddle.device.get_device() - place = paddle.set_device(device) + device = paddle.device.get_device() + place = paddle.set_device(device) MaskAlgo_mapping = { 'mask_1d': sparsity.MaskAlgo.MASK_1D, @@ -237,11 +440,26 @@ def prune_model(main_program=None, 'mask_2d_best': sparsity.MaskAlgo.MASK_2D_BEST } assert (mask_algo in MaskAlgo_mapping), \ - 'The "mask_algo" should be one of ["mask_1d", "mask_2d_greedy", "mask_2d_best"]' + 'The "mask_algo" should be one of ["mask_1d", "mask_2d_greedy", "mask_2d_best"]' + + prune_func = None + if isinstance(model, paddle.nn.Layer): + prune_func = ASPHelper.prune_model_by_layer + elif isinstance(model, paddle.static.Program): + prune_func = ASPHelper.prune_model_by_program + if hasattr(model, "distributed_info_") and \ + model.distributed_info_["sharding_degree"] > 1 and \ + paddle.fluid.is_compiled_with_cuda(): + gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) + place = paddle.CUDAPlace(gpu_id) + else: + raise TypeError( + "model should be paddle.nn.Layer or paddle.static.Program, but got {}". 
+ format(type(model))) - return ASPHelper.prune_model( - place=place, - main_program=main_program, + return prune_func( + place, + model, n=n, m=m, mask_algo=MaskAlgo_mapping[mask_algo], @@ -300,7 +518,7 @@ class ASPHelper(object): __asp_info = {} @classmethod - def set_excluded_layers(cls, main_program, param_names): + def set_excluded_layers(cls, param_names, main_program): r""" This is the implementation of `sparsity.set_excluded_layers`, for details please see explanation in `sparsity.set_excluded_layers`. """ @@ -313,8 +531,8 @@ def reset_excluded_layers(cls, main_program=None): This is the implementation of `sparsity.reset_excluded_layers`, for details please see explanation in `sparsity.reset_excluded_layers`. """ if main_program is None: - for asp_info in cls.__asp_info: - asp_info.reset_excluded_layers() + for prog in cls.__asp_info: + cls.__asp_info[prog].reset_excluded_layers() else: cls._get_program_asp_info(main_program).reset_excluded_layers() @@ -323,16 +541,25 @@ def decorate(optimizer): r""" This is the implementation of `sparsity.decorate`, for details please see explanation in `sparsity.decorate`. """ + if paddle.in_dynamic_mode(): + # main_prog and startup_prog would be used with paddle.static.program_guard + # to create ASP masks. Moreover, main_prog is a key to map paddle.static.Program + # to its own ASP informantion, like ASP mask variables. For dynamic graph, we use + # default_main_program as the key. + main_prog = paddle.static.default_main_program() + startup_prog = paddle.static.default_startup_program() + ASPHelper._create_mask_variables(main_prog, startup_prog, + optimizer._parameter_list) return OptimizerWithSparsityGuarantee(optimizer) @classmethod - def prune_model(cls, - place, - main_program=None, - n=2, - m=4, - mask_algo=sparsity.MaskAlgo.MASK_1D, - with_mask=True): + def prune_model_by_program(cls, + place, + main_program=None, + n=2, + m=4, + mask_algo=sparsity.MaskAlgo.MASK_1D, + with_mask=True): r""" This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. """ @@ -366,9 +593,63 @@ def prune_model(cls, np.array(weight_mask_tensor).dtype) weight_mask_tensor.set(weight_sparse_mask, place) asp_info.update_masks(param.name, weight_sparse_mask) - return asp_info.masks.copy() + @classmethod + def prune_model_by_layer(cls, + place, + layer, + n=2, + m=4, + mask_algo=sparsity.MaskAlgo.MASK_1D, + with_mask=True): + r""" + This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. 
+ """ + if paddle.in_dynamic_mode(): + main_program = paddle.static.default_main_program() + asp_info = cls._get_program_asp_info(main_program) + + for param in layer.parameters(): + if ASPHelper._is_supported_layer(main_program, param.name): + weight_nparray = param.numpy() + + prune_func = ASPHelper._get_prune_func_by_name(param.name) + + weight_pruned_nparray, weight_sparse_mask = \ + prune_func(weight_nparray, m, n, mask_algo, param.name) + + weight_pruned_nparray = weight_pruned_nparray.astype( + weight_nparray.dtype) + param.set_value(weight_pruned_nparray) + + if with_mask: + weight_mask_param = asp_info.mask_vars.get(param.name, + None) + assert weight_mask_param is not None, \ + 'Cannot find {} variable, please call sparsity.decorate() to' \ + ' decorate your optimizer first!'.format(ASPHelper._get_mask_name(param.name)) + weight_mask_param.set_value(weight_sparse_mask) + + asp_info.update_masks(param.name, weight_sparse_mask) + + return asp_info.masks.copy() + else: + # This for loop is only used to obtain Block and Program from + # first parameters. + target_program = None + for param in layer.parameters(): + target_program = param.block.program + assert target_program is not None, \ + 'Cannot get paddle.static.Program from Paddle.nn.Layer.' + return ASPHelper.prune_model_by_program( + place, + target_program, + n=n, + m=m, + mask_algo=mask_algo, + with_mask=with_mask) + @staticmethod def _get_mask_name(param_name): r""" @@ -393,13 +674,15 @@ def _get_not_ASP_relevant_vars(main_program): """ var_list = [] for param in main_program.global_block().all_parameters(): - if ASPHelper.MASK_APPENDDED_NAME not in param.name: + param_name_list = param.name.split('.') + + if ASPHelper.MASK_APPENDDED_NAME not in param_name_list: var_list.append(param) return var_list @classmethod def _get_program_asp_info(cls, main_program): - if not main_program in cls.__asp_info: + if main_program not in cls.__asp_info: cls.__asp_info[main_program] = ProgramASPInfo() return cls.__asp_info[main_program] @@ -508,14 +791,37 @@ def _minimize(cls, optimizer_ops, params_and_grads = optimizer.minimize( loss, startup_program, parameter_list, no_grad_set=no_grad_set) - cls._create_mask_variables(main_program, startup_program, - params_and_grads) - cls._insert_sparse_mask_ops(main_program, params_and_grads) + + params_only = [pg[0] for pg in params_and_grads] + cls._create_mask_variables(main_program, startup_program, params_only) + cls._insert_sparse_mask_ops(main_program, params_only) return optimizer_ops, params_and_grads @classmethod - def _create_mask_variables(cls, main_program, startup_program, - params_and_grads): + @dygraph_only + def _step(cls, optimizer): + r""" + This function is a decorator of `step` function in `Optimizer`. + There are three steps: + + 1. Call :attr:`optimizer`.step() + 2. Mask parameters with sparse masks. + + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + cannot be modified anymore.) + + Args: + optimizer (Optimizer): A Optimizer used for training. + """ + optimizer.step() + main_prog = paddle.static.default_main_program() + with paddle.fluid.dygraph.no_grad(): + ASPHelper._insert_sparse_mask_ops(main_prog, + optimizer._parameter_list) + + @classmethod + def _create_mask_variables(cls, main_program, startup_program, params): r""" Create sparse mask Tensors according to supported layers in :attr:`main_program`. 
This function is called in second step of `ASPHelper._minimize` @@ -523,48 +829,45 @@ def _create_mask_variables(cls, main_program, startup_program, Args: main_program (Program): Program with model definition and its parameters. startup_program (Program): Program for initializing parameters. - params_and_grads (list): Variable pairs of parameters and their gradients. + params (list): Variable parameters. """ asp_info = cls._get_program_asp_info(main_program) with program_guard(main_program, startup_program): - for param_and_grad in params_and_grads: - if ASPHelper._is_supported_layer(main_program, - param_and_grad[0].name): - mask_param = layers.create_parameter( - name=ASPHelper._get_mask_name(param_and_grad[0].name), - shape=param_and_grad[0].shape, - dtype=param_and_grad[0].dtype, - default_initializer=ConstantInitializer(value=1.0)) - mask_param.stop_gradient = True - mask_param.trainable = False - asp_info.update_mask_vars(param_and_grad[0].name, - mask_param) + for param in params: + if ASPHelper._is_supported_layer(main_program, param.name): + if param.name not in asp_info.mask_vars: + mask_param = layers.create_parameter( + name=ASPHelper._get_mask_name(param.name), + shape=param.shape, + dtype=param.dtype, + default_initializer=ConstantInitializer(value=1.0)) + mask_param.stop_gradient = True + mask_param.trainable = False + asp_info.update_mask_vars(param.name, mask_param) @classmethod - def _insert_sparse_mask_ops(cls, main_program, param_grads): + def _insert_sparse_mask_ops(cls, main_program, params): r""" Insert masking ops in the end of parameters update. This function is called in third step of `ASPHelper._minimize` Args: main_program (Program): Program with model definition and its parameters. - params_and_grads (list): Variable pairs of parameters and their gradients. + params (list): Variable parameters. """ block = main_program.global_block() asp_info = cls._get_program_asp_info(main_program) - for param_grad in param_grads: - if param_grad[0].name in asp_info.mask_vars: + for param in params: + if param.name in asp_info.mask_vars: block.append_op( type='elementwise_mul', - inputs={ - "X": param_grad[0], - 'Y': asp_info.mask_vars[param_grad[0].name] - }, - outputs={'Out': param_grad[0]}, + inputs={"X": param, + 'Y': asp_info.mask_vars[param.name]}, + outputs={'Out': param}, attrs={ 'axis': -1, 'use_mkldnn': False, - OP_ROLE_KEY: OpRole.Optimize + OP_ROLE_KEY: int(OpRole.Optimize) }) @@ -579,8 +882,9 @@ class OptimizerWithSparsityGuarantee(object): def __init__(self, optimizer): self._optimizer = optimizer - self._learning_rate = optimizer._learning_rate - self._learning_rate_map = optimizer._learning_rate_map + + def __getattr__(self, item): + return getattr(self._optimizer, item) def minimize(self, loss, @@ -605,3 +909,55 @@ def minimize(self, startup_program=startup_program, parameter_list=parameter_list, no_grad_set=no_grad_set) + + @dygraph_only + def step(self): + r""" + This function is a decorator of the `step` function in `Optimizer`. + There are two steps: + + 1. Call :attr:`optimizer`.step() + 2. Mask parameters with sparse masks. + + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (There is an invisible graph optimization in `Fleet.minimize()` which makes the training graph + unable to be modified afterwards.) + + Args: + optimizer (Optimizer): An Optimizer used for training.
+ """ + ASPHelper._step(self._optimizer) + + @dygraph_only + def state_dict(self): + r""" + This function is a decorator of `state_dict` function in `Optimizer`. + + Returns: + state_dict(dict) : dict contains all the Tensor used by optimizer + """ + state_dict = self._optimizer.state_dict() + asp_info = ASPHelper._get_program_asp_info( + paddle.static.default_main_program()) + for param_name, var in asp_info.mask_vars.items(): + state_dict.update({ASPHelper._get_mask_name(param_name): var}) + return state_dict + + @dygraph_only + def set_state_dict(self, state_dict): + r""" + This function is a decorator of `set_state_dict` function in `Optimizer`. + Args: + state_dict(dict) : Dict contains all the Tensor needed by optimizer + Return: + None + """ + asp_info = ASPHelper._get_program_asp_info( + paddle.static.default_main_program()) + for param_name, var in asp_info.mask_vars.items(): + param_mask_name = ASPHelper._get_mask_name(param_name) + assert param_mask_name in state_dict, \ + "The {} is not found.".format(param_mask_name) + var.set_value(state_dict[param_mask_name]) + asp_info.update_masks(param_name, var.numpy()) + return self._optimizer.set_state_dict(state_dict) diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index 8b8c043bc4bad..a28f7fc2b4ed6 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -94,13 +94,12 @@ def calculate_density(x): float: The density of :attr:`x`. Examples: .. code-block:: python - + import paddle import numpy as np - import paddle.static.sparsity as sparsity x = np.array([[0, 1, 3, 0], [1, 1, 0, 1]]) - sparsity.calculate_density(x) # 0.625 + paddle.incubate.asp.calculate_density(x) # 0.625 """ x_flattened = x.flatten() return float(np.nonzero(x_flattened)[0].size) / x_flattened.size diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 1ae57bcb30310..ba5c709b1d877 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -257,7 +257,7 @@ def load_dygraph(model_path, **configs): para_dict = structured_para_dict else: # load state dict by `io.save_params/persistables` save format - # TODO(chenweihang): [ Now only supports loading parameters seperately ] + # TODO(chenweihang): [ Now only supports loading parameters separately ] # If users save all parameters as one file, the [ variable.name -> variable ] # mapping info will lost, so users need to give variable list, but users build # variable list in dygraph mode is difficult, we recommend users to use diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py index e2fcf4f2c2712..4d5076108cd31 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py @@ -37,7 +37,7 @@ def transform(self): def visit_Assert(self, node): convert_assert_node = gast.parse( - 'paddle.jit.dy2static.convert_assert({test}, {msg})'.format( + '_jst.convert_assert({test}, {msg})'.format( test=ast_to_source_code(node.test), msg=ast_to_source_code(node.msg) if node.msg else "")).body[0].value diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index a80dfa11402c5..c16d1ff17f707 100644 --- 
a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -71,7 +71,7 @@ def visit_Call(self, node): if PDB_SET in func_str: return node - new_func_str = "paddle.jit.dy2static.convert_call({})".format(func_str) + new_func_str = "_jst.convert_call({})".format(func_str) new_func_ast = gast.parse(new_func_str).body[0].value node.func = new_func_ast diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py index ef2d062d2d018..50733e4d896e4 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py @@ -39,8 +39,8 @@ def visit_Call(self, node): func_str = ast_to_source_code(node.func).strip() if func_str in self._castable_type and len(node.args) > 0: args_str = ast_to_source_code(node.args[0]).strip() - new_func_str = "paddle.jit.dy2static.convert_var_dtype({}, '{}')".format( - args_str, func_str) + new_func_str = "_jst.convert_var_dtype({}, '{}')".format(args_str, + func_str) new_node = gast.parse(new_func_str).body[0].value return new_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 3a7b012b02bee..576baf6cc299a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -167,7 +167,7 @@ def convert_logical_not(x): A function representation of a Python ``not`` statement. Args: - x(bool|Tensor): Operand of of ``not`` operator. + x(bool|Tensor): Operand of ``not`` operator. Returns: A python bool variable or a bool Tensor. diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 8fc5a691d212c..157822430d234 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -536,7 +536,7 @@ def create_name_nodes(name_ids): return_vars = create_name_nodes(return_name_ids) convert_ifelse_layer = gast.parse( - 'paddle.jit.dy2static.convert_ifelse(' + '_jst.convert_ifelse(' '{pred}, {true_fn}, {false_fn}, {true_args}, {false_args}, {return_vars})'. format( pred=ast_to_source_code(pred), diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py index e62def897d2eb..0951635162e5e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py @@ -129,7 +129,7 @@ def _transform_slice_to_tensor_write(self, node): elif slice_is_num(target_node): value_code = ast_to_source_code(node.value) i = "paddle.cast(" \ - "x=paddle.jit.dy2static.to_static_variable({})," \ + "x=_jst.to_static_variable({})," \ "dtype='int64')".format(ast_to_source_code(slice_node)) assign_code = "{} = paddle.tensor.array_write(x={}, i={}, array={})" \ .format(target_name, value_code, i, target_name) @@ -252,7 +252,7 @@ def _replace_pop(self, node): # 2. pop stmt for a list or dict if len(args_str) == 1 # 3. 
pop stmt for a dict if len(args_str) == 2 if len(args_str) <= 2: - new_pop_str = "paddle.jit.dy2static.convert_pop({}, {})"\ + new_pop_str = "_jst.convert_pop({}, {})"\ .format(target_str, ",".join(args_str)) new_pop_node = gast.parse(new_pop_str).body[0].value return new_pop_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py index e5c093f9a9255..bd573521f1b4e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py @@ -57,8 +57,7 @@ def visit_UnaryOp(self, node): self.generic_visit(node) if isinstance(node.op, gast.Not): arg = ast_to_source_code(node.operand) - new_node_str = "paddle.jit.dy2static.convert_logical_not({})".format( - arg) + new_node_str = "_jst.convert_logical_not({})".format(arg) # NOTE: gast.parse returns Module(body=[expr(value=...)]) new_node = gast.parse(new_node_str).body[0].value return new_node @@ -67,13 +66,12 @@ def visit_UnaryOp(self, node): def visit_Compare(self, node): self.generic_visit(node) left_str = ast_to_source_code(node.left).strip() - if left_str.startswith("paddle.jit.dy2static.convert_var_shape"): + if left_str.startswith("_jst.convert_var_shape"): # check left and comparators are all converted var shape compare_arg_strs = left_str for i, comparator in enumerate(node.comparators): comparator_str = ast_to_source_code(comparator).strip() - if not comparator_str.startswith( - "paddle.jit.dy2static.convert_var_shape"): + if not comparator_str.startswith("_jst.convert_var_shape"): return node op_str = cmpop_node_to_str(node.ops[i]) compare_arg_strs += (", '" + op_str + "', " + comparator_str) @@ -81,7 +79,7 @@ def visit_Compare(self, node): # Now all left and comparators are converted shape # Replace some comparsion operation because of difference between # Python and Paddle - new_node_str = "paddle.jit.dy2static.convert_shape_compare({})".format( + new_node_str = "_jst.convert_shape_compare({})".format( compare_arg_strs) new_node = gast.parse(new_node_str).body[0].value return new_node @@ -119,7 +117,7 @@ def _create_bool_op_node(self, nodes, api_type): nodes = [pre_logic_node] + [post_logic_node] args = [ast_to_source_code(child) for child in nodes] - new_node_str = "paddle.jit.dy2static.convert_logical_{}(lambda:{}, lambda:{})".format( + new_node_str = "_jst.convert_logical_{}(lambda:{}, lambda:{})".format( api_type, args[0], args[1]) # NOTE: gast.parse return Module(body=[expr(...)]) new_node = gast.parse(new_node_str).body[0].value diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 4e5a3f7b70851..8014a00bff983 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -89,7 +89,7 @@ def create_while_nodes(condition_name, body_name, loop_var_names): else: assign_loop_var_names.append(name) - while_func_name = "paddle.jit.dy2static.convert_while_loop" + while_func_name = "_jst.convert_while_loop" while_node_str = "[{}] = {}({}, {}, [{}])".format( ",".join(assign_loop_var_names), while_func_name, condition_name, body_name, ",".join(loop_var_names)) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index 7960617369e3f..f045d01c99bab 100644 --- 
a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -50,6 +50,5 @@ def visit_Print(self, node): return gast.Expr(value=convert_print_node) def _create_print_node(self, print_args): - convert_print_func = gast.parse( - 'paddle.jit.dy2static.convert_print').body[0].value + convert_print_func = gast.parse('_jst.convert_print').body[0].value return gast.Call(func=convert_print_func, args=print_args, keywords=[]) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index b860740f71b25..2efb6965085de 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -197,10 +197,12 @@ def from_func_and_args(cls, function_spec, args, kwargs, class_instance): def __hash__(self): error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)." with_hook = self.kwargs.get("with_hook", False) - return hash((id(self.function_spec), - make_hashable(self.input_args_with_spec, error_msg), - make_hashable(self.input_kwargs_with_spec, error_msg), - self._spec_names_id, self.class_instance, with_hook)) + is_train = self.kwargs.get("is_train", False) + return hash( + (id(self.function_spec), + make_hashable(self.input_args_with_spec, error_msg), + make_hashable(self.input_kwargs_with_spec, error_msg), + self._spec_names_id, self.class_instance, with_hook, is_train)) def __eq__(self, other): return (type(self) is type(other)) and hash(self) == hash(other) @@ -357,7 +359,7 @@ def __call__(self, *args, **kwargs): try: concrete_program, partial_program_layer = self.get_concrete_program( - *args, **kwargs) + *args, **kwargs, is_train=self._is_train_mode()) # 3. synchronize self.training attribute. if isinstance(self._class_instance, layers.Layer): @@ -383,6 +385,12 @@ def __call__(self, *args, **kwargs): " if you can't handle this {} yourself.".format(type(e))) raise e + def _is_train_mode(self): + if self._class_instance is not None: + return self._class_instance.training + else: + return self._training + def _call_dygraph_function(self, *args, **kwargs): """ Calls dygraph function directly and returns the outputs. @@ -415,6 +423,8 @@ def get_concrete_program(self, *args, **kwargs): """ with_hook = kwargs.get("with_hook", False) + is_train = kwargs.get("is_train", True) + if "is_train" in kwargs: kwargs.pop("is_train") if "with_hook" in kwargs: kwargs.pop("with_hook") # 1. unify args/kwargs and replace Tensor with InputSpec if len(args) != len(self._function_spec.args_name): @@ -430,7 +440,8 @@ def get_concrete_program(self, *args, **kwargs): input_kwargs_with_spec, self._class_instance, **self._kwargs, - with_hook=with_hook) + with_hook=with_hook, + is_train=is_train) # 3. 
check whether hit the cache or build a new program for the input arguments concrete_program, partial_program_layer = self._program_cache[cache_key] @@ -525,7 +536,9 @@ def concrete_program_specify_input_spec(self, has_input_spec = (desired_input_spec is not None) if has_input_spec: concrete_program, _ = self.get_concrete_program( - *desired_input_spec, with_hook=with_hook) + *desired_input_spec, + with_hook=with_hook, + is_train=self._is_train_mode()) return concrete_program else: raise ValueError( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 0c7a8bf421a12..8ac659dbead99 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -336,7 +336,7 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name, # Here assume that the parent node of return is gast.If if isinstance(parent_node_of_return, gast.If): # Prepend control flow boolean nodes such as '__return@1 = True' - node_str = "{} = paddle.jit.dy2static.create_bool_as_type({}, True)".format( + node_str = "{} = _jst.create_bool_as_type({}, True)".format( return_name, ast_to_source_code(parent_node_of_return.test).strip()) @@ -449,7 +449,7 @@ def _replace_after_node_to_if_in_stmt_list( # Here assume that the parent node of return is gast.If if isinstance(parent_node_of_return, gast.If): # Prepend control flow boolean nodes such as '__return@1 = False' - node_str = "{} = paddle.jit.dy2static.create_bool_as_type({}, False)".format( + node_str = "{} = _jst.create_bool_as_type({}, False)".format( return_name, ast_to_source_code(parent_node_of_return.test).strip()) assign_false_node = gast.parse(node_str).body[0] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index 7733226cc09f2..d5b23d2f53b1c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -42,7 +42,7 @@ def create_convert_shape_node(var_shape_node, if slice_node is not None and slice_is_num(slice_node): args.append(ast_to_source_code(slice_node.slice).strip()) - convert_var_shape_func = "paddle.jit.dy2static.convert_var_shape({}, in_control_flow={})".format( + convert_var_shape_func = "_jst.convert_var_shape({}, in_control_flow={})".format( ",".join(args), in_control_flow) api_shape_node = gast.parse(convert_var_shape_func).body[0].value @@ -59,14 +59,14 @@ def create_convert_shape_node(var_shape_node, def create_choose_shape_node(attr_shape_name, api_shape_name, slice_node=None): - eval_exist_func = "paddle.jit.dy2static.eval_if_exist_else_none('{}', globals())".format( + eval_exist_func = "_jst.eval_if_exist_else_none('{}', globals())".format( api_shape_name) args = [attr_shape_name, eval_exist_func] if slice_node is not None and slice_is_num(slice_node): args.append(ast_to_source_code(slice_node.slice).strip()) - choose_shape_func = "paddle.jit.dy2static.choose_shape_attr_or_api({})".format( - ",".join(args)) + choose_shape_func = "_jst.choose_shape_attr_or_api({})".format(",".join( + args)) choose_shape_node = gast.parse(choose_shape_func).body[0].value if slice_node is not None and not slice_is_num(slice_node): return gast.Subscript( @@ -84,7 +84,7 @@ class ShapeAttributeTransformer(gast.NodeTransformer): def 
visit_Attribute(self, node): if node.attr == 'shape': args = ast_to_source_code(node.value).strip() - convert_var_shape_func = "paddle.jit.dy2static.convert_var_shape_simple({})".format( + convert_var_shape_func = "_jst.convert_var_shape_simple({})".format( args) api_shape_node = gast.parse(convert_var_shape_func).body[0].value return api_shape_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index d440e387da597..91c2c5dc65aab 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -185,6 +185,7 @@ def is_api_in_module(node, module_prefix): import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph import paddle.fluid.layers as layers + import paddle.jit.dy2static as _jst from paddle.fluid.dygraph import to_variable from paddle import to_tensor @@ -521,8 +522,8 @@ def remove_if_exit(filepath): def _inject_import_statements(): import_statements = [ "import paddle", "from paddle import Tensor", - "import paddle.fluid as fluid", "from typing import *", - "import numpy as np" + "import paddle.fluid as fluid", "import paddle.jit.dy2static as _jst", + "from typing import *", "import numpy as np" ] return '\n'.join(import_statements) + '\n' @@ -1168,7 +1169,7 @@ def _build_var_len_assign_node(self): else: iter_var_name = ast_to_source_code(self.iter_node).strip() - convert_len_node_source_str = '{} = paddle.jit.dy2static.convert_len({})'.format( + convert_len_node_source_str = '{} = _jst.convert_len({})'.format( self.iter_var_len_name, iter_var_name) convert_len_node = gast.parse(convert_len_node_source_str).body[0] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 2cd6c5e43f7e1..7ce5aede4995d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -77,14 +77,12 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): def to_static_variable_gast_node(name): - func_code = "{} = paddle.jit.dy2static.to_static_variable({})".format(name, - name) + func_code = "{} = _jst.to_static_variable({})".format(name, name) return gast.parse(func_code).body[0] def create_static_variable_gast_node(name): - func_code = "{} = paddle.jit.dy2static\ - .data_layer_not_check(name='{}', shape=[-1], dtype='float32')".format( + func_code = "{} = _jst.data_layer_not_check(name='{}', shape=[-1], dtype='float32')".format( name, unique_name.generate(name)) return gast.parse(func_code).body[0] diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index a93facbc34a5b..add3d73efc7e1 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -101,8 +101,11 @@ def _to_static_var(self, to_parameter=False, **kwargs): # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None). 
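The dygraph-to-static hunks above consistently shorten the generated call prefix from `paddle.jit.dy2static` to the `_jst` alias; this only resolves because `_inject_import_statements` (and `is_api_in_module`) now add `import paddle.jit.dy2static as _jst` to transformed source. A rough illustration of the emitted code before and after, assuming a plain function `foo` (a sketch, not copied from the transformers):

import paddle.jit.dy2static as _jst

def foo(x):
    return x + 1

# call site previously emitted by the transformers:
#     paddle.jit.dy2static.convert_call(foo)(1)
# call site emitted after this patch, using the injected alias:
result = _jst.convert_call(foo)(1)  # behaves like foo(1) when run eagerly
print(result)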
attr_not_need_keys = ['grad', 'T', 'place', '_place_str'] + param_keys = ['stop_gradient', 'trainable'] if isinstance(self, (ParamBase, EagerParamBase)): attr_kwargs = self.__dict__.copy() + for key in param_keys: + attr_kwargs[key] = getattr(self, key) else: attr_names = [] for name in dir(self): @@ -904,10 +907,8 @@ def values(self): #[1, 2, 3, 4, 5] """ - if self.is_sparse_coo(): - return _C_ops.final_state_sparse_coo_values(self) - elif self.is_sparse_csr(): - return _C_ops.final_state_sparse_csr_values(self) + if self.is_sparse_coo() or self.is_sparse_csr(): + return _C_ops.final_state_sparse_values(self) else: raise ValueError( "only SparseCooTensor and SparseCsrTensor have method values") diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index c6ff3a583d6a3..164545d0a0595 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -873,7 +873,7 @@ def _get_targets(_optimize_ops, _fetch_list, item): _fetch_list.append(item) else: raise TypeError( - "The item in fetch_list should be str, variable or optimize_op, but recieved %s.", + "The item in fetch_list should be str, variable or optimize_op, but received %s.", type(item)) for index, item in enumerate(fetch_list): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 16a5e25472557..6957dd8c5e30c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2864,9 +2864,10 @@ def _to_readable_code(self, skip_op_callstack=True): continue # it is bytes of serialized protobuf - if self.type == 'cinn_launch' and name == 'compilation_key': - # value = core.get_readable_comile_key(self.desc) - v = self.desc.attr(name) + if is_compiled_with_cinn( + ) and self.type == 'cinn_launch' and name == 'compilation_key': + key = self.desc.attr(name) + v = core.get_serialize_comile_key(key) prog = Program() prog = prog.parse_from_string(v) s = prog._to_readable_code() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 46f26e8e52cd5..51e89cc301cf3 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -293,12 +293,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if use_ps_gpu: program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -576,7 +576,7 @@ def _add_push_box_sparse_op(program): op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() backward = core.op_proto_and_checker_maker.OpRole.Backward for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(set()), []) @@ -599,7 +599,7 @@ def _remove_lookup_table_grad_op_and_var(program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): @@ -1407,7 +1407,7 @@ def get_communicate_var_info(program, def 
union_forward_gradient_op(program_block_ops_list): """ before analyzing the input & output of each block in program_block_list, we should - union the forward op and corresponding gradient op to elimincate the uneccessary variable + union the forward op and corresponding gradient op to elimincate the unnecessary variable transmit """ """ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 40ff41fe89f47..dd9d7e760a8e5 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -103,9 +103,9 @@ def init_worker(self): # prepare for client to client communication if self._role_maker.is_worker(): info = self._fleet_ptr.get_clients_info() - print("IIIIFO: {}".format(info)) + print("Client Info: {}".format(info)) all_info = self._role_maker._worker_gather(info[0]) - print("ALL info: {}".format(all_info)) + print("All Client Info: {}".format(all_info)) self._fleet_ptr.gather_clients(all_info) self._fleet_ptr.set_client2client_config( self._client2client_request_timeout_ms, diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index 8dfe9c32cd973..5f0af296441ff 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -124,14 +124,15 @@ def add_sparse_table(self, table_id, strategy): support_accessor_class = [ 'DownpourFeatureValueAccessor', 'DownpourCtrAccessor', - 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor', - 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor' + 'DownpourCtrDymfAccessor', 'DownpourSparseValueAccessor', + 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', + 'DownpourDoubleUnitAccessor' ] if strategy.get('sparse_accessor_class') is not None: accessor_class = strategy.get('sparse_accessor_class') if accessor_class not in support_accessor_class: raise ValueError( - "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor', \ + "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDymfAccessor', \ 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor'], \ but actual %s" % (accessor_class)) else: @@ -141,6 +142,7 @@ def add_sparse_table(self, table_id, strategy): if accessor_class == 'DownpourFeatureValueAccessor' \ or accessor_class == 'DownpourCtrAccessor' \ + or accessor_class == 'DownpourCtrDymfAccessor' \ or accessor_class == 'DownpourCtrDoubleAccessor': table.accessor.sparse_sgd_param.learning_rate = strategy.get( 'sparse_learning_rate', 0.05) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 5d7dacc007e6b..9483556d46f59 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -339,6 +339,7 @@ def _check_config_fleet_with_program_op(self, strategy, table_name, # set sparse_embedx_dim in the strategy according to accessor and use_cvm config if accessor == "DownpourFeatureValueAccessor" \ or accessor == "DownpourCtrAccessor" \ + or accessor == "DownpourCtrDymfAccessor" \ or accessor == "DownpourDoubleUnitAccessor" \ or accessor == 
"DownpourUnitAccessor": if st.get("sparse_embedx_dim") is not None \ @@ -586,6 +587,7 @@ def _minimize(self, # set sparse_embedx_dim in strategy, # user do not have to set it in config_fleet if accessor == "DownpourFeatureValueAccessor" \ + or accessor == "DownpourCtrDymfAccessor" \ or accessor == "DownpourCtrAccessor" \ or accessor == "DownpourDoubleUnitAccessor" \ or accessor == "DownpourUnitAccessor": @@ -873,7 +875,8 @@ def _minimize(self, if server._server.downpour_server_param.downpour_table_param[ 0].accessor.accessor_class in [ "DownpourCtrAccessor", "DownpourCtrDoubleAccessor", - "DownpourUnitAccessor", "DownpourDoubleUnitAccessor" + "DownpourUnitAccessor", "DownpourDoubleUnitAccessor", + "DownpourCtrDymfAccessor" ]: opt_info["dump_slot"] = True elif server._server.downpour_server_param.downpour_table_param[ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 7c7f101286e24..8b25c93d7ce08 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1846,8 +1846,7 @@ def get_tensor(var): @static_only def save(program, model_path, protocol=4, **configs): """ - :api_attr: Static Graph - + This function save parameters, optimizer information and network description to model_path. The parameters contains all the trainable Tensor, will save to a file with suffix ".pdparams". diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index ce6fe6918b56b..47f0c02d28725 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -234,7 +234,7 @@ def __weight_normalize(g, v, dim): x=g, y=norm) # The shapes of g and norm are the same. # Currently, elementwise_mul only support broadcast when the shape # of y is a subset of the shape of x. Thus, we reshape y to squeeze - # to achive the subset. + # to achieve the subset. 
w = elementwise_mul( x=v, y=scale if dim is None else reshape( diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py index 43eb436f65e78..0b4211cbb63dc 100644 --- a/python/paddle/fluid/layers/collective.py +++ b/python/paddle/fluid/layers/collective.py @@ -14,7 +14,9 @@ from __future__ import print_function from ..layer_helper import LayerHelper, unique_name -from ..framework import Variable +from ..framework import Variable, in_dygraph_mode, _in_legacy_dygraph +import paddle +from paddle import _C_ops def _allreduce(x, out=None, reduce_type="sum", sync_mode=False): @@ -107,6 +109,21 @@ def _c_broadcast(x, root=0, ring_id=0, use_calc_stream=False): def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False): op_type = 'c_allgather' + + if in_dygraph_mode(): + group = paddle.distributed.collective._get_default_group() + tensor_shape = list(x.shape) + tensor_shape[0] *= nranks + out = paddle.empty(tensor_shape, x.dtype) + task = group.process_group.all_gather(x, out) + task.wait() + return out + + if _in_legacy_dygraph(): + attrs = ('nranks', nranks, 'ring_id', ring_id, 'use_calc_stream', + use_calc_stream) + return _C_ops.c_allgather(x, *attrs) + helper = LayerHelper(op_type, **locals()) out_shape = list(x.shape[:]) if out_shape[0] > 0: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 200e8feec1e6a..97506ead5fad4 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -737,7 +737,7 @@ def _pull_gpups_sparse(input, for i in range(len(inputs)) ] w = helper.create_parameter( - attr=helper.param_attr, shape=[11], dtype=dtype, is_bias=False) + attr=helper.param_attr, shape=[size[0]], dtype=dtype, is_bias=False) helper.append_op( type='pull_gpups_sparse', inputs={'Ids': inputs, @@ -6533,7 +6533,7 @@ def squeeze(input, axes, name=None): """ if in_dygraph_mode(): - return _C_ops.final_state_squeeze(input, axes)[1] + return _C_ops.final_state_squeeze(input, axes) if _in_legacy_dygraph(): out, _ = _C_ops.squeeze2(input, 'axes', axes) return out @@ -6598,7 +6598,7 @@ def unsqueeze(input, axes, name=None): if _in_legacy_dygraph(): out, _ = _C_ops.unsqueeze2(input, 'axes', axes) return out - return _C_ops.final_state_unsqueeze(input, axes)[1] + return _C_ops.final_state_unsqueeze(input, axes) check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') check_variable_and_dtype(input, 'input', [ @@ -13744,7 +13744,7 @@ def get_tensor_from_selected_rows(x, name=None): x.height = 20 x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]] - Ouput is LoDTensor: + Output is LoDTensor: out.shape = [5, 2] out.data = [[1, 1], [2, 2], diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 707a1dc2cbc2f..b04cf90e1d8f9 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -673,7 +673,7 @@ def birnn(cell_fw, birnn creates a bidirectional recurrent neural network specified by RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) repeatedly until reaches to - the maximum length of `inputs` and then concat the ouputs for both RNNs + the maximum length of `inputs` and then concat the outputs for both RNNs along the last axis. 
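The `_c_allgather` change above adds an eager-mode branch: the output tensor is pre-allocated with its leading dimension multiplied by `nranks`, the collective is issued on the default process group, and the returned task is waited on before the result is handed back. A condensed sketch of that branch, assuming the parallel environment has already been initialized:

```python
# Hedged sketch of the new eager-mode allgather path in collective.py.
import paddle
import paddle.distributed as dist

def allgather_eager(x, nranks):
    group = dist.collective._get_default_group()   # default process group
    out_shape = list(x.shape)
    out_shape[0] *= nranks                         # room for every rank's shard
    out = paddle.empty(out_shape, x.dtype)
    task = group.process_group.all_gather(x, out)  # asynchronous collective
    task.wait()                                    # block until it completes
    return out
```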
Arguments: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a9b1fa6ff0205..b02c154584e9c 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -760,8 +760,14 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): place = _current_expected_place() if force_cpu: place = core.CPUPlace() + if isinstance(shape, (list, tuple)): + for item in shape: + if not isinstance(item, Variable): + shape = list( + map(lambda x: x.numpy().flat[0] if isinstance(x, Variable) else x, + shape)) + break - shape = utils.convert_shape_to_list(shape) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) out = _C_ops.final_state_full(shape, float(value), dtype, place) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index bb14fb9a86f15..49fb5399d8aec 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -6005,7 +6005,14 @@ def device_cmp(device1, device2): for p in program_list: self._create_vars(p.global_block(), main_block) - self.local_rank %= len(device_list) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + self.local_rank = int(os.getenv("PADDLE_MANUAL_PIPELINE_STAGE")) + assert self.local_rank < len(device_list), ( + "Manually specified " + "pipeline stage must be less than total number of pipeline " + "stages.") + else: + self.local_rank %= len(device_list) # Step3.5: optimize forward send sync_comm to overlap send and recv self._optimize_forward_send_sync(program_list[self.local_rank]) diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index 3cef228d14d6e..d52882acfc9ac 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -13,6 +13,7 @@ # limitations under the License. 
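The optimizer.py change above lets users pin the pipeline stage explicitly: when `PADDLE_MANUAL_PIPELINE_STAGE` is set, it is taken as the local stage index and validated against the number of pipeline devices; otherwise the rank is folded into the device list as before. A small stand-alone sketch of that selection logic (`resolve_pipeline_stage` is an invented helper name):

```python
# Hedged sketch of the pipeline-stage selection added to optimizer.py.
import os

def resolve_pipeline_stage(local_rank, device_list):
    manual = os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None)
    if manual:
        stage = int(manual)
        # a manually specified stage must address one of the pipeline devices
        assert stage < len(device_list), (
            "Manually specified pipeline stage must be less than the total "
            "number of pipeline stages.")
        return stage
    return local_rank % len(device_list)

# with the variable unset, rank 5 on a 4-stage pipeline maps to stage 1
print(resolve_pipeline_stage(5, ["gpu:0", "gpu:1", "gpu:2", "gpu:3"]))
```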
import os +import site from paddle.fluid import core from distutils.sysconfig import get_python_lib from distutils.core import setup, Extension @@ -42,10 +43,11 @@ def build_extensions(self): paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] # include path -site_packages_path = get_python_lib() -paddle_custom_kernel_include = [ - os.path.join(site_packages_path, 'paddle', 'include'), -] +site_packages_path = site.getsitepackages() +paddle_custom_kernel_include = list( + map(lambda path: os.path.join(path, 'paddle', 'include'), + site_packages_path)) + # include path third_party compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], 'build/third_party') @@ -56,9 +58,8 @@ def build_extensions(self): ] # libs path -paddle_custom_kernel_library_dir = [ - os.path.join(site_packages_path, 'paddle', 'fluid'), -] +paddle_custom_kernel_library_dir = list( + map(lambda path: os.path.join(path, 'paddle', 'fluid'), site_packages_path)) # libs libs = [':core_avx.so'] diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index c76b3da7428e3..b4adeb9575af6 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -10,6 +10,11 @@ if(WITH_GPU OR APPLE) set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180) set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180) set_tests_properties(test_context_pool PROPERTIES TIMEOUT 180) + if($ENV{USE_STANDALONE_EXECUTOR}) + # these test will fail in some server due to PR#42149, temporarily set it use old executor. + set_tests_properties(test_custom_relu_op_setup PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_custom_relu_model PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + endif() endif() py_test(test_custom_raw_op_kernel_op SRCS test_custom_raw_op_kernel_op.py) diff --git a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc index a561c845aba2b..ebfaaecd49093 100644 --- a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc @@ -23,6 +23,16 @@ std::vector PhiLinearForward(const paddle::Tensor& x, return {paddle::add(paddle::matmul(x, weight), bias)}; } +std::vector PhiLinearBackward(const paddle::Tensor& x, + const paddle::Tensor& weight, + const paddle::Tensor& bias, + const paddle::Tensor& out_grad) { + auto x_grad = paddle::matmul(out_grad, weight, false, true); + auto weight_grad = paddle::matmul(x, out_grad, true, false); + auto bias_grad = paddle::experimental::sum(out_grad, {0}); + return {x_grad, weight_grad, bias_grad}; +} + std::vector> LinearInferShape( const std::vector& x_shape, const std::vector& weight_shape, @@ -86,9 +96,14 @@ std::vector LinearInferDtype( return {x_dtype}; } -PD_BUILD_OP(pten_linear) +PD_BUILD_OP(phi_linear) .Inputs({"X", "Weight", "Bias"}) .Outputs({"Out"}) .SetKernelFn(PD_KERNEL(PhiLinearForward)) .SetInferShapeFn(PD_INFER_SHAPE(LinearInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(LinearInferDtype)); + +PD_BUILD_GRAD_OP(phi_linear) + .Inputs({"X", "Weight", "Bias", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X"), paddle::Grad("Weight"), paddle::Grad("Bias")}) + .SetKernelFn(PD_KERNEL(PhiLinearBackward)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py index be49513da35dd..fba512d511c36 100644 --- 
a/python/paddle/fluid/tests/custom_op/test_custom_linear.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py @@ -40,43 +40,56 @@ verbose=True) -def linear_dynamic(func, dtype, np_x, np_weight, np_bias): - paddle.set_device("cpu") - x = paddle.to_tensor(np_x, dtype=dtype) - weight = paddle.to_tensor(np_weight, dtype=dtype) - bias = paddle.to_tensor(np_bias, dtype=dtype) +def linear_dynamic(func, device, dtype, np_x, np_weight, np_bias): + paddle.set_device(device) + x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) + weight = paddle.to_tensor(np_weight, dtype=dtype, stop_gradient=False) + bias = paddle.to_tensor(np_bias, dtype=dtype, stop_gradient=False) out = func(x, weight, bias) - return out.numpy() + out.backward() + return out.numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy() -def linear_static(func, dtype, np_x, np_weight, np_bias): +def linear_static(func, device, dtype, np_x, np_weight, np_bias): paddle.enable_static() - paddle.set_device("cpu") + paddle.set_device(device) with static.scope_guard(static.Scope()): with static.program_guard(static.Program()): - x = static.data(name="x", shape=np_x.shape, dtype=dtype) + x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype) weight = static.data( name="weight", shape=np_weight.shape, dtype=dtype) bias = static.data(name="bias", shape=np_bias.shape, dtype=dtype) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False out = func(x, weight, bias) + mean_out = paddle.mean(out) + static.append_backward(mean_out) exe = static.Executor() exe.run(static.default_startup_program()) - out_v, = exe.run(static.default_main_program(), - feed={ - "x": np_x.astype(dtype), - "weight": np_weight.astype(dtype), - "bias": np_bias.astype(dtype) - }, - fetch_list=[out.name]) + out_v, x_grad_v, weight_grad_v, bias_grad_v = exe.run( + static.default_main_program(), + feed={ + "x": np_x.astype(dtype), + "weight": np_weight.astype(dtype), + "bias": np_bias.astype(dtype) + }, + fetch_list=[ + out.name, x.name + "@GRAD", weight.name + "@GRAD", + bias.name + "@GRAD" + ]) paddle.disable_static() - return out_v + return out_v, x_grad_v, weight_grad_v, bias_grad_v class TestCustomLinearJit(unittest.TestCase): def setUp(self): self.dtypes = ['float32', 'float64'] + self.devices = ['cpu'] + if paddle.is_compiled_with_cuda(): + self.devices.append('gpu') self.np_x = np.random.random((3, 2)).astype("float32") self.np_weight = np.full([2, 4], fill_value=0.5, dtype="float32") self.np_bias = np.ones([4], dtype="float32") @@ -88,20 +101,34 @@ def check_output(self, out, pd_out, name): pd_out)) def test_static(self): - for dtype in self.dtypes: - pten_out = linear_static(custom_ops.pten_linear, dtype, self.np_x, - self.np_weight, self.np_bias) - pd_out = linear_static(F.linear, dtype, self.np_x, self.np_weight, - self.np_bias) - self.check_output(pten_out, pd_out, "pten_out") + for device in self.devices: + for dtype in self.dtypes: + phi_out, phi_x_grad, phi_weight_grad, phi_bias_grad = linear_static( + custom_ops.phi_linear, device, dtype, self.np_x, + self.np_weight, self.np_bias) + pd_out, pd_x_grad, pd_weight_grad, pd_bias_grad = linear_static( + F.linear, device, dtype, self.np_x, self.np_weight, + self.np_bias) + self.check_output(phi_out, pd_out, "out") + self.check_output(phi_x_grad, pd_x_grad, "x_grad") + self.check_output(phi_weight_grad, pd_weight_grad, + "weight_grad") + self.check_output(phi_bias_grad, pd_bias_grad, "bias_grad") def func_dynamic(self): - for dtype in self.dtypes: - 
pten_out = linear_dynamic(custom_ops.pten_linear, dtype, self.np_x, - self.np_weight, self.np_bias) - pd_out = linear_dynamic(F.linear, dtype, self.np_x, self.np_weight, - self.np_bias) - self.check_output(pten_out, pd_out, "pten_out") + for device in self.devices: + for dtype in self.dtypes: + phi_out, phi_x_grad, phi_weight_grad, phi_bias_grad = linear_dynamic( + custom_ops.phi_linear, device, dtype, self.np_x, + self.np_weight, self.np_bias) + pd_out, pd_x_grad, pd_weight_grad, pd_bias_grad = linear_dynamic( + F.linear, device, dtype, self.np_x, self.np_weight, + self.np_bias) + self.check_output(phi_out, pd_out, "phi_out") + self.check_output(phi_x_grad, pd_x_grad, "x_grad") + self.check_output(phi_weight_grad, pd_weight_grad, + "weight_grad") + self.check_output(phi_bias_grad, pd_bias_grad, "bias_grad") def test_dynamic(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py index 5664c00d74f89..3b3a0e2edec98 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py @@ -21,8 +21,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args -from paddle.fluid.framework import _test_eager_guard, _enable_legacy_dygraph -_enable_legacy_dygraph() +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -64,7 +63,7 @@ def setUp(self): self.dtypes = ['float32', 'float64'] self.devices = ['cpu'] - def test_func_double_grad_dynamic(self): + def func_double_grad_dynamic(self): for device in self.devices: for dtype in self.dtypes: x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) @@ -85,6 +84,11 @@ def test_func_double_grad_dynamic(self): "custom op out grad: {},\n paddle api out grad: {}".format( dout, pd_dout)) + def test_func_double_grad_dynamic(self): + with _test_eager_guard(): + self.func_double_grad_dynamic() + self.func_double_grad_dynamic() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 08e24f86a29a4..2918e8501c3d0 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -185,6 +185,8 @@ endif() # Temporally disable test_deprecated_decorator LIST(REMOVE_ITEM TEST_OPS test_deprecated_decorator) +LIST(REMOVE_ITEM TEST_OPS test_tensordot) + if(WIN32) LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception) LIST(REMOVE_ITEM TEST_OPS test_trainer_desc) @@ -1036,6 +1038,7 @@ set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIME set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) +#set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES TIMEOUT 120) @@ 
-1148,8 +1151,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) + set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) @@ -1233,9 +1236,6 @@ if(WITH_GPU OR WITH_ROCM) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) -set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) -set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY") -set_tests_properties(test_tensordot PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=False") set_tests_properties(test_cuda_memory_reserved PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") if (WITH_GLOO) set_tests_properties(test_parallel_dygraph_dataparallel_cpuonly PROPERTIES TIMEOUT 30) @@ -1243,3 +1243,14 @@ if (WITH_GLOO) set_tests_properties(test_parallel_dygraph_sparse_embedding_gloo PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height_gloo PROPERTIES TIMEOUT 120) endif() + +if($ENV{USE_STANDALONE_EXECUTOR}) + # these test will fail in some server due to PR#42149, temporarily set it use old executor. 
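The tanh double-grad test above is reworked into the usual dygraph test convention: the assertions live in a `func_*` method and the public `test_*` method runs it twice, once under `_test_eager_guard()` and once without, so both the eager (final-state) and legacy dygraph paths are exercised. A minimal sketch of that convention:

```python
# Hedged sketch of the func_*/test_* pattern used by the dygraph unit tests.
import unittest
from paddle.fluid.framework import _test_eager_guard

class ExampleDygraphTest(unittest.TestCase):
    def func_double_grad_dynamic(self):
        # the real checks (custom-op vs. paddle API gradients) go here
        self.assertTrue(True)

    def test_func_double_grad_dynamic(self):
        with _test_eager_guard():
            self.func_double_grad_dynamic()  # eager (final-state) dygraph
        self.func_double_grad_dynamic()      # legacy dygraph

if __name__ == "__main__":
    unittest.main()
```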
+ set_tests_properties(test_apply_pass_to_program PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_buffer_shared_memory_reuse_pass PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_imperative_optimizer PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_switch_autotune PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) + set_tests_properties(test_imperative_mnist_sorted_gradient PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) +endif() diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt index b6b313465ab20..76856d88e1789 100644 --- a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt @@ -1,8 +1,8 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp") -list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_amp") +list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_static") +list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_dynamic") list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_sharding") foreach(TEST_OP ${TEST_OPS}) @@ -10,9 +10,9 @@ foreach(TEST_OP ${TEST_OPS}) endforeach(TEST_OP) if(WITH_DISTRIBUTE) - py_test_modules(test_fleet_with_asp MODULES test_fleet_with_asp ENVS ${dist_ENVS}) if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) - py_test_modules(test_fleet_with_asp_amp MODULES test_fleet_with_asp_amp ENVS ${dist_ENVS}) + py_test_modules(test_fleet_with_asp_dynamic MODULES test_fleet_with_asp_dynamic ENVS ${dist_ENVS}) + py_test_modules(test_fleet_with_asp_static MODULES test_fleet_with_asp_static ENVS ${dist_ENVS}) endif() endif() @@ -21,3 +21,8 @@ if((WITH_DISTRIBUTE) AND (NOT WIN32) AND (NOT APPLE)) py_test_modules(test_fleet_with_asp_sharding MODULES test_fleet_with_asp_sharding ENVS ${dist_ENVS}) endif() endif() + +set_tests_properties(test_asp_pruning_dynamic PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_pruning_static PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_optimize_dynamic PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_optimize_static PROPERTIES TIMEOUT 30) diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py index d41a7b2b842e8..e594bc5c34eb3 100644 --- a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py +++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py @@ -20,7 +20,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.static import sparsity from paddle.fluid.contrib.sparsity.asp import ASPHelper import numpy as np @@ -60,7 +59,7 @@ def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): loss = fluid.layers.mean( fluid.layers.cross_entropy( input=self.predict, label=self.label)) - optimizer = sparsity.decorate( + optimizer = paddle.incubate.asp.decorate( fluid.optimizer.SGD(learning_rate=0.01)) optimizer.minimize(loss, self.startup_program) @@ -75,7 +74,7 @@ def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): def 
__pruning_and_checking(self, exe, place, mask_func_name, check_func_name, with_mask): exe.run(self.startup_program) - sparsity.prune_model( + paddle.incubate.asp.prune_model( self.main_program, mask_algo=mask_func_name, with_mask=with_mask) for param in self.main_program.global_block().all_parameters(): if ASPHelper._is_supported_layer(self.main_program, param.name): diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py index a2b499a9e01c3..dca56076dbceb 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -66,6 +66,97 @@ def test_add_supported_layer_via_name(self): my_own_layer_name in supported_layers_and_prune_func_map) +class TestASPDynamicCustomerizedPruneFunc(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + class CustomerLayer(paddle.nn.Layer): + def __init__(self): + super(CustomerLayer, self).__init__() + + self.weight = self.create_parameter( + shape=[32, 32], attr=None, dtype='float32', is_bias=False) + self.linear1 = paddle.nn.Linear(32, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, input_): + hidden = paddle.nn.functional.linear( + x=input_, weight=self.weight) + hidden = self.linear1(hidden) + out = self.linear2(hidden) + return out + + sparsity.add_supported_layer(CustomerLayer, my_own_pruning) + + self.layer = CustomerLayer() + self.customer_prefix = paddle.fluid.dygraph.layers._convert_camel_to_snake( + CustomerLayer.__name__) + self.supported_layer_count_ref = 3 + + def test_inference_pruning(self): + + sparsity.prune_model(self.layer, mask_algo="mask_1d", with_mask=False) + + supported_layer_count = 0 + for param in self.layer.parameters(): + mat = param.numpy() + + if sparsity.asp.ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + def test_training_pruning(self): + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=self.layer.parameters()) + optimizer = sparsity.decorate(optimizer) + + sparsity.prune_model(self.layer, mask_algo="mask_1d", with_mask=True) + + supported_layer_count = 0 + for param in self.layer.parameters(): + mat = param.numpy() + + if sparsity.asp.ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + + mat_mask = sparsity.asp.ASPHelper._get_program_asp_info( + paddle.static.default_main_program()).mask_vars[ + param.name].numpy() + + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + self.assertLessEqual( + np.sum(mat_mask.flatten() - static_tensor_mask.flatten( + )), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertTrue( + sparsity.check_sparsity( + mat_mask.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + class TestASPStaticCustomerizedPruneFunc(unittest.TestCase): def setUp(self): 
paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py new file mode 100644 index 0000000000000..e127dca225116 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=2, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(1352, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + +class TestASPDynamicOptimize(unittest.TestCase): + def setUp(self): + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_is_supported_layers(self): + program = paddle.static.default_main_program() + + names = [ + 'embedding_0.w_0', 'fack_layer_0.w_0', 'conv2d_0.w_0', + 'conv2d_0.b_0', 'conv2d_1.w_0', 'conv2d_1.b_0', 'fc_0.w_0', + 'fc_0.b_0', 'fc_1.w_0', 'fc_1.b_0', 'linear_2.w_0', 'linear_2.b_0' + ] + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + paddle.incubate.asp.set_excluded_layers(['fc_1', 'conv2d_0']) + ref = [ + False, False, False, False, True, False, True, False, False, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + paddle.incubate.asp.reset_excluded_layers() + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + def test_decorate(self): + param_names = [param.name for param in self.layer.parameters()] + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + + program = paddle.static.default_main_program() + + for name in param_names: + mask_var = ASPHelper._get_program_asp_info(program).mask_vars.get( + name, None) + if ASPHelper._is_supported_layer(program, name): + self.assertTrue(mask_var is not None) + else: + 
self.assertTrue(mask_var is None) + + def test_asp_training(self): + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + + paddle.incubate.asp.prune_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(32, 3, 24, 24), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(32, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + def test_asp_training_with_amp(self): + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + + paddle.incubate.asp.prune_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(32, 3, 24, 24), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(32, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + with paddle.amp.auto_cast(enable=True): + output = self.layer(imgs) + loss = loss_fn(output, labels) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(self.optimizer, scaled) + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py similarity index 89% rename from python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py rename to python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py index 9e5e3c924f1a5..b51e28cdcb9fc 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
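The new `test_asp_optimize_dynamic.py` above exercises the dygraph ASP workflow end to end: decorate the optimizer, prune the layer to a 2:4 pattern, then train as usual (optionally under AMP with a `GradScaler`). A condensed, hedged version of that loop with a toy layer and made-up shapes:

```python
# Hedged sketch of the dygraph ASP flow checked by the new dynamic tests.
import numpy as np
import paddle

layer = paddle.nn.Linear(32, 32)
opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=layer.parameters())
opt = paddle.incubate.asp.decorate(opt)   # wrap optimizer so masks are maintained
paddle.incubate.asp.prune_model(layer)    # prune supported weights to 2:4 sparsity

x = paddle.to_tensor(np.random.randn(8, 32).astype('float32'))
label = paddle.to_tensor(np.random.randn(8, 32).astype('float32'))
loss = paddle.nn.MSELoss()(layer(x), label)
loss.backward()
opt.step()
opt.clear_grad()
```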
@@ -20,21 +20,20 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.static import sparsity from paddle.fluid.contrib.sparsity.asp import ASPHelper import numpy as np paddle.enable_static() -class TestASPHelper(unittest.TestCase): +class TestASPStaticOptimize(unittest.TestCase): def setUp(self): self.main_program = fluid.Program() self.startup_program = fluid.Program() def build_model(): img = fluid.data( - name='img', shape=[None, 3, 32, 32], dtype='float32') + name='img', shape=[None, 3, 24, 24], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') hidden = fluid.layers.conv2d( input=img, num_filters=4, filter_size=3, padding=2, act="relu") @@ -87,7 +86,7 @@ def test_is_supported_layers(self): self.assertTrue( ref[i] == ASPHelper._is_supported_layer(program, name)) - sparsity.set_excluded_layers(program, ['fc_1', 'conv2d_0']) + paddle.incubate.asp.set_excluded_layers(['fc_1', 'conv2d_0'], program) ref = [ False, False, False, False, True, False, True, False, False, False, True, False @@ -96,7 +95,7 @@ def test_is_supported_layers(self): self.assertTrue( ref[i] == ASPHelper._is_supported_layer(program, name)) - sparsity.reset_excluded_layers(program) + paddle.incubate.asp.reset_excluded_layers(program) ref = [ False, False, True, False, True, False, True, False, True, False, True, False @@ -109,7 +108,7 @@ def test_decorate(self): param_names = self.__get_param_names(self.main_program.global_block() .all_parameters()) with fluid.program_guard(self.main_program, self.startup_program): - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) param_names_after_minimize = self.__get_param_names( self.main_program.global_block().all_parameters()) @@ -119,7 +118,7 @@ def test_decorate(self): def test_asp_training(self): with fluid.program_guard(self.main_program, self.startup_program): - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) place = paddle.CPUPlace() @@ -129,10 +128,10 @@ def test_asp_training(self): feeder = fluid.DataFeeder(feed_list=[self.img, self.label], place=place) exe.run(self.startup_program) - sparsity.prune_model(self.main_program) + paddle.incubate.asp.prune_model(self.main_program) - data = (np.random.randn(64, 3, 32, 32), np.random.randint( - 10, size=(64, 1))) + data = (np.random.randn(32, 3, 24, 24), np.random.randint( + 10, size=(32, 1))) exe.run(self.main_program, feed=feeder.feed([data])) for param in self.main_program.global_block().all_parameters(): @@ -149,7 +148,7 @@ def test_asp_training_with_amp(self): with fluid.program_guard(self.main_program, self.startup_program): self.optimizer = fluid.contrib.mixed_precision.decorator.decorate( self.optimizer) - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) exe = fluid.Executor(place) @@ -157,10 +156,10 @@ def test_asp_training_with_amp(self): feed_list=[self.img, self.label], place=place) exe.run(self.startup_program) - sparsity.prune_model(self.main_program) + paddle.incubate.asp.prune_model(self.main_program) - data = (np.random.randn(64, 3, 32, 32), np.random.randint( - 10, size=(64, 1))) + data = (np.random.randn(32, 3, 24, 24), np.random.randint( + 10, size=(32, 1))) 
exe.run(self.main_program, feed=feeder.feed([data])) for param in self.main_program.global_block().all_parameters(): diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py deleted file mode 100644 index e99509187038c..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import paddle -import unittest -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning2DBest(TestASPHelperPruningBase): - def test_2D_best_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_2d_best', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - def test_2D_best_training_pruning(self): - self.run_training_pruning_test( - 'mask_2d_best', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py deleted file mode 100644 index 7ad6c3ae02275..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
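For the static-graph counterpart (the renamed `test_asp_optimize_static.py` above) the flow is the same but operates on programs: decorate before `minimize`, run the startup program, then call `prune_model` on the main program before feeding batches. A condensed, hedged sketch with a toy network:

```python
# Hedged sketch of the static-graph ASP flow from the renamed static test.
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    img = fluid.data(name='img', shape=[None, 3, 24, 24], dtype='float32')
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
    hidden = fluid.layers.fc(input=img, size=32, act='relu')
    predict = fluid.layers.fc(input=hidden, size=10, act='softmax')
    loss = fluid.layers.mean(
        fluid.layers.cross_entropy(input=predict, label=label))
    opt = paddle.incubate.asp.decorate(fluid.optimizer.SGD(learning_rate=0.01))
    opt.minimize(loss, startup_prog)

exe = fluid.Executor(paddle.CPUPlace())
exe.run(startup_prog)
paddle.incubate.asp.prune_model(main_prog)   # mask supported parameters to 2:4

feed = {'img': np.random.randn(4, 3, 24, 24).astype('float32'),
        'label': np.random.randint(10, size=(4, 1)).astype('int64')}
exe.run(main_prog, feed=feed)
```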
- -from __future__ import print_function - -import unittest -import paddle -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning2DGreedy(TestASPHelperPruningBase): - def test_2D_greedy_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_2d_greedy', - paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - def test_2D_greedy_training_pruning(self): - self.run_training_pruning_test( - 'mask_2d_greedy', - paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py new file mode 100644 index 0000000000000..b0fad0b64002a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +from paddle.fluid import core +from paddle.fluid.contrib.sparsity.asp import ASPHelper + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=2, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(1352, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + prediction = self.linear2(hidden) + return prediction + + +class TestASPDynamicPruningBase(unittest.TestCase): + def setUp(self): + self.layer = MyLayer() + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + + self.img = paddle.to_tensor( + np.random.uniform( + low=-0.5, high=0.5, size=(32, 3, 24, 24)), + dtype=np.float32, + place=place, + stop_gradient=False) + + self.set_config() + + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + def test_inference_pruning(self): + self.__pruning_and_checking(False) + + def test_training_pruning(self): + + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=self.layer.parameters()) + optimizer = paddle.incubate.asp.decorate(optimizer) + + self.__pruning_and_checking(True) + + def __pruning_and_checking(self, with_mask): + + paddle.incubate.asp.prune_model( + self.layer, mask_algo=self.mask_gen_func, with_mask=with_mask) + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, func_name=self.mask_check_func, n=2, m=4)) + + +class 
TestASPDynamicPruning1D(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + +class TestASPDynamicPruning2DBest(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_best' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +class TestASPDynamicPruning2DGreedy(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_greedy' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py new file mode 100644 index 0000000000000..a9986f24b0265 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import threading, time +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + +paddle.enable_static() + + +class TestASPStaticPruningBase(unittest.TestCase): + def setUp(self): + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 24, 24], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=2, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='softmax') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + + self.set_config() + + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + def test_inference_pruning(self): + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, False) + + def test_training_pruning(self): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = paddle.incubate.asp.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, True) + + def __pruning_and_checking(self, exe, place, 
with_mask): + exe.run(self.startup_program) + paddle.incubate.asp.prune_model( + self.main_program, + mask_algo=self.mask_gen_func, + with_mask=with_mask) + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, func_name=self.mask_check_func, n=2, m=4)) + + +class TestASPStaticPruning1D(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + +class TestASPStaticPruning2DBest(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_best' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +class TestASPStaticPruning2DGreedy(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_greedy' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py new file mode 100644 index 0000000000000..653cbbf84091b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
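The new `test_asp_save_load.py` added here (its body follows) checks that ASP state survives a checkpoint round trip in both dygraph and static graph. A condensed, hedged sketch of the dygraph half, with illustrative paths:

```python
# Hedged sketch of the dygraph ASP save/load round trip exercised by the test.
import paddle

layer = paddle.nn.Linear(32, 32)
opt = paddle.incubate.asp.decorate(
    paddle.optimizer.SGD(learning_rate=0.01, parameters=layer.parameters()))
paddle.incubate.asp.prune_model(layer)

paddle.save(layer.state_dict(), "/tmp/asp_net.pdparams")   # illustrative paths
paddle.save(opt.state_dict(), "/tmp/asp_opt.pdopt")

layer.set_state_dict(paddle.load("/tmp/asp_net.pdparams"))
opt.set_state_dict(paddle.load("/tmp/asp_opt.pdopt"))
```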
+ +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + +class TestASPDynamicOptimize(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + def test_save_and_load(self): + path = "/tmp/paddle_asp_save_dy/" + net_path = path + "asp_net.pdparams" + opt_path = path + "asp_opt.pdopt" + + paddle.save(self.layer.state_dict(), net_path) + paddle.save(self.optimizer.state_dict(), opt_path) + + asp_info = ASPHelper._get_program_asp_info( + paddle.static.default_main_program()) + for param_name in asp_info.mask_vars: + mask = asp_info.mask_vars[param_name] + asp_info.update_mask_vars( + param_name, paddle.ones( + shape=mask.shape, dtype=mask.dtype)) + asp_info.update_masks(param_name, np.ones(shape=mask.shape)) + + net_state_dict = paddle.load(net_path) + opt_state_dict = paddle.load(opt_path) + + self.layer.set_state_dict(net_state_dict) + self.optimizer.set_state_dict(opt_state_dict) + + imgs = paddle.to_tensor( + np.random.randn(64, 3, 32, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +class TestASPStaticOptimize(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, predict = build_model() + self.loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict, label=self.label)) + self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) + self.optimizer = 
paddle.incubate.asp.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self.exe = fluid.Executor(self.place) + self.exe.run(self.startup_program) + + paddle.incubate.asp.prune_model(self.main_program) + + def test_save_and_load(self): + path = "/tmp/paddle_asp_save_st/" + param_path = path + "asp.pdparams" + model_path = path + "asp.pdmodel" + + paddle.save(self.main_program.state_dict(), param_path) + paddle.save(self.main_program, model_path) + + prog = paddle.load(model_path) + + state_dict = paddle.load(param_path) + prog.set_state_dict(state_dict) + + feeder = fluid.DataFeeder( + feed_list=[self.img, self.label], place=self.place) + + data = (np.random.randn(64, 3, 32, 32), np.random.randint( + 10, size=(64, 1))) + self.exe.run(prog, feed=feeder.feed([data])) + + for param in prog.global_block().all_parameters(): + if ASPHelper._is_supported_layer(prog, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py index 4aac878763b6f..67ec54367d382 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py @@ -18,7 +18,6 @@ import unittest import threading, time import paddle -from paddle.static import sparsity import numpy as np @@ -41,9 +40,9 @@ def test_density(self): x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0, 1.0]]) - self.assertEqual(sparsity.calculate_density(x), 0.56) + self.assertEqual(paddle.incubate.asp.calculate_density(x), 0.56) x[:, 0] = 0.0 - self.assertEqual(sparsity.calculate_density(x), 0.4) + self.assertEqual(paddle.incubate.asp.calculate_density(x), 0.4) def test_check_mask_1d(self): x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], @@ -219,3 +218,7 @@ def __test_1D_2D_sparse_mask_generation_methods(self, x): func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py deleted file mode 100644 index 074aedb947613..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle.distributed.fleet as fleet -import paddle.distributed.fleet.base.role_maker as role_maker -import unittest -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import os -from paddle.static import sparsity -from paddle.fluid.contrib.sparsity.asp import ASPHelper -import numpy as np -cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') -if cuda_visible_devices is None or cuda_visible_devices == "": - os.environ['CUDA_VISIBLE_DEVICES'] = '0' -else: - os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] - -paddle.enable_static() - - -class TestFleetWithASP(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" - os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" - os.environ["PADDLE_TRAINERS_NUM"] = "1" - os.environ["PADDLE_TRAINER_ID"] = "0" - - def net(self, main_prog, startup_prog): - with fluid.program_guard(main_prog, startup_prog): - input_x = paddle.static.data( - name="x", shape=[-1, 32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') - - fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.asp = True - return avg_cost, strategy, input_x, input_y - - def test_with_asp(self): - fleet.init(is_collective=True) - train_prog, startup_prog = fluid.Program(), fluid.Program() - avg_cost, strategy, input_x, input_y = self.net(train_prog, - startup_prog) - - with fluid.program_guard(train_prog, startup_prog): - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer( - optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( - ) else fluid.CPUPlace() - - exe = fluid.Executor(place) - feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) - exe.run(startup_prog) - - sparsity.prune_model(train_prog) - - data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) - exe.run(train_prog, feed=feeder.feed([data])) - - for param in train_prog.global_block().all_parameters(): - if ASPHelper._is_supported_layer(train_prog, param.name): - mat = np.array(fluid.global_scope().find_var(param.name) - .get_tensor()) - self.assertTrue( - paddle.fluid.contrib.sparsity.check_sparsity( - mat.T, n=2, m=4)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py new file mode 100644 index 0000000000000..3ced15bf15881 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import os +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np +cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') +if cuda_visible_devices is None or cuda_visible_devices == "": + os.environ['CUDA_VISIBLE_DEVICES'] = '0' +else: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.linear1 = paddle.nn.Linear(32, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, x): + hidden = self.linear1(x) + prediction = self.linear2(hidden) + return prediction + + +class TestFleetWithASPDynamic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_with_asp(self): + fleet.init(is_collective=True) + + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + self.optimizer = fleet.distributed_optimizer(self.optimizer) + self.layer = fleet.distributed_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(64, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +class TestFleetWithASPAMPDynamic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_with_asp(self): + fleet.init(is_collective=True) + + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + self.optimizer = fleet.distributed_optimizer(self.optimizer) + self.layer = fleet.distributed_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(64, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + scaler = 
paddle.amp.GradScaler(init_loss_scaling=1024) + + with paddle.amp.auto_cast(enable=True): + output = self.layer(imgs) + loss = loss_fn(output, labels) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(self.optimizer, scaled) + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py similarity index 67% rename from python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py rename to python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py index a34d7e69872e2..2023c0051401f 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py @@ -1,5 +1,5 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -32,7 +32,62 @@ paddle.enable_static() -class TestFleetWithASP(unittest.TestCase): +class TestFleetWithASPStatic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + input_x = paddle.static.data( + name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.asp = True + return avg_cost, strategy, input_x, input_y + + def test_with_asp(self): + fleet.init(is_collective=True) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy, input_x, input_y = self.net(train_prog, + startup_prog) + + with fluid.program_guard(train_prog, startup_prog): + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) + exe.run(startup_prog) + + sparsity.prune_model(train_prog) + + data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) + exe.run(train_prog, feed=feeder.feed([data])) + + for param in train_prog.global_block().all_parameters(): + if ASPHelper._is_supported_layer(train_prog, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + 
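The ASP tests in this part of the patch all end with the same assertion, paddle.fluid.contrib.sparsity.check_sparsity(mat.T, n=2, m=4), on each supported parameter. For readers unfamiliar with the 2:4 pattern being asserted, the stand-alone sketch below illustrates the property with plain NumPy: every group of m=4 consecutive elements (taken row-wise here, which is an assumption about the grouping convention) contains at most n=2 non-zero values. The helper name satisfies_n_m_sparsity is illustrative and not part of Paddle's API.

import numpy as np


def satisfies_n_m_sparsity(mat, n=2, m=4):
    # True if every group of m consecutive row elements has at most n non-zeros.
    rows, cols = mat.shape
    assert cols % m == 0, "columns must be divisible by the group size m"
    groups = mat.reshape(rows, cols // m, m)
    return bool(np.all(np.count_nonzero(groups, axis=-1) <= n))


if __name__ == "__main__":
    dense = np.random.randn(8, 16)
    # Naive 2:4 prune: zero the 2 smallest-magnitude entries in each group of 4.
    pruned = dense.copy().reshape(8, 4, 4)
    smallest = np.argsort(np.abs(pruned), axis=-1)[..., :2]
    np.put_along_axis(pruned, smallest, 0.0, axis=-1)
    pruned = pruned.reshape(8, 16)
    print(satisfies_n_m_sparsity(dense))   # almost surely False for dense Gaussian noise
    print(satisfies_n_m_sparsity(pruned))  # True

paddle.incubate.asp.prune_model applies masks of this shape to the supported weights, and the optimizer returned by paddle.incubate.asp.decorate is expected to re-apply those masks after each step, which is why the assertions above still hold after a training step.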
+class TestFleetWithASPAMPStatic(unittest.TestCase): def setUp(self): os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 7c747338593a3..346939fb5ce28 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -29,4 +29,6 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_dist_pnorm MODULES test_dist_pnorm ENVS ${dist_ENVS}) py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS}) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) + py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) + py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index b039bb76dcb03..66addd1be085b 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -127,8 +127,7 @@ def train(): engine.prepare(optimizer, loss) engine.fit(dataset, batch_size=batch_size, - steps_per_epoch=batch_num * batch_size, - sample_generator=True) + steps_per_epoch=batch_num * batch_size) eval_dataset = MyDataset(batch_size) engine.prepare(optimizer, loss, mode='eval') diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py index 9a9efe7ab2dd0..cc0acae2fb1c1 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py @@ -23,6 +23,9 @@ from paddle.incubate.autograd import Hessian from paddle.distributed.auto_parallel.engine import Engine +np.random.seed(1234) +paddle.seed(1234) + class FCNet: def __init__(self, num_ins, num_outs, num_layers, hidden_size): @@ -114,6 +117,7 @@ def loss_func(eq_loss, bc_u, bc_value): def main(): + paddle.enable_static() # dataset train_dataset = LaplaceDataset(10) # optimizer @@ -136,10 +140,8 @@ def main(): inputs_spec=inputs_spec, labels_spec=labels_spec, strategy=dist_strategy) - paddle.seed(1234 + engine._cur_rank) engine.prepare(optimizer=optimizer, loss=loss_func) - res = engine.fit(train_dataset, sample_generator=False) - assert np.allclose(res[-1], 2.840593) + res = engine.fit(train_dataset, batch_size=None) dist_context = engine.dist_context block = engine.main_program.global_block() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py new file mode 100644 index 0000000000000..898408becacdf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py @@ -0,0 +1,158 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import json + +import paddle +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost import CommContext +from paddle.distributed.auto_parallel.cost import build_comm_desc +from paddle.distributed.auto_parallel.cost import AllreduceSumOpCost +from paddle.distributed.auto_parallel.cost import AllgatherOpCost +from paddle.distributed.auto_parallel.cost import BroadcastOpCost +from paddle.distributed.auto_parallel.cost import SendOpCost +from paddle.distributed.auto_parallel.cost import RecvOpCost +from paddle.distributed.auto_parallel.cost import IdentityOpCost + +from test_cluster import cluster_json, multi_cluster_json + + +class TestCommOpCost(unittest.TestCase): + def test_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) + + # Check AllreduceSumCost 128MB ring cost + allreduce_sum_op_desc = build_comm_desc( + "c_allreduce_sum", [0, 1, 2, 3, 4, 5, 6, 7], paddle.float32, + [1, 32 * (10**6)]) + allreduce_sum_op_cost = AllreduceSumOpCost( + op_desc=allreduce_sum_op_desc, comm_context=comm_context) + + # Check AllgatherOpCost cost + allgather_op_desc = build_comm_desc("c_allgather", + [0, 1, 2, 3, 4, 5, 6, 7], + paddle.float32, [1, 32 * (10**6)]) + allgather_op_cost = AllgatherOpCost( + op_desc=allgather_op_desc, comm_context=comm_context) + self.assertTrue(allgather_op_cost.time > 0) + + # Check BroadcastOpCost cost + broadcast_op_desc = build_comm_desc("c_broadcast", + [0, 1, 2, 3, 4, 5, 6, 7], + paddle.float32, [1, 32 * (10**6)]) + broadcast_op_cost = BroadcastOpCost( + op_desc=broadcast_op_desc, comm_context=comm_context) + self.assertTrue(broadcast_op_cost.time > 0) + + # Check SendOpCost cost + send_op_desc = build_comm_desc("send_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + send_op_cost = SendOpCost( + op_desc=send_op_desc, comm_context=comm_context) + self.assertTrue(send_op_cost.time > 0) + + # Check RecvOpCost cost + recv_op_desc = build_comm_desc("recv_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + recv_op_cost = RecvOpCost( + op_desc=recv_op_desc, comm_context=comm_context) + self.assertTrue(recv_op_cost.time > 0) + + # Check IdentityOpCost cost + identity_op_desc = build_comm_desc("c_identity", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + identity_op_cost = IdentityOpCost( + op_desc=identity_op_desc, comm_context=comm_context) + self.assertTrue(identity_op_cost.time >= 0) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + def test_cross_machine_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = 
os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(multi_cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) + + # Check AllreduceSumCost 128MB ring cost + allreduce_sum_op_desc = build_comm_desc( + "c_allreduce_sum", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + allreduce_sum_op_cost = AllreduceSumOpCost( + op_desc=allreduce_sum_op_desc, comm_context=comm_context) + + # Check AllgatherOpCost cost + allgather_op_desc = build_comm_desc( + "c_allgather", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + allgather_op_cost = AllgatherOpCost( + op_desc=allgather_op_desc, comm_context=comm_context) + self.assertTrue(allgather_op_cost.time > 0) + + # Check BroadcastOpCost cost + broadcast_op_desc = build_comm_desc( + "c_broadcast", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + broadcast_op_cost = BroadcastOpCost( + op_desc=broadcast_op_desc, comm_context=comm_context) + self.assertTrue(broadcast_op_cost.time > 0) + + # Check SendOpCost cost + send_op_desc = build_comm_desc("send_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + send_op_cost = SendOpCost( + op_desc=send_op_desc, comm_context=comm_context) + self.assertTrue(send_op_cost.time > 0) + + # Check RecvOpCost cost + recv_op_desc = build_comm_desc("recv_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + recv_op_cost = RecvOpCost( + op_desc=recv_op_desc, comm_context=comm_context) + self.assertTrue(recv_op_cost.time > 0) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py new file mode 100644 index 0000000000000..4cdd51e42adf0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py @@ -0,0 +1,253 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
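Both comm-cost tests above, as well as the updated test_new_cost_model.py further down, reset CommContext._has_instance and CommContext._instance before constructing a context, which indicates that CommContext caches a single instance and each test needs it rebuilt from its own cluster description. The snippet below is a generic, stand-alone illustration of why that reset matters; the Singleton class is a stand-in and does not reflect Paddle's actual implementation.

class Singleton:
    _instance = None

    def __new__(cls, payload):
        # Cache the first instance; later calls return it and ignore the payload.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.payload = payload
        return cls._instance


first = Singleton("single-machine cluster")
second = Singleton("multi-machine cluster")
assert second.payload == "single-machine cluster"   # stale: the cached instance wins

Singleton._instance = None                           # the kind of reset the tests perform
third = Singleton("multi-machine cluster")
assert third.payload == "multi-machine cluster"      # fresh instance, fresh cluster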
+ +import unittest +import os +import json + +import paddle +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost.comp_op_cost import AssignOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import AssignValueOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import BeamSearchOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import BeamSearchDecodeOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import CastOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ConcatOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ElementwiseAddOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ElementwiseAddGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ElementwiseDivOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ElementwiseDivGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ElementwiseMulOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ElementwiseMulGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ElementwiseSubOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import EmbeddingOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import EmbeddingGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import FillConstantOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import FillConstantBatchSizeLikeOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import FillConstantBatchSizeLikeGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import GatherOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import GeluOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import GeluGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import GreaterEqualOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import IncrementOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import IsEmptyOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LayerNormOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LayerNormGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LessThanOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LogicalNotOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LogicalAndOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LodResetOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LogOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LookupTableV2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import LookupTableV2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulV2OpCost + +from test_cluster import cluster_json + + +class TestCompOpCost(unittest.TestCase): + def test_comp_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + 
cluster.build_from_file(cluster_json_path) + + op_cost = AssignOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = AssignValueOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = BeamSearchOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = BeamSearchDecodeOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = CastOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ConcatOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ElementwiseAddOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ElementwiseAddGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ElementwiseDivOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ElementwiseDivGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ElementwiseMulOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ElementwiseMulGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ElementwiseSubOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = EmbeddingOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = EmbeddingGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = FillConstantOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = FillConstantBatchSizeLikeOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = FillConstantBatchSizeLikeGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = GatherOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = GeluOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = GeluGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = GreaterEqualOpCost(cluster=cluster) 
+ self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = IncrementOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = IsEmptyOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LayerNormOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LayerNormGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LessThanOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LogicalNotOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LogicalAndOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LodResetOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LogOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LookupTableV2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = LookupTableV2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MatmulOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MatmulV2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py index 0914126feb852..aa0bf719fab29 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -79,7 +79,6 @@ def parallelizer(program_func, rank): class TestDistSlice(unittest.TestCase): def test_dist_slice_dp2(self): - for rank in range(2): dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) ops = dist_main_prog.global_block().ops diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py index 0cd3041ea4d25..c0df01ada58f9 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py @@ -13,12 +13,17 @@ # limitations under the License. 
import unittest +import os +import json import paddle import paddle.distributed.auto_parallel.cost as cost_model from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str -from paddle.distributed.auto_parallel.cost.base_cost import calc_time_from_model +from paddle.distributed.auto_parallel.cost.base_cost import calc_time_by_modeling +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost import CommContext +from test_cluster import cluster_json, multi_cluster_json paddle.enable_static() @@ -45,26 +50,44 @@ def test_comp_cost(self): if op.type == "matmul_v2": matmul_v2_op = op break - matmul_v2_cost = cost_model.OP_COST_FACTORY["matmul_v2"]( + matmul_v2_cost = cost_model._g_op_cost_factory["matmul_v2"]( op=matmul_v2_op) desc = parse_to_desc(op=matmul_v2_op) desc_str = parse_desc_to_str(desc) self.assertIsNotNone(desc_str) self.assertTrue(check_cost(matmul_v2_cost.cost)) - time = calc_time_from_model(op=matmul_v2_op) + time = calc_time_by_modeling(op=matmul_v2_op) self.assertEqual(time, matmul_v2_cost.cost.time) tensor_cost = cost_model.TensorCost(tensor=x) # check memory self.assertEqual(tensor_cost.cost.memory, 1600) def test_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) desc = {} desc["op"] = "c_allreduce_sum" - desc["inputs"] = {"X": [([100, 200], paddle.float32)]} - allreduce_cost = cost_model.OP_COST_FACTORY["c_allreduce_sum"]( - op_desc=desc) + desc["inputs"] = {"X": [(paddle.float32, [100, 200])]} + desc["group_ranks"] = [0, 1] + allreduce_cost = cost_model._g_op_cost_factory["c_allreduce_sum"]( + op_desc=desc, comm_context=CommContext(cluster)) self.assertTrue(check_cost(allreduce_cost.cost)) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + def test_cost_estimator(self): train_program = paddle.static.Program() cost_estimator = cost_model.CostEstimator(train_program) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py new file mode 100644 index 0000000000000..f9ab6f37f3ce7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py @@ -0,0 +1,106 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import paddle.distributed.auto_parallel as auto + +from paddle.fluid import program_guard +from paddle.incubate.autograd import prim2orig, enable_prim, prim_enabled +from paddle.fluid.layer_helper import LayerHelper +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.utils import set_var_dist_attr +from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context, set_default_distributed_context + +paddle.enable_static() +enable_prim() +nranks = 2 +rank = 0 + + +class TestPrimDistOp(unittest.TestCase): + def setUp(self): + self.main_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + self.layer_help = LayerHelper('TestPrimDistOp') + + with paddle.static.program_guard(self.main_program, + self.startup_program): + self.init_prog() + + def init_prog(self): + # block = self.main_program.global_block() + # block = self.main_program.global_block() + self.w = self.layer_help.create_parameter( + dtype="float", shape=[20], attr=None) + self.w_grad = paddle.static.data( + name='w_grad', shape=[20], dtype='float') + self.tmp1 = paddle.static.data(name='tmp1', shape=[20], dtype='float') + self.tmp2 = paddle.static.data(name='tmp2', shape=[20], dtype='float') + self.batch_reduced = paddle.static.data( + name='batch_reduced', shape=[1], dtype='float') + self.attrs = {} + + default_dist_context = get_default_distributed_context() + _global_process_mesh = auto.ProcessMesh(list(range(nranks))) + tensor_dist_attr = set_var_dist_attr( + default_dist_context, + self.tmp1, [-1], + _global_process_mesh, + mark_annotated=True) + tensor_dist_attr = set_var_dist_attr( + default_dist_context, + self.tmp1, [-1], + _global_process_mesh, + mark_annotated=True) + + op = self.layer_help.append_op( + type="add_p", + inputs={'X': self.tmp1, + 'Y': self.w}, + outputs={'Z': self.w_grad}, + attrs=self.attrs) + + op = self.layer_help.append_op( + type="reduce_p", + inputs={'X': self.tmp2}, + outputs={'Y': self.batch_reduced}, + attrs={"axis": [0]}) + + def test_loss_and_grad_allreduce(self): + + dist_context = DistributedContext(self.main_program, + self.startup_program) + completer = Completer(dist_context) + completer.complete_prim_annotation(self.main_program) + dist_context.block_state.parse_forward_blocks(self.main_program) + dist_context.block_state.parse_backward_blocks(self.main_program) + dist_context.grads_params = dict() + dist_context.grads_params[self.w_grad.name] = self.w.name + dist_context.synced_gradient = set() + dist_context.data_parallel_group = list(range(nranks)) + partitioner = Partitioner(dist_context, rank) + dist_main_prog, dist_startup_prog, _ = partitioner.partition( + self.main_program, self.startup_program, [(self.w, self.w_grad)]) + ops = dist_main_prog.global_block().ops + + self.assertTrue(ops[1].type == "c_allreduce_sum") + self.assertTrue(ops[3].type == "c_allreduce_sum") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py index 8782f01ea5ff3..bc1d0a70182b4 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py 
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py @@ -53,6 +53,10 @@ def test_relaunch_with_planner(self): "auto_parallel_rank_mapping.json") if os.path.exists(rank_mapping_json_path): os.remove(rank_mapping_json_path) + files_path = [path for path in os.listdir('.') if '.pkl' in path] + for path in files_path: + if os.path.exists(path): + os.remove(path) log_path = os.path.join(file_dir, "log") if os.path.exists(log_path): shutil.rmtree(log_path) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 46af5509d244b..37216241b8f08 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -8,3 +8,4 @@ endforeach(TEST_OP) set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 160) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) +set_tests_properties(test_gradients_and_minimize PROPERTIES TIMEOUT 60) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py b/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py new file mode 100644 index 0000000000000..092ddb4094d03 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py @@ -0,0 +1,143 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
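The new autograd test test_gradients_and_minimize.py, whose body follows, hard-codes two expected values: 48. for the third derivative of x**4 at x=2, and -0.27263762711 for the fourth derivative of sqrt(x**5 + x**4) at x=2 (the in-code comment gives the closed form 3*(-5*x^2-16*x-16)/(16*(x+1)^3.5)). Those constants can be reproduced independently with a quick symbolic check; the snippet below assumes SymPy is available and is purely illustrative, not part of the test suite.

import sympy as sp

x = sp.symbols('x')

# d^3/dx^3 of x**4 is 24*x, so the expected third-order gradient at x=2 is 48.
third = sp.diff(x**4, x, 3)
print(third, third.subs(x, 2))       # 24*x  48

# Fourth derivative of sqrt(x**5 + x**4), evaluated at x=2.
# Algebraically equal to 3*(-5*x**2 - 16*x - 16) / (16*(x + 1)**(7/2)) for x > 0.
fourth = sp.diff(sp.sqrt(x**5 + x**4), x, 4)
print(float(fourth.subs(x, 2)))      # approximately -0.27263762711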
+ +import unittest +import numpy as np + +import paddle +from paddle.incubate.autograd.primx import prim2orig +from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled + +paddle.enable_static() + + +class TestGradients(unittest.TestCase): + def test_third_order(self): + enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + + grad1, = paddle.static.gradients([x4], [x]) + grad2, = paddle.static.gradients([grad1], [x]) + grad3, = paddle.static.gradients([grad2], [x]) + + prim2orig(main.block(0)) + + feed = {x.name: np.array([2.]).astype('float32')} + fetch_list = [grad3.name] + result = [np.array([48.])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.allclose(outs, result) + disable_prim() + + def test_fourth_order(self): + enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + x5 = paddle.multiply(x4, x) + out = paddle.sqrt(x5 + x4) + + grad1, = paddle.static.gradients([out], [x]) + grad2, = paddle.static.gradients([grad1], [x]) + grad3, = paddle.static.gradients([grad2], [x]) + grad4, = paddle.static.gradients([grad3], [x]) + + prim2orig(main.block(0)) + + feed = {x.name: np.array([2.]).astype('float32'), } + fetch_list = [grad4.name] + # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) + result = [np.array([-0.27263762711])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.allclose(outs, result) + disable_prim() + + +class TestMinimize(unittest.TestCase): + def model(self, x, w, bias, opt): + paddle.seed(0) + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + input_x = paddle.static.data('x', x.shape, dtype=x.dtype) + input_x.stop_gradient = False + params_w = paddle.static.create_parameter( + shape=w.shape, dtype=w.dtype, is_bias=False) + params_bias = paddle.static.create_parameter( + shape=bias.shape, dtype=bias.dtype, is_bias=True) + y = paddle.tanh(paddle.matmul(input_x, params_w) + params_bias) + loss = paddle.norm(y, p=2) + opt = opt + _, grads = opt.minimize(loss) + if prim_enabled(): + prim2orig(main.block(0)) + exe.run(startup) + grads = exe.run(main, + feed={'x': x, + 'w': w, + 'bias': bias}, + fetch_list=grads) + return grads + + def test_adam(self): + x = np.random.rand(2, 20) + w = np.random.rand(20, 2) + bias = np.random.rand(2) + enable_prim() + prim_grads = self.model(x, w, bias, paddle.optimizer.Adam(0.01)) + disable_prim() + orig_grads = self.model(x, w, bias, paddle.optimizer.Adam(0.01)) + for orig, prim in zip(orig_grads, prim_grads): + np.testing.assert_allclose(orig, prim) + + def test_sgd(self): + x = np.random.rand(2, 20) + w = np.random.rand(20, 
2) + bias = np.random.rand(2) + enable_prim() + prim_grads = self.model(x, w, bias, paddle.optimizer.SGD(0.01)) + disable_prim() + orig_grads = self.model(x, w, bias, paddle.optimizer.SGD(0.01)) + for orig, prim in zip(orig_grads, prim_grads): + np.testing.assert_allclose(orig, prim) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py new file mode 100644 index 0000000000000..d6ff931a936a2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py @@ -0,0 +1,696 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layers.utils import flatten +from paddle.incubate.autograd.primrules import _orig2prim, _prim2orig, _jvp, _transpose + +paddle.enable_static() + + +############################ Test linearize rules ############################ +class TestAddPJVPAndTranspose(unittest.TestCase): + def setUp(self): + self.main_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + self.layer_help = LayerHelper('TestPrim2Orig') + + with paddle.static.program_guard(self.main_program, + self.startup_program): + self.init_data() + + def init_data(self): + # Set prim op + self.op_type = 'add_p' + X = paddle.static.data(name='X', shape=[2, 2], dtype='float') + Y = paddle.static.data(name='Y', shape=[2, 2], dtype='float') + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[2, 2], dtype='float') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[2, 2], dtype='float') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + # Set transpose + check_dot = lambda v: True + Z_BAR = paddle.static.data(name='Z_BAR', shape=[2, 2], dtype='float') + self.transpose_args = (check_dot, Z_BAR) + self.transpose_out_shape_map = {0: X, 1: Y} + + self.all_ops = [ + # prim op: + 'add_p', + # jvp op: + 'add_p', + # transpose op: + ] + + def test_op(self): + with paddle.static.program_guard(self.main_program, + self.startup_program): + op = self.layer_help.append_op( + type=self.op_type, + inputs=self.prim_input, + outputs=self.prim_output, + attrs=self.prim_attrs) + + jvp_out = _jvp(op, *self.jvp_args) + jvp_out = flatten(jvp_out) + for k, v in self.jvp_out_shape_map.items(): + self.assertEqual(jvp_out[k].shape, v.shape) + + # Some prim ops dont have transpose rule + if hasattr(self, 'transpose_args'): + transpose_out = _transpose(op, *self.transpose_args) + transpose_out = flatten(transpose_out) + for k, v in self.transpose_out_shape_map.items(): + self.assertEqual(transpose_out[k].shape, v.shape) + + all_ops = [op.type for op in 
self.main_program.block(0).ops] + self.assertEqual(sorted(all_ops), sorted(self.all_ops)) + + +class TestSubPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'sub_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + Y = paddle.static.data(name='Y', shape=[5, 6], dtype='int64') + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + # Set transpose + check_dot = lambda v: True + Z_BAR = paddle.static.data(name='Z_BAR', shape=[5, 6], dtype='int64') + self.transpose_args = (check_dot, Z_BAR) + self.transpose_out_shape_map = {0: X, 1: Y} + + self.all_ops = [ + # prim op: + 'sub_p', + # jvp op: + 'sub_p', + # transpose op: + 'fill_constant_p', + 'sub_p' + ] + + +class TestMulPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'mul_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + Y = paddle.static.data(name='Y', shape=[5, 6], dtype='int64') + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + # Set transpose + check_dot = lambda v: v is X + Z_BAR = paddle.static.data(name='Z_BAR', shape=[5, 6], dtype='int64') + self.transpose_args = (check_dot, Z_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'mul_p', + # jvp op: + 'mul_p', + 'mul_p', + 'add_p', + # transpose op: + 'mul_p' + ] + + +class TestDivPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'div_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + Y = paddle.static.data(name='Y', shape=[5, 6], dtype='int64') + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + # Set transpose + check_dot = lambda v: v is X + Z_BAR = paddle.static.data(name='Z_BAR', shape=[5, 6], dtype='int64') + self.transpose_args = (check_dot, Z_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'div_p', + # jvp op: + 'div_p', + 'div_p', + 'mul_p', + 'mul_p', + 'sub_p', + # transpose op: + 'div_p' + ] + + +class TestSqrtPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'sqrt_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + self.prim_input = {'X': X, } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, ) + 
self.jvp_out_shape_map = {0: self.prim_output['Y']} + + self.all_ops = [ + # prim op: + 'sqrt_p', + # jvp op: + 'div_p', + 'mul_p', + 'fill_constant_p', + # 'sqrt_p', + # transpose op: + ] + + +class TestTanhPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'tanh_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + self.prim_input = {'X': X, } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + self.all_ops = [ + # prim op: + 'tanh_p', + # jvp op: + 'mul_p', + 'sub_p', + 'fill_constant_p', + 'mul_p', + # transpose op: + ] + + +class TestReshapePJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'reshape_p' + X = paddle.static.data(name='X', shape=[8, 8], dtype='int64') + self.prim_input = {'X': X, } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'shape': [2, 32]} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[8, 8], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: v is X + Y_BAR = paddle.static.data(name='Y_BAR', shape=[2, 32], dtype='int64') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'reshape_p', + # jvp op: + 'reshape_p', + # transpose op: + 'reshape_p', + ] + + +class TestBroadcastPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'broadcast_p' + X = paddle.static.data(name='X', shape=[10, 1], dtype='int64') + self.prim_input = {'X': X, } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'shape': [2, 10, 7]} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[10, 7], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: v is X + Y_BAR = paddle.static.data( + name='Y_BAR', shape=[2, 10, 7], dtype='int64') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'broadcast_p', + # jvp op: + 'broadcast_p', + # transpose op: + 'reduce_p', + 'reshape_p' + ] + + +class TestTransposePJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'transpose_p' + X = paddle.static.data(name='X', shape=[2, 3, 4, 5], dtype='int64') + self.prim_input = {'X': X, } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'axis': [0, 2, 3, 1]} + + # Set JVP + X_DOT = paddle.static.data( + name='X_DOT', shape=[2, 3, 4, 5], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: v is X + Y_BAR = paddle.static.data( + name='Y_BAR', shape=[2, 4, 5, 3], dtype='int64') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'transpose_p', + # jvp op: + 'transpose_p', + # transpose op: + 'transpose_p', + ] + + +class TestSplitPJVPAndTranspose(TestAddPJVPAndTranspose): + def 
init_data(self): + # Set prim op + self.op_type = 'split_p' + X = paddle.static.data(name='X', shape=[2, 7, 10], dtype='int64') + self.prim_input = {'X': X, } + self.prim_output = { + 'YS': [ + self.layer_help.create_variable_for_type_inference( + dtype=X.dtype) for i in range(4) + ] + } + self.prim_attrs = {'num_or_sections': [2, 3, 4, 1], 'axis': 2} + + # Set JVP + X_DOT = paddle.static.data( + name='X_DOT', shape=[2, 7, 10], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = { + 0: self.prim_output['YS'][0], + 1: self.prim_output['YS'][1], + 2: self.prim_output['YS'][2], + 3: self.prim_output['YS'][3], + } + + # Set transpose + check_dot = lambda v: v is X + YS_BAR = [ + paddle.static.data( + name='Y_BAR1', shape=[2, 7, 2], dtype='int64'), + paddle.static.data( + name='Y_BAR2', shape=[2, 7, 3], dtype='int64'), + paddle.static.data( + name='Y_BAR3', shape=[2, 7, 4], dtype='int64'), + paddle.static.data( + name='Y_BAR4', shape=[2, 7, 1], dtype='int64'), + ] + self.transpose_args = (check_dot, YS_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'split_p', + # jvp op: + 'split_p', + # transpose op: + 'concat_p', + ] + + +class TestConcatPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'concat_p' + X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64') + Y = paddle.static.data(name='Y', shape=[3, 2, 5], dtype='float64') + Z = paddle.static.data(name='Z', shape=[3, 3, 5], dtype='float64') + self.prim_input = {'XS': [X, Y, Z], } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'axis': 1} + + # Set JVP + XS_DOT = [ + paddle.static.data( + name='X_DOT1', shape=[3, 9, 5], dtype='float64'), + paddle.static.data( + name='X_DOT2', shape=[3, 2, 5], dtype='float64'), + paddle.static.data( + name='X_DOT3', shape=[3, 3, 5], dtype='float64'), + ] + self.jvp_args = (XS_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: v is X or v is Y or v is Z + Y_BAR = paddle.static.data( + name='Y_BAR', shape=[3, 14, 5], dtype='float64') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = { + 0: X, + 1: Y, + 2: Z, + } + + self.all_ops = [ + # prim op: + 'concat_p', + # jvp op: + 'concat_p', + # transpose op: + 'split_p', + ] + + +class TestReducePJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'reduce_p' + X = paddle.static.data(name='X', shape=[2, 3, 4, 5], dtype='float64') + self.prim_input = {'X': X} + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'axis': [2], 'keepdim': False} + + # Set JVP + X_DOT = paddle.static.data( + name='X_DOT1', shape=[2, 3, 4, 5], dtype='float64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: v is X + Y_BAR = paddle.static.data( + name='Y_BAR', shape=[2, 3, 5], dtype='float64') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'reduce_p', + # jvp op: + 'reduce_p', + # transpose op: + 'reshape_p', + 'broadcast_p', + ] + + +class TestMatmulPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'matmul_p' + X = paddle.static.data(name='X', shape=[2, 3], dtype='float64') + Y = paddle.static.data(name='Y', shape=[3, 
4], dtype='float64') + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[2, 3], dtype='float64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[3, 4], dtype='float64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + # Set transpose + check_dot = lambda v: v is X + Z_BAR = paddle.static.data(name='Z_BAR', shape=[2, 4], dtype='float64') + self.transpose_args = (check_dot, Z_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'matmul_p', + # jvp op: + 'matmul_p', + 'matmul_p', + 'add_p', + # transpose op: + 'matmul_p', + 'transpose_p', + ] + + +class TestSliceSelectPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'slice_select_p' + X = paddle.static.data(name='X', shape=[3, 20], dtype='float64') + self.prim_input = {'X': X, } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = { + 'axis': [1], + 'starts': [0], + 'ends': [20], + 'strides': [2] + } + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[3, 20], dtype='float64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: v is X + Y_BAR = paddle.static.data(name='Y_BAR', shape=[3, 10], dtype='float64') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'slice_select_p', + # jvp op: + 'slice_select_p', + # transpose op: + 'slice_assign_p', + 'fill_constant_p', + ] + + +class TestSliceAssignPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'slice_assign_p' + X = paddle.static.data(name='X', shape=[3, 20], dtype='float64') + Y = paddle.static.data(name='Y', shape=[3, 5], dtype='float64') + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = { + 'axis': [1], + 'starts': [0], + 'ends': [10], + 'strides': [2] + } + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[3, 20], dtype='float64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[3, 5], dtype='float64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + # Set transpose + check_dot = lambda v: v is X or v is Y + Z_BAR = paddle.static.data(name='Z_BAR', shape=[3, 20], dtype='float64') + self.transpose_args = (check_dot, Z_BAR) + self.transpose_out_shape_map = {0: X, 1: Y} + + self.all_ops = [ + # prim op: + 'slice_assign_p', + # jvp op: + 'slice_assign_p', + # transpose op: + 'slice_assign_p', + 'slice_select_p', + 'fill_constant_p' + ] + + +class TestGatherPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'gather_p' + X = paddle.static.data(name='X', shape=[9, 5], dtype='float64') + IndexTensor = paddle.static.data( + name='IndexTensor', shape=[3], dtype='int32') + self.prim_input = {'X': X, 'IndexTensor': IndexTensor} + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'axis': 1} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[9, 5], dtype='float64') + self.jvp_args = ( + X_DOT, + IndexTensor, ) + self.jvp_out_shape_map = {0: 
self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: v is X + Y_BAR = paddle.static.data(name='Y_BAR', shape=[9, 3], dtype='float64') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = {0: X, } + + self.all_ops = [ + # prim op: + 'gather_p', + # jvp op: + 'gather_p', + # transpose op: + 'scatter_add_p', + 'fill_constant_p', + ] + + +class TestScatterAddPJVPAndTranspose(TestAddPJVPAndTranspose): + def init_data(self): + # Set prim op + self.op_type = 'scatter_add_p' + X = paddle.static.data(name='X', shape=[9, 5], dtype='float64') + Y = paddle.static.data(name='Y', shape=[9, 3], dtype='float64') + IndexTensor = paddle.static.data( + name='IndexTensor', shape=[3], dtype='int32') + self.prim_input = {'X': X, 'Y': Y, 'IndexTensor': IndexTensor} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'axis': 1} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[9, 5], dtype='float64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[9, 3], dtype='float64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + # Set transpose + check_dot = lambda v: v is X or v is Y + Z_BAR = paddle.static.data(name='Z_BAR', shape=[9, 5], dtype='float64') + self.transpose_args = (check_dot, Z_BAR) + self.transpose_out_shape_map = {0: X, 1: Y} + + self.all_ops = [ + # prim op: + 'scatter_add_p', + # jvp op: + 'scatter_add_p', + # transpose op: + 'scatter_add_p', + 'fill_constant_p', + 'gather_p' + ] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py new file mode 100644 index 0000000000000..24c8febccf5c0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py @@ -0,0 +1,360 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
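
The new test_orig2prim.py below checks the lowering rules that map original Paddle operators onto primitive ops. Each case appends a single original op to a static block through LayerHelper, calls _orig2prim on it, and then compares both the op types left in the block and the shapes of the returned primitive outputs. A condensed sketch of that pattern, kept to the entry points the file itself imports (shapes, dtypes and names are illustrative, not taken from the tests):

import paddle
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.layers.utils import flatten
from paddle.incubate.autograd.primrules import _orig2prim

paddle.enable_static()

main, startup = paddle.static.Program(), paddle.static.Program()
helper = LayerHelper('Orig2PrimSketch')

with paddle.static.program_guard(main, startup):
    X = paddle.static.data(name='X', shape=[2, 2], dtype='float32')
    Y = paddle.static.data(name='Y', shape=[2, 2], dtype='float32')
    Out = helper.create_variable_for_type_inference(dtype=X.dtype)
    # Append the original operator, then lower it to its primitive counterpart.
    op = helper.append_op(type='elementwise_add',
                          inputs={'X': X, 'Y': Y},
                          outputs={'Out': Out},
                          attrs={})
    prim_out = flatten(_orig2prim(op, X, Y))
    # The block now holds the original op plus the lowered 'add_p'.
    assert sorted(o.type for o in main.block(0).ops) == ['add_p', 'elementwise_add']
    assert prim_out[0].shape == Out.shape
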
+ +import unittest + +import paddle +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layers.utils import flatten +from paddle.incubate.autograd.primrules import _orig2prim, _prim2orig, _jvp, _transpose + +paddle.enable_static() + + +############################ Test orig2prim rules ############################ +class TestElementWiseAddOrig2Prim(unittest.TestCase): + def setUp(self): + self.main_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + self.layer_help = LayerHelper('TestOrig2Prim') + + with paddle.static.program_guard(self.main_program, + self.startup_program): + self.init_data() + + def init_data(self): + self.op_type = 'elementwise_add' + X = paddle.static.data(name='X', shape=[2, 2], dtype='float') + Y = paddle.static.data(name='Y', shape=[2, 2], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, Y) + self.all_ops = ['elementwise_add', 'add_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + def test_op(self): + with paddle.static.program_guard(self.main_program, + self.startup_program): + op = self.layer_help.append_op( + type=self.op_type, + inputs=self.input, + outputs=self.output, + attrs=self.attrs) + + prim_out = _orig2prim(op, *self.orig2prim_args) + all_ops = [op.type for op in self.main_program.block(0).ops] + + self.assertEqual(sorted(all_ops), sorted(self.all_ops)) + prim_out = flatten(prim_out) + for k, v in self.out_map.items(): + self.assertEqual(prim_out[k].shape, v.shape) + + +class TestSqrtOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'sqrt' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + + self.input = {'X': X, } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, ) + self.all_ops = ['sqrt', 'sqrt_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestElementWiseMulOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'elementwise_mul' + X = paddle.static.data(name='X', shape=[8, 8], dtype='float') + Y = paddle.static.data(name='Y', shape=[8, 8], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, Y) + self.all_ops = ['elementwise_mul', 'mul_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestMatmulV2Orig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'matmul_v2' + X = paddle.static.data(name='X', shape=[3, 4], dtype='float') + Y = paddle.static.data(name='Y', shape=[4, 3], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'trans_x': True, 'trans_y': True} + + self.orig2prim_args = (X, Y) + self.all_ops = ['matmul_v2', 'transpose_p', 'transpose_p', 'matmul_p'] + self.out_map = {0: self.output['Out']} + + +class TestTanhOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'tanh' + X = paddle.static.data(name='X', shape=[3, 4], dtype='float') + + self.input = {'X': X, } + self.output = { + 'Out': + 
self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, ) + self.all_ops = ['tanh', 'tanh_p'] + self.out_map = {0: self.output['Out']} + + +class TestReshape2Orig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'reshape2' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + + self.input = {'X': X, } + self.output = { + 'Out': X, + 'XShape': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'shape': [6, 5]} + + self.orig2prim_args = ( + None, + None, + X, ) + self.all_ops = ['reshape2', 'reshape_p', 'fill_constant_p'] + # Do not checke XShape + self.out_map = {0: self.output['Out']} + + +class TestConcatOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'concat' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + Y = paddle.static.data(name='Y', shape=[3, 6], dtype='int64') + + self.input = {'X': [X, Y], } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': 0} + + self.orig2prim_args = ( + None, + (X, Y), ) + self.all_ops = ['concat', 'concat_p'] + self.out_map = {0: self.output['Out']} + + +class TestSliceOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'slice' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + + self.input = {'Input': X, } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = { + 'axes': [0], + 'starts': [1], + 'ends': [4], + } + + self.orig2prim_args = (None, None, X, None, None) + self.all_ops = ['slice', 'slice_select_p'] + self.out_map = {0: self.output['Out']} + + +class TestFillZerosLikeOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'fill_zeros_like' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + + self.input = {'X': X, } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, ) + self.all_ops = ['fill_zeros_like', 'fill_constant_p'] + self.out_map = {0: self.output['Out']} + + +class TestSumOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'sum' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + Y = paddle.static.data(name='Y', shape=[5, 6], dtype='int64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = ((X, Y), ) + self.all_ops = ['sum', 'add_p'] + self.out_map = {0: self.output['Out']} + + +class TestPNormOrig2Prim1(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'p_norm' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + + self.input = {'X': X, } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = { + 'porder': 1, + 'asvector': True, + } + + self.orig2prim_args = (X, ) + self.all_ops = ['p_norm', 'reshape_p', 'sqrt_p', 'reduce_p', 'mul_p'] + self.out_map = {0: self.output['Out']} + + +class TestPNormOrig2Prim2(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'p_norm' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + + self.input = {'X': X, } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = { + 
'porder': 2, + 'asvector': True, + } + + self.orig2prim_args = (X, ) + self.all_ops = ['p_norm', 'reshape_p', 'sqrt_p', 'reduce_p', 'mul_p'] + self.out_map = {0: self.output['Out']} + + +class TestIndexSelectOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'index_select' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + Index = paddle.static.data(name='Index', shape=[2], dtype='int32') + + self.input = {'X': X, 'Index': Index} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'dim': 0, } + + self.orig2prim_args = ( + Index, + X, ) + self.all_ops = ['index_select', 'gather_p'] + self.out_map = {0: self.output['Out']} + + +class TestElementwiseSubOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'elementwise_sub' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int32') + Y = paddle.static.data(name='Y', shape=[6], dtype='int32') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'dim': 0, } + + self.orig2prim_args = ( + X, + Y, ) + self.all_ops = ['elementwise_sub', 'broadcast_p', 'sub_p'] + self.out_map = {0: self.output['Out']} + + +class TestScaleOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'scale' + X = paddle.static.data(name='X', shape=[10, 7], dtype='int32') + + self.input = {'X': X, } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'scale': 2.0, 'bias': 1.0, 'bias_after_scale': True} + + self.orig2prim_args = ( + None, + X, ) + self.all_ops = [ + 'scale', 'fill_constant_p', 'fill_constant_p', 'mul_p', 'add_p' + ] + self.out_map = {0: self.output['Out']} + + +class TestAssignOrig2Prim(TestElementWiseAddOrig2Prim): + def init_data(self): + self.op_type = 'assign' + X = paddle.static.data(name='X', shape=[10, 7], dtype='int32') + + self.input = {'X': X, } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, ) + self.all_ops = ['assign', 'fill_constant_p', 'add_p'] + self.out_map = {0: self.output['Out']} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py new file mode 100644 index 0000000000000..15ab016fc543d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py @@ -0,0 +1,381 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
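
test_prim2orig.py below covers the opposite direction: each primitive op is appended to a block and _prim2orig rewrites it in terms of executable framework operators, so a program expressed in primitives can be handed back to the regular executor. The harness mirrors the one above, except that out_map is keyed by the primitive output variable and maps to the index of the corresponding original output. A minimal sketch of one such check (again, the concrete values are illustrative):

import paddle
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.layers.utils import flatten
from paddle.incubate.autograd.primrules import _prim2orig

paddle.enable_static()

main, startup = paddle.static.Program(), paddle.static.Program()
helper = LayerHelper('Prim2OrigSketch')

with paddle.static.program_guard(main, startup):
    X = paddle.static.data(name='X', shape=[2, 2], dtype='float32')
    Y = paddle.static.data(name='Y', shape=[2, 2], dtype='float32')
    Z = helper.create_variable_for_type_inference(dtype=X.dtype)
    # Append the primitive op, then lower it back to an original operator.
    op = helper.append_op(type='add_p', inputs={'X': X, 'Y': Y},
                          outputs={'Z': Z}, attrs={})
    orig_out = flatten(_prim2orig(op, X, Y))
    # 'add_p' is now accompanied by the 'elementwise_add' that implements it.
    assert sorted(o.type for o in main.block(0).ops) == ['add_p', 'elementwise_add']
    assert orig_out[0].shape == Z.shape
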
+ +import unittest + +import paddle +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layers.utils import flatten +from paddle.incubate.autograd.primrules import _orig2prim, _prim2orig, _jvp, _transpose + +paddle.enable_static() + + +############################ Test prim2orig rules ############################ +class TestAddPPrim2Orig(unittest.TestCase): + def setUp(self): + self.main_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + self.layer_help = LayerHelper('TestPrim2Orig') + + with paddle.static.program_guard(self.main_program, + self.startup_program): + self.init_data() + + def init_data(self): + self.op_type = 'add_p' + X = paddle.static.data(name='X', shape=[2, 2], dtype='float') + Y = paddle.static.data(name='Y', shape=[2, 2], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, Y) + self.all_ops = ['add_p', 'elementwise_add'] + # { prim_op_output_var: orign_op_out_index } + self.out_map = {self.output['Z']: 0} + + def test_op(self): + with paddle.static.program_guard(self.main_program, + self.startup_program): + op = self.layer_help.append_op( + type=self.op_type, + inputs=self.input, + outputs=self.output, + attrs=self.attrs) + + orig_out = _prim2orig(op, *self.prim2orig_args) + all_ops = [op.type for op in self.main_program.block(0).ops] + self.assertEqual(sorted(all_ops), sorted(self.all_ops)) + orig_out = flatten(orig_out) + for k, v in self.out_map.items(): + self.assertEqual(k.shape, orig_out[v].shape) + + +class TestSubPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'sub_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + Y = paddle.static.data(name='Y', shape=[7, 8], dtype='float64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, Y) + self.all_ops = ['sub_p', 'elementwise_sub'] + self.out_map = {self.output['Z']: 0} + + +class TestMulPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'mul_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + Y = paddle.static.data(name='Y', shape=[7, 8], dtype='float64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, Y) + self.all_ops = ['mul_p', 'elementwise_mul'] + self.out_map = {self.output['Z']: 0} + + +class TestDivPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'div_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + Y = paddle.static.data(name='Y', shape=[7, 8], dtype='float64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, Y) + self.all_ops = ['div_p', 'elementwise_div'] + self.out_map = {self.output['Z']: 0} + + +class TestSqrtPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'sqrt_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + + self.input = {'X': X, } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, ) + self.all_ops = ['sqrt_p', 'sqrt'] + self.out_map = {self.output['Y']: 0} + + +class 
TestTanhPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'tanh_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + + self.input = {'X': X, } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, ) + self.all_ops = ['tanh_p', 'tanh'] + self.out_map = {self.output['Y']: 0} + + +class TestReshapePPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'reshape_p' + X = paddle.static.data(name='X', shape=[2, 8], dtype='float64') + + self.input = {'X': X, } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'shape': [4, 4]} + + self.prim2orig_args = (X, ) + self.all_ops = ['reshape_p', 'reshape2'] + self.out_map = {self.output['Y']: 0} + + +class TestBroadcastPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'broadcast_p' + X = paddle.static.data(name='X', shape=[2, 8], dtype='float64') + + self.input = {'X': X, } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'shape': [10, 2, 8]} + + self.prim2orig_args = (X, ) + self.all_ops = ['broadcast_p', 'expand_v2'] + self.out_map = {self.output['Y']: 0} + + +class TestTransposePPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'transpose_p' + X = paddle.static.data(name='X', shape=[7, 8, 9, 10], dtype='float64') + + self.input = {'X': X, } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': [1, 2, 0, 3]} + + self.prim2orig_args = (X, ) + self.all_ops = ['transpose_p', 'transpose2'] + self.out_map = {self.output['Y']: 0} + + +class TestSplitPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'split_p' + X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64') + + self.input = {'X': X, } + self.output = { + 'YS': [ + self.layer_help.create_variable_for_type_inference( + dtype=X.dtype) for i in range(3) + ] + } + self.attrs = {'num_or_sections': [2, 3, 4], 'axis': 1} + + self.prim2orig_args = (X, ) + self.all_ops = ['split_p', 'split'] + self.out_map = { + self.output['YS'][0]: 0, + self.output['YS'][1]: 1, + self.output['YS'][2]: 2, + } + + +class TestConcatPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'concat_p' + X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64') + Y = paddle.static.data(name='Y', shape=[2, 9, 5], dtype='float64') + Z = paddle.static.data(name='Z', shape=[1, 9, 5], dtype='float64') + + self.input = {'XS': [X, Y, Z], } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': 0} + + self.prim2orig_args = ((X, Y, Z), ) + self.all_ops = ['concat_p', 'concat'] + self.out_map = {self.output['Y']: 0} + + +class TestReducePPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'reduce_p' + X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64') + + self.input = {'X': X} + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': [1], 'keepdim': True} + + self.prim2orig_args = (X, ) + self.all_ops = ['reduce_p', 'reduce_sum'] + self.out_map = {self.output['Y']: 0} + + +class TestMatmulPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'matmul_p' + X = paddle.static.data(name='X', shape=[9, 5], dtype='float64') + Y = 
paddle.static.data(name='Y', shape=[5, 9], dtype='float64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, Y) + self.all_ops = ['matmul_p', 'matmul_v2'] + self.out_map = {self.output['Z']: 0} + + +class TestSliceSelectPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'slice_select_p' + X = paddle.static.data(name='X', shape=[9, 5], dtype='float64') + + self.input = {'X': X, } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': [0], 'starts': [1], 'ends': [8], 'strides': [2]} + + self.prim2orig_args = (X, ) + self.all_ops = ['slice_select_p', 'strided_slice'] + self.out_map = {self.output['Y']: 0} + + +class TestSliceAssignPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'slice_assign_p' + X = paddle.static.data(name='X', shape=[9, 5], dtype='float64') + Y = paddle.static.data(name='Y', shape=[9, 3], dtype='float64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': [1], 'starts': [0], 'ends': [3], 'strides': [1]} + + self.prim2orig_args = (X, Y) + self.all_ops = ['slice_assign_p', 'assign', 'set_value'] + self.out_map = {self.output['Z']: 0} + + +class TestGatherPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'gather_p' + X = paddle.static.data(name='X', shape=[9, 5], dtype='float64') + IndexTensor = paddle.static.data( + name='IndexTensor', shape=[3], dtype='int32') + + self.input = {'X': X, 'IndexTensor': IndexTensor} + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': 0, } + + self.prim2orig_args = ( + IndexTensor, + X, ) + self.all_ops = ['gather_p', 'gather'] + self.out_map = {self.output['Y']: 0} + + +class TestScatterAddPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'scatter_add_p' + X = paddle.static.data(name='X', shape=[9, 5], dtype='float64') + Y = paddle.static.data(name='Y', shape=[3, 5], dtype='float64') + IndexTensor = paddle.static.data( + name='IndexTensor', shape=[3], dtype='int32') + + self.input = {'X': X, 'Y': Y, 'IndexTensor': IndexTensor} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': 0, } + + self.prim2orig_args = (IndexTensor, X, Y) + self.all_ops = [ + 'scatter_add_p', 'fill_any_like', 'scatter', 'elementwise_add' + ] + self.out_map = {self.output['Z']: 0} + + +class TestFillConstantPPrim2Orig(TestAddPPrim2Orig): + def init_data(self): + self.op_type = 'fill_constant_p' + + self.input = {} + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(paddle.int32) + } + self.attrs = {'value': 10, 'shape': [5, 5], 'dtype': paddle.int32} + + self.prim2orig_args = () + self.all_ops = ['fill_constant_p', 'fill_constant'] + self.out_map = {self.output['Y']: 0} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py similarity index 95% rename from python/paddle/fluid/tests/unittests/test_primops.py rename to python/paddle/fluid/tests/unittests/autograd/test_primops.py index cbf77c2666611..e6a8c4ec3fe4c 100644 --- a/python/paddle/fluid/tests/unittests/test_primops.py +++ 
b/python/paddle/fluid/tests/unittests/autograd/test_primops.py @@ -14,12 +14,13 @@ import unittest import numpy as np - import paddle -from paddle.autograd.primops import ( +from paddle.incubate.autograd.primops import ( neg, set_value, add, sub, mul, div, sqrt, tanh, reshape, broadcast, transpose, split, concat, reduce, matmul, slice_select, slice_assign, gather, scatter_add, fill_const) +from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig, _gradients +from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled class TestPyPrimOps(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_transform.py b/python/paddle/fluid/tests/unittests/autograd/test_transform.py new file mode 100644 index 0000000000000..a2b75f5d7bb1a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_transform.py @@ -0,0 +1,313 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from paddle.incubate.autograd.primx import Transform, orig2prim, prim2orig +from paddle.fluid.layers.utils import flatten + +paddle.enable_static() + + +class TestAutoGradTransformForAdd(unittest.TestCase): + def setUp(self): + self.main_program = paddle.static.Program() + self.startup_program = paddle.static.Program() + + with paddle.static.program_guard(self.main_program, + self.startup_program): + self.init_data() + + def init_data(self): + # { input_index: input_shape } + self.xs_shape_map = {0: (20, 40), 1: (20, 40)} + # { output_index: output_shape } + self.ys_shape_map = {0: (20, 40)} + X0 = paddle.static.data( + name='X0', shape=self.xs_shape_map[0], dtype='float32') + X0.stop_gradient = False + X1 = paddle.static.data( + name='X1', shape=self.xs_shape_map[1], dtype='float32') + X1.stop_gradient = False + + A = paddle.tanh(X0) + B = paddle.tanh(X1) + Y = paddle.add(A, B) + + self.orig_xs = [X0, X1] + self.orig_ys = [Y, ] + + self.orig_ops = ['tanh', 'tanh', 'elementwise_add'] + self.orig2prim_ops = ['tanh_p', 'tanh_p', 'add_p'] + self.linearize_ops = self.orig2prim_ops + [ + # call fill_const() in linearize() function + 'fill_constant_p', + 'fill_constant_p', + # linearized op + 'mul_p', + 'sub_p', + 'fill_constant_p', + 'mul_p', + 'mul_p', + 'sub_p', + 'fill_constant_p', + 'mul_p', + 'add_p', + ] + self.transpose_ops = self.orig2prim_ops + [ + # call fill_const() in transpose() function + 'fill_constant_p', + # linearized op after remove path + 'fill_constant_p', + 'fill_constant_p', + 'mul_p', + 'sub_p', + 'fill_constant_p', + 'mul_p', + 'sub_p', + 'fill_constant_p', + # transposed op + 'mul_p', + 'mul_p' + ] + self.prim2orig_ops = [ + 'tanh', 'tanh', 'elementwise_add', 'fill_constant', 'fill_constant', + 'fill_constant', 'elementwise_mul', 'elementwise_sub', + 'fill_constant', 'elementwise_mul', 'elementwise_sub', + 'fill_constant', 'elementwise_mul', 'elementwise_mul' + ] + + def test_run(self): + # Must using with 
program_guard(), otherwise prim ops will append other block + with paddle.static.program_guard(self.main_program, + self.startup_program): + ad = Transform(self.main_program.block(0)) + orig_ops = [op.type for op in self.main_program.block(0).ops] + self.assertEqual(sorted(orig_ops), sorted(self.orig_ops)) + + # Test orig2prim + orig2prim(block=self.main_program.block(0)) + orig2prim_ops = [op.type for op in self.main_program.block(0).ops] + self.assertEqual(sorted(orig2prim_ops), sorted(self.orig2prim_ops)) + + # Test linearize + xs_dot, ys_dot = ad.linearize(self.orig_xs, self.orig_ys) + linearize_ops = [op.type for op in self.main_program.block(0).ops] + self.assertEqual(sorted(linearize_ops), sorted(self.linearize_ops)) + flatten_xs_dot = flatten(xs_dot) + for k, v in self.xs_shape_map.items(): + self.assertEqual(flatten_xs_dot[k].shape, v) + flatten_ys_dot = flatten(ys_dot) + for k, v in self.ys_shape_map.items(): + self.assertEqual(flatten_ys_dot[k].shape, v) + + # Test transpose + ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, retain_fwd=False) + transpose_ops = [op.type for op in self.main_program.block(0).ops] + self.assertEqual(sorted(transpose_ops), sorted(self.transpose_ops)) + flatten_xs_bar = flatten(xs_bar) + for k, v in self.xs_shape_map.items(): + # There may be None in the result of transpose like gather op + if flatten_xs_bar[k] is not None: + self.assertEqual(flatten_xs_bar[k].shape, v) + flatten_ys_bar = flatten(ys_bar) + for k, v in self.ys_shape_map.items(): + self.assertEqual(flatten_ys_bar[k].shape, v) + + # Test prim2orig + prim2orig(block=self.main_program.block(0)) + prim2orig_ops = [op.type for op in self.main_program.block(0).ops] + self.assertEqual(sorted(prim2orig_ops), sorted(self.prim2orig_ops)) + + +class TestAutoGradTransformForMatmul(TestAutoGradTransformForAdd): + def init_data(self): + # { input_index: input_shape } + self.xs_shape_map = {0: (100, 2), 1: (5, 2)} + # { output_index: output_shape } + self.ys_shape_map = {0: (100, 5)} + X0 = paddle.static.data( + 'X0', shape=self.xs_shape_map[0], dtype='float32') + X0.stop_gradient = False + X1 = paddle.static.data( + 'X1', shape=self.xs_shape_map[1], dtype='float32') + X1.stop_gradient = False + + A = paddle.reshape(X1, [2, 5]) + B = paddle.scale(A, scale=2.0, bias=2.0) + Y = paddle.matmul(X0, B) + + self.orig_xs = [X0, X1] + self.orig_ys = [Y, ] + + self.orig_ops = ['reshape2', 'scale', 'matmul_v2'] + self.orig2prim_ops = [ + 'reshape_p', 'fill_constant_p', 'fill_constant_p', + 'fill_constant_p', 'mul_p', 'add_p', 'matmul_p' + ] + self.linearize_ops = self.orig2prim_ops + [ + # call fill_const() in linearize() function + 'fill_constant_p', + 'fill_constant_p', + # linearized op + 'reshape_p', + 'mul_p', + # 'mul_p', # JVP rules handle `None` input, some op will not be appended + # 'add_p', + # 'add_p', + 'matmul_p', + 'matmul_p', + 'add_p' + ] + self.transpose_ops = self.orig2prim_ops + [ + # call fill_const() in transpose() function + 'fill_constant_p', + # linearized op after remove path + 'fill_constant_p', + 'fill_constant_p', + 'mul_p', + # transposed op + 'transpose_p', + 'matmul_p', + 'transpose_p', + 'matmul_p', + # 'mul_p', + 'reshape_p', + ] + + self.prim2orig_ops = [ + 'reshape2', + 'fill_constant', + 'fill_constant', + 'fill_constant', + 'elementwise_mul', + 'elementwise_add', + 'matmul_v2', + 'fill_constant', + 'fill_constant', + 'fill_constant', + 'elementwise_mul', + 'transpose2', + 'matmul_v2', + 'transpose2', + 'matmul_v2', + # 'elementwise_mul', + 'reshape2', + ] + + +class 
TestAutoGradTransformForIndexSelect(TestAutoGradTransformForAdd): + def init_data(self): + # { input_index: input_shape } + self.xs_shape_map = {0: (7, 8, 9), 1: (8, 1), 2: (7, 8, 9), 3: (3, )} + # { output_index: output_shape } + self.ys_shape_map = {0: (3, 16, 9)} + + X0 = paddle.static.data( + 'X0', shape=self.xs_shape_map[0], dtype='float32') + X0.stop_gradient = False + X1 = paddle.static.data( + 'X1', shape=self.xs_shape_map[1], dtype='float32') + X1.stop_gradient = False + X2 = paddle.static.data( + 'X2', shape=self.xs_shape_map[2], dtype='float32') + X2.stop_gradient = False + X3 = paddle.static.data('X3', shape=self.xs_shape_map[3], dtype='int32') + X3.stop_gradient = False + + A = paddle.add(X0, X1) # (7, 8, 9) + B = paddle.norm(x=A, p=2) # (1, ) + C = paddle.subtract(X2, B) # (7, 8, 9) + D = paddle.concat(x=(A, C), axis=1) # (7, 16, 9) + Y = paddle.index_select(D, X3, axis=0) # (3, 16, 9) + + self.orig_xs = [X0, X1, X2, X3] + self.orig_ys = [Y, ] + self.orig_ops = [ + 'elementwise_add', 'p_norm', 'elementwise_sub', 'concat', + 'index_select' + ] + self.orig2prim_ops = [ + 'broadcast_p', 'add_p', 'reshape_p', 'mul_p', 'reduce_p', 'sqrt_p', + 'broadcast_p', 'sub_p', 'concat_p', 'gather_p' + ] + self.linearize_ops = self.orig2prim_ops + [ + # call fill_const() in linearize() function + 'fill_constant_p', + 'fill_constant_p', + 'fill_constant_p', + 'fill_constant_p', + # linearized op + 'broadcast_p', + 'add_p', + 'reshape_p', + 'mul_p', + 'mul_p', + 'add_p', + 'reduce_p', + 'fill_constant_p', # 'sqrt_p', Will not append sqrt_p op when apply JVP for sqrt_p + 'mul_p', + 'div_p', + 'broadcast_p', + 'sub_p', + 'concat_p', + 'gather_p' + ] + self.transpose_ops = self.orig2prim_ops + [ + # call fill_const() in transpose() function + 'fill_constant_p', + # linearized op after remove path + 'fill_constant_p', + 'fill_constant_p', + 'fill_constant_p', + 'fill_constant_p', + 'fill_constant_p', + 'mul_p', + # transposed op + 'reduce_p', + 'reshape_p', + 'reshape_p', + 'mul_p', + 'mul_p', + 'reshape_p', + 'broadcast_p', + 'div_p', + 'reduce_p', + 'reshape_p', + 'fill_constant_p', + 'sub_p', + 'split_p', + 'fill_constant_p', + 'scatter_add_p', + 'add_p', # The output of the op is used by multiple subsequent ops + 'add_p', + ] + + self.prim2orig_ops = [ + 'expand_v2', 'elementwise_add', 'reshape2', 'elementwise_mul', + 'reduce_sum', 'sqrt', 'expand_v2', 'elementwise_sub', 'concat', + 'gather', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', + 'elementwise_mul', 'reduce_sum', 'reshape2', 'reshape2', + 'elementwise_mul', 'elementwise_mul', 'reshape2', 'expand_v2', + 'elementwise_div', 'reduce_sum', 'reshape2', 'fill_constant', + 'elementwise_sub', 'split', 'fill_constant', 'fill_any_like', + 'elementwise_add', 'scatter', 'elementwise_add', 'elementwise_add' + ] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py index f4217d11f2d9b..dee74fdcb1ff3 100644 --- a/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py @@ -25,8 +25,7 @@ import paddle import paddle.nn as nn -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() +from paddle.fluid.framework import _test_eager_guard np.random.seed(0) @@ -94,7 +93,7 @@ def check(use_cuda): sgd.clear_grad() -if __name__ == '__main__': +def 
run_check(): if paddle.is_compiled_with_cuda(): try: check(use_cuda=True) @@ -112,3 +111,9 @@ def check(use_cuda): print(e) print(type(e)) assert type(e) == RuntimeError + + +if __name__ == '__main__': + with _test_eager_guard(): + run_check() + run_check() diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py b/python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py new file mode 100644 index 0000000000000..093af635f44f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_sendrecv_op_dynamic_shape.py @@ -0,0 +1,80 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base import TestCollectiveRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveSendRecvDynamicShape(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = self.global_ring_id + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", + shape=[10, 1000], + dtype='float64', + append_batch_size=False) + if self.rank == 0: + main_prog.global_block().append_op( + type="send_v2", + inputs={'X': tindata}, + attrs={ + 'ring_id': ring_id, + 'peer': 1, + 'use_calc_stream': True, + 'dynamic_shape': True + }) + else: + main_prog.global_block().append_op( + type="recv_v2", + outputs={'Out': tindata}, + attrs={ + 'peer': 0, + 'ring_id': ring_id, + 'dtype': tindata.dtype, + 'out_shape': tindata.shape, + 'use_calc_stream': True, + 'dynamic_shape': True + }) + return tindata + + +if __name__ == "__main__": + runtime_main(TestCollectiveSendRecvDynamicShape, "sendrecv_dynamic_shape", + 0) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py index f0ed2cdc04950..786ee06487fbc 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py @@ -39,7 +39,7 @@ def prepare_python_path_and_return_module(path): paths.append(dirname) python_path = ":".join(paths) else: - python_path = path + python_path = dirname os.environ[env_name] = python_path print('GLOG_v=', os.environ.get('GLOG_v', None), flush=1) return filename[:-len(py_suffix)] @@ -85,9 +85,9 @@ def apply_passes(self, main_prog, startup_prog): raise NotImplementedError() def 
check_main(self, model=None, gpus=None, **kwargs): - no_pass_rets = self._distributed_launch( - model=model, apply_pass=True, gpus=gpus, **kwargs) pass_rets = self._distributed_launch( + model=model, apply_pass=True, gpus=gpus, **kwargs) + no_pass_rets = self._distributed_launch( model=model, apply_pass=False, gpus=gpus, **kwargs) self.check_results(no_pass_rets, pass_rets) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py b/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py index 0b522b79c4e93..7eebee47e59a8 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py @@ -59,3 +59,40 @@ def reader(): main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() return main_program, startup_program, [image, label], [loss], reader + + +def simple_net(place, batch_size, image_shape=[784], num_classes=10): + image = paddle.static.data( + shape=[batch_size] + image_shape, dtype='float32', name='image') + label = paddle.static.data( + shape=[batch_size, 1], dtype='int64', name='label') + linears = [nn.Linear(784, 784) for _ in range(3)] + hidden = image + for linear in linears: + hidden = linear(hidden) + hidden = nn.ReLU()(hidden) + loss_fn = nn.loss.CrossEntropyLoss() + loss = loss_fn(hidden, label) + optimizer = paddle.optimizer.Adam(learning_rate=1e-3) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.fuse_all_reduce_ops = False + dist_strategy.without_graph_optimization = True + fleet.init(is_collective=True, strategy=dist_strategy) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss) + + rank = paddle.distributed.get_rank() + + def reader(): + seed = get_seed_from_env() + np.random.seed(seed + rank) + for _ in range(10): + image_np = np.random.random(size=image.shape).astype('float32') + label_np = np.random.randint( + low=0, high=num_classes, size=label.shape).astype('int64') + yield image_np, label_np + + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + return main_program, startup_program, [image, label], [loss], reader diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py new file mode 100644 index 0000000000000..8430eb615a20c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py @@ -0,0 +1,41 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
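
The two build_cinn pass tests added below share one structure: subclass DistPassTestBase, register a PassManager holding the build_cinn and fuse_elewise_add_act passes in apply_passes, and only run check_main when Paddle was built with CINN. A standalone sketch of that pipeline applied to a toy program follows; the real tests feed it resnet_model and simple_net from model_zoo.py, and the toy network here is purely illustrative:

import paddle
from paddle.distributed.passes import new_pass, PassManager

paddle.enable_static()

main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[16, 32], dtype='float32')
    y = paddle.nn.functional.relu(paddle.matmul(x, x, transpose_y=True))

if paddle.is_compiled_with_cinn():
    # Same pipeline the tests build; build_cinn clusters supported ops into
    # cinn_launch ops, which test_build_cinn_pass_simple_net asserts on.
    pass_manager = PassManager([
        new_pass("build_cinn"),
        new_pass("fuse_elewise_add_act"),
    ])
    pass_manager.apply([main_prog], [startup_prog])
    print(pass_manager.names)
    print([op.type for op in main_prog.global_block().ops])
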
+ +import paddle +from paddle.distributed.passes import new_pass, PassManager +import unittest +from dist_pass_test_base import DistPassTestBase +from model_zoo import resnet_model + + +class TestBuildCINNPass(DistPassTestBase): + def init(self): + self.atol = 0.5 + self.rtol = 0.0 + + def apply_passes(self, main_prog, startup_prog): + pass_manager = PassManager([ + new_pass("build_cinn"), + new_pass("fuse_elewise_add_act"), + ]) + pass_manager.apply([main_prog], [startup_prog]) + print(pass_manager.names) + + def test_bs_32(self): + if paddle.is_compiled_with_cinn(): + self.check_main(resnet_model, batch_size=32) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py new file mode 100644 index 0000000000000..e030420d32420 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.distributed.passes import new_pass, PassManager +import unittest +from dist_pass_test_base import DistPassTestBase +from model_zoo import simple_net + + +class TestBuildCINNPass(DistPassTestBase): + def init(self): + self.atol = 0.0 + self.rtol = 0.0 + + def apply_passes(self, main_prog, startup_prog): + pass_manager = PassManager([ + new_pass("build_cinn"), + new_pass("fuse_elewise_add_act"), + ]) + pass_manager.apply([main_prog], [startup_prog]) + op_types = [op.type for op in main_prog.global_block().ops] + self.assertTrue('cinn_launch' in op_types) + + def test_bs_32(self): + if paddle.is_compiled_with_cinn(): + self.check_main(simple_net, batch_size=32) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py index 574a222ba18c9..a1a853f006c0d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -32,7 +32,6 @@ momentum_rate = 0.9 l2_decay = 1e-4 batch_size = 100 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -147,4 +146,5 @@ def test_sharding_api(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 82edd1c17a541..58432540d1b82 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -42,7 +42,6 @@ "pp_degree": 1, "sharding_degree": 1 } -fleet.init(is_collective=True, strategy=strategy) np.random.seed(seed) paddle.seed(seed) @@ -225,4 +224,5 @@ def test_dp_stage2(): if __name__ == 
'__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py index a7b16bbb75977..cd2d7b3f12765 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -36,6 +36,14 @@ batch_size = 32 linear_size = 1000 +strategy = fleet.DistributedStrategy() +strategy.hybrid_configs = { + "dp_degree": 2, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 1 +} + np.random.seed(seed) paddle.seed(seed) @@ -109,4 +117,5 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index cdb1de020f56e..fc4002ef405bd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -39,7 +39,6 @@ base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -277,4 +276,5 @@ def test_stage2_stage3(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py index 2cb327a29a3da..763a7a8b97fdd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py @@ -34,7 +34,6 @@ base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -199,4 +198,5 @@ def test_stage3_offload(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py index a9e94ef09b9ac..db533e6379add 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py @@ -14,6 +14,7 @@ import os import time +import tempfile import unittest import numpy as np @@ -33,32 +34,118 @@ SEED = 2020 STEP_NUM = 10 PRINT_STEP = 2 -MODEL_SAVE_DIR = "./inference" -MODEL_SAVE_PREFIX = "./inference/bert" -MODEL_FILENAME = "bert" + INFER_MODEL_SUFFIX -PARAMS_FILENAME = "bert" + INFER_PARAMS_SUFFIX -DY_STATE_DICT_SAVE_PATH = "./bert.dygraph" - - -def train(bert_config, data_reader, to_static): - with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED - - data_loader = fluid.io.DataLoader.from_generator( - capacity=50, iterable=True) - data_loader.set_batch_generator( - data_reader.data_generator(), places=place) - - bert = PretrainModelLayer( - config=bert_config, weight_sharing=False, use_fp16=False) - - optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) - step_idx = 0 - speed_list = [] - for input_data in data_loader(): - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_data - next_sent_acc, mask_lm_loss, 
total_loss = bert( + + +class TestBert(unittest.TestCase): + def setUp(self): + self.bert_config = get_bert_config() + self.data_reader = get_feed_data_reader(self.bert_config) + self.temp_dir = tempfile.TemporaryDirectory() + self.model_save_dir = os.path.join(self.temp_dir.name, 'inference') + self.model_save_prefix = os.path.join(self.model_save_dir, 'bert') + self.model_filename = 'bert' + INFER_MODEL_SUFFIX + self.params_filename = 'bert' + INFER_PARAMS_SUFFIX + self.dy_state_dict_save_path = os.path.join(self.temp_dir.name, + 'bert.dygraph') + + def tearDown(self): + self.temp_dir.cleanup() + + def train(self, bert_config, data_reader, to_static): + with fluid.dygraph.guard(place): + fluid.default_main_program().random_seed = SEED + fluid.default_startup_program().random_seed = SEED + + data_loader = fluid.io.DataLoader.from_generator( + capacity=50, iterable=True) + data_loader.set_batch_generator( + data_reader.data_generator(), places=place) + + bert = PretrainModelLayer( + config=bert_config, weight_sharing=False, use_fp16=False) + + optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) + step_idx = 0 + speed_list = [] + for input_data in data_loader(): + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_data + next_sent_acc, mask_lm_loss, total_loss = bert( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + input_mask=input_mask, + mask_label=mask_label, + mask_pos=mask_pos, + labels=labels) + total_loss.backward() + optimizer.minimize(total_loss) + bert.clear_gradients() + + acc = np.mean(np.array(next_sent_acc.numpy())) + loss = np.mean(np.array(total_loss.numpy())) + ppl = np.mean(np.exp(np.array(mask_lm_loss.numpy()))) + + if step_idx % PRINT_STEP == 0: + if step_idx == 0: + print("Step: %d, loss: %f, ppl: %f, next_sent_acc: %f" % + (step_idx, loss, ppl, acc)) + avg_batch_time = time.time() + else: + speed = PRINT_STEP / (time.time() - avg_batch_time) + speed_list.append(speed) + print( + "Step: %d, loss: %f, ppl: %f, next_sent_acc: %f, speed: %.3f steps/s" + % (step_idx, loss, ppl, acc, speed)) + avg_batch_time = time.time() + + step_idx += 1 + if step_idx == STEP_NUM: + if to_static: + fluid.dygraph.jit.save(bert, self.model_save_prefix) + else: + fluid.dygraph.save_dygraph(bert.state_dict(), + self.dy_state_dict_save_path) + break + return loss, ppl + + def train_dygraph(self, bert_config, data_reader): + program_translator.enable(False) + return self.train(bert_config, data_reader, False) + + def train_static(self, bert_config, data_reader): + program_translator.enable(True) + return self.train(bert_config, data_reader, True) + + def predict_static(self, data): + paddle.enable_static() + exe = fluid.Executor(place) + # load inference model + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + self.model_save_dir, + executor=exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + pred_res = exe.run(inference_program, + feed=dict(zip(feed_target_names, data)), + fetch_list=fetch_targets) + + return pred_res + + def predict_dygraph(self, bert_config, data): + program_translator.enable(False) + with fluid.dygraph.guard(place): + bert = PretrainModelLayer( + config=bert_config, weight_sharing=False, use_fp16=False) + model_dict, _ = fluid.dygraph.load_dygraph( + self.dy_state_dict_save_path) + + bert.set_dict(model_dict) + bert.eval() + + input_vars = [fluid.dygraph.to_variable(x) for x in data] + src_ids, pos_ids, sent_ids, input_mask, mask_label, 
mask_pos, labels = input_vars + pred_res = bert( src_ids=src_ids, position_ids=pos_ids, sentence_ids=sent_ids, @@ -66,120 +153,33 @@ def train(bert_config, data_reader, to_static): mask_label=mask_label, mask_pos=mask_pos, labels=labels) - total_loss.backward() - optimizer.minimize(total_loss) - bert.clear_gradients() - - acc = np.mean(np.array(next_sent_acc.numpy())) - loss = np.mean(np.array(total_loss.numpy())) - ppl = np.mean(np.exp(np.array(mask_lm_loss.numpy()))) - - if step_idx % PRINT_STEP == 0: - if step_idx == 0: - print("Step: %d, loss: %f, ppl: %f, next_sent_acc: %f" % - (step_idx, loss, ppl, acc)) - avg_batch_time = time.time() - else: - speed = PRINT_STEP / (time.time() - avg_batch_time) - speed_list.append(speed) - print( - "Step: %d, loss: %f, ppl: %f, next_sent_acc: %f, speed: %.3f steps/s" - % (step_idx, loss, ppl, acc, speed)) - avg_batch_time = time.time() - - step_idx += 1 - if step_idx == STEP_NUM: - if to_static: - fluid.dygraph.jit.save(bert, MODEL_SAVE_PREFIX) - else: - fluid.dygraph.save_dygraph(bert.state_dict(), - DY_STATE_DICT_SAVE_PATH) - break - return loss, ppl - - -def train_dygraph(bert_config, data_reader): - program_translator.enable(False) - return train(bert_config, data_reader, False) - - -def train_static(bert_config, data_reader): - program_translator.enable(True) - return train(bert_config, data_reader, True) - - -def predict_static(data): - paddle.enable_static() - exe = fluid.Executor(place) - # load inference model - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_DIR, - executor=exe, - model_filename=MODEL_FILENAME, - params_filename=PARAMS_FILENAME) - pred_res = exe.run(inference_program, - feed=dict(zip(feed_target_names, data)), - fetch_list=fetch_targets) - - return pred_res - - -def predict_dygraph(bert_config, data): - program_translator.enable(False) - with fluid.dygraph.guard(place): - bert = PretrainModelLayer( - config=bert_config, weight_sharing=False, use_fp16=False) - model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) - - bert.set_dict(model_dict) - bert.eval() - - input_vars = [fluid.dygraph.to_variable(x) for x in data] - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_vars - pred_res = bert( - src_ids=src_ids, - position_ids=pos_ids, - sentence_ids=sent_ids, - input_mask=input_mask, - mask_label=mask_label, - mask_pos=mask_pos, - labels=labels) - pred_res = [var.numpy() for var in pred_res] + pred_res = [var.numpy() for var in pred_res] - return pred_res - - -def predict_dygraph_jit(data): - with fluid.dygraph.guard(place): - bert = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) - bert.eval() - - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data - pred_res = bert(src_ids, pos_ids, sent_ids, input_mask, mask_label, - mask_pos, labels) - pred_res = [var.numpy() for var in pred_res] - - return pred_res + return pred_res + def predict_dygraph_jit(self, data): + with fluid.dygraph.guard(place): + bert = fluid.dygraph.jit.load(self.model_save_prefix) + bert.eval() -def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, - data) - out = output() - return out + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data + pred_res = bert(src_ids, pos_ids, sent_ids, input_mask, mask_label, + mask_pos, labels) + pred_res = [var.numpy() for var in pred_res] + return pred_res -class TestBert(unittest.TestCase): - def setUp(self): - self.bert_config = 
get_bert_config() - self.data_reader = get_feed_data_reader(self.bert_config) + def predict_analysis_inference(self, data): + output = PredictorTools(self.model_save_dir, self.model_filename, + self.params_filename, data) + out = output() + return out def test_train(self): - static_loss, static_ppl = train_static(self.bert_config, - self.data_reader) - dygraph_loss, dygraph_ppl = train_dygraph(self.bert_config, - self.data_reader) + static_loss, static_ppl = self.train_static(self.bert_config, + self.data_reader) + dygraph_loss, dygraph_ppl = self.train_dygraph(self.bert_config, + self.data_reader) self.assertTrue( np.allclose(static_loss, dygraph_loss), msg="static_loss: {} \n dygraph_loss: {}".format(static_loss, @@ -193,10 +193,10 @@ def test_train(self): def verify_predict(self): for data in self.data_reader.data_generator()(): - dygraph_pred_res = predict_dygraph(self.bert_config, data) - static_pred_res = predict_static(data) - dygraph_jit_pred_res = predict_dygraph_jit(data) - predictor_pred_res = predict_analysis_inference(data) + dygraph_pred_res = self.predict_dygraph(self.bert_config, data) + static_pred_res = self.predict_static(data) + dygraph_jit_pred_res = self.predict_dygraph_jit(data) + predictor_pred_res = self.predict_analysis_inference(data) for dy_res, st_res, dy_jit_res, predictor_res in zip( dygraph_pred_res, static_pred_res, dygraph_jit_pred_res, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index 00af9c96ba9cc..bec9b35a7febb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
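
Aside: the test refactors in this patch keep repeating one pattern, moving module-level helper functions and hard-coded "./inference" save paths into the test class, with all paths rooted in a tempfile-backed directory created in setUp and removed in tearDown. A minimal sketch of that idiom under those assumptions is below; SimpleNet and TestSaveUnderTempDir are illustrative names only, not code from this patch.

import os
import tempfile
import unittest

import numpy as np
import paddle


class SimpleNet(paddle.nn.Layer):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.linear = paddle.nn.Linear(10, 3)

    @paddle.jit.to_static
    def forward(self, x):
        return self.linear(x)


class TestSaveUnderTempDir(unittest.TestCase):
    def setUp(self):
        # each test owns a scratch directory instead of writing to "./inference"
        self.temp_dir = tempfile.TemporaryDirectory()
        self.model_save_prefix = os.path.join(self.temp_dir.name, 'inference',
                                              'simple_net')

    def tearDown(self):
        # nothing is left behind in the repository working directory
        self.temp_dir.cleanup()

    def test_save_and_load(self):
        net = SimpleNet()
        x = paddle.randn([4, 10])
        out = net(x)  # run forward once so jit.save can export the traced program
        paddle.jit.save(net, self.model_save_prefix)
        load_net = paddle.jit.load(self.model_save_prefix)
        self.assertTrue(np.allclose(out.numpy(), load_net(x).numpy()))


if __name__ == '__main__':
    unittest.main()
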
+import os import math import numpy as np import unittest import paddle +import tempfile from paddle.jit import to_static import paddle.fluid as fluid from paddle.fluid import ParamAttr @@ -422,11 +424,6 @@ class Args(object): prop_boundary_ratio = 0.5 num_sample = 2 num_sample_perbin = 2 - model_save_dir = "./inference" - model_save_prefix = "./inference/bmn" - model_filename = "bmn" + INFER_MODEL_SUFFIX - params_filename = "bmn" + INFER_PARAMS_SUFFIX - dy_param_path = './bmn_dy_param' def optimizer(cfg, parameter_list): @@ -559,78 +556,6 @@ def reader(): return reader -def train_bmn(args, place, to_static): - program_translator.enable(to_static) - loss_data = [] - - with fluid.dygraph.guard(place): - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - global local_random - local_random = np.random.RandomState(SEED) - - bmn = BMN(args) - adam = optimizer(args, parameter_list=bmn.parameters()) - - train_reader = fake_data_reader(args, 'train') - - for epoch in range(args.epoch): - for batch_id, data in enumerate(train_reader()): - video_feat = np.array( - [item[0] for item in data]).astype(DATATYPE) - gt_iou_map = np.array( - [item[1] for item in data]).astype(DATATYPE) - gt_start = np.array([item[2] for item in data]).astype(DATATYPE) - gt_end = np.array([item[3] for item in data]).astype(DATATYPE) - - x_data = to_variable(video_feat) - gt_iou_map = to_variable(gt_iou_map) - gt_start = to_variable(gt_start) - gt_end = to_variable(gt_end) - gt_iou_map.stop_gradient = True - gt_start.stop_gradient = True - gt_end.stop_gradient = True - - pred_bm, pred_start, pred_end = bmn(x_data) - - loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func( - pred_bm, pred_start, pred_end, gt_iou_map, gt_start, gt_end, - args) - avg_loss = fluid.layers.mean(loss) - - avg_loss.backward() - adam.minimize(avg_loss) - bmn.clear_gradients() - # log loss data to verify correctness - loss_data += [ - avg_loss.numpy()[0], tem_loss.numpy()[0], - pem_reg_loss.numpy()[0], pem_cls_loss.numpy()[0] - ] - - if args.log_interval > 0 and ( - batch_id % args.log_interval == 0): - print('[TRAIN] Epoch {}, iter {} '.format(epoch, batch_id) - + '\tLoss = {}, \ttem_loss = {}, \tpem_reg_loss = {}, \tpem_cls_loss = {}'.format( - '%f' % avg_loss.numpy()[0], '%f' % tem_loss.numpy()[0], \ - '%f' % pem_reg_loss.numpy()[0], '%f' % pem_cls_loss.numpy()[0])) - - # validation - if batch_id % args.valid_interval == 0 and batch_id > 0: - bmn.eval() - val_loss_data = val_bmn(bmn, args) - bmn.train() - loss_data += val_loss_data - - if batch_id == args.train_batch_num: - if to_static: - fluid.dygraph.jit.save(bmn, args.model_save_prefix) - else: - fluid.dygraph.save_dygraph(bmn.state_dict(), - args.dy_param_path) - break - return np.array(loss_data) - - # Validation def val_bmn(model, args): val_reader = fake_data_reader(args, 'valid') @@ -677,10 +602,93 @@ def setUp(self): self.place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda() \ else fluid.CUDAPlace(0) + self.temp_dir = tempfile.TemporaryDirectory() + self.model_save_dir = os.path.join(self.temp_dir.name, 'inference') + self.model_save_prefix = os.path.join(self.model_save_dir, 'bmn') + self.model_filename = "bmn" + INFER_MODEL_SUFFIX + self.params_filename = "bmn" + INFER_PARAMS_SUFFIX + self.dy_param_path = os.path.join(self.temp_dir.name, 'bmn_dy_param') + + def tearDown(self): + self.temp_dir.cleanup() + + def train_bmn(self, args, place, to_static): + program_translator.enable(to_static) + loss_data = [] + + with fluid.dygraph.guard(place): + 
paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + global local_random + local_random = np.random.RandomState(SEED) + + bmn = BMN(args) + adam = optimizer(args, parameter_list=bmn.parameters()) + + train_reader = fake_data_reader(args, 'train') + + for epoch in range(args.epoch): + for batch_id, data in enumerate(train_reader()): + video_feat = np.array( + [item[0] for item in data]).astype(DATATYPE) + gt_iou_map = np.array( + [item[1] for item in data]).astype(DATATYPE) + gt_start = np.array( + [item[2] for item in data]).astype(DATATYPE) + gt_end = np.array( + [item[3] for item in data]).astype(DATATYPE) + + x_data = to_variable(video_feat) + gt_iou_map = to_variable(gt_iou_map) + gt_start = to_variable(gt_start) + gt_end = to_variable(gt_end) + gt_iou_map.stop_gradient = True + gt_start.stop_gradient = True + gt_end.stop_gradient = True + + pred_bm, pred_start, pred_end = bmn(x_data) + + loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func( + pred_bm, pred_start, pred_end, gt_iou_map, gt_start, + gt_end, args) + avg_loss = fluid.layers.mean(loss) + + avg_loss.backward() + adam.minimize(avg_loss) + bmn.clear_gradients() + # log loss data to verify correctness + loss_data += [ + avg_loss.numpy()[0], tem_loss.numpy()[0], + pem_reg_loss.numpy()[0], pem_cls_loss.numpy()[0] + ] + + if args.log_interval > 0 and ( + batch_id % args.log_interval == 0): + print('[TRAIN] Epoch {}, iter {} '.format(epoch, batch_id) + + '\tLoss = {}, \ttem_loss = {}, \tpem_reg_loss = {}, \tpem_cls_loss = {}'.format( + '%f' % avg_loss.numpy()[0], '%f' % tem_loss.numpy()[0], \ + '%f' % pem_reg_loss.numpy()[0], '%f' % pem_cls_loss.numpy()[0])) + + # validation + if batch_id % args.valid_interval == 0 and batch_id > 0: + bmn.eval() + val_loss_data = val_bmn(bmn, args) + bmn.train() + loss_data += val_loss_data + + if batch_id == args.train_batch_num: + if to_static: + fluid.dygraph.jit.save(bmn, self.model_save_prefix) + else: + fluid.dygraph.save_dygraph(bmn.state_dict(), + self.dy_param_path) + break + return np.array(loss_data) + def test_train(self): - static_res = train_bmn(self.args, self.place, to_static=True) - dygraph_res = train_bmn(self.args, self.place, to_static=False) + static_res = self.train_bmn(self.args, self.place, to_static=True) + dygraph_res = self.train_bmn(self.args, self.place, to_static=False) self.assertTrue( np.allclose(dygraph_res, static_res), "dygraph_res: {},\n static_res: {}".format( @@ -726,8 +734,7 @@ def predict_dygraph(self, data): with fluid.dygraph.guard(self.place): bmn = BMN(self.args) # load dygraph trained parameters - model_dict, _ = fluid.load_dygraph(self.args.dy_param_path + - ".pdparams") + model_dict, _ = fluid.load_dygraph(self.dy_param_path + ".pdparams") bmn.set_dict(model_dict) bmn.eval() @@ -743,10 +750,10 @@ def predict_static(self, data): # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - self.args.model_save_dir, + self.model_save_dir, executor=exe, - model_filename=self.args.model_filename, - params_filename=self.args.params_filename) + model_filename=self.model_filename, + params_filename=self.params_filename) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, fetch_list=fetch_targets) @@ -755,7 +762,7 @@ def predict_static(self, data): def predict_dygraph_jit(self, data): with fluid.dygraph.guard(self.place): - bmn = fluid.dygraph.jit.load(self.args.model_save_prefix) + bmn = fluid.dygraph.jit.load(self.model_save_prefix) bmn.eval() x = 
to_variable(data) @@ -765,9 +772,8 @@ def predict_dygraph_jit(self, data): return pred_res def predict_analysis_inference(self, data): - output = PredictorTools(self.args.model_save_dir, - self.args.model_filename, - self.args.params_filename, [data]) + output = PredictorTools(self.model_save_dir, self.model_filename, + self.params_filename, [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py index f7d469327a307..95ea5ad227eeb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py @@ -18,8 +18,7 @@ import numpy as np from paddle.jit import ProgramTranslator -from test_resnet import ResNet, train, predict_dygraph_jit -from test_resnet import predict_dygraph, predict_static, predict_analysis_inference +from test_resnet import ResNet, ResNetHelper program_translator = ProgramTranslator() @@ -31,20 +30,20 @@ def setUp(self): self.build_strategy.fuse_bn_act_ops = True self.build_strategy.fuse_bn_add_act_ops = True self.build_strategy.enable_addto = True + self.resnet_helper = ResNetHelper() # NOTE: for enable_addto paddle.fluid.set_flags({"FLAGS_max_inplace_grad_add": 8}) def train(self, to_static): program_translator.enable(to_static) - - return train(to_static, self.build_strategy) + return self.resnet_helper.train(to_static, self.build_strategy) def verify_predict(self): image = np.random.random([1, 3, 224, 224]).astype('float32') - dy_pre = predict_dygraph(image) - st_pre = predict_static(image) - dy_jit_pre = predict_dygraph_jit(image) - predictor_pre = predict_analysis_inference(image) + dy_pre = self.resnet_helper.predict_dygraph(image) + st_pre = self.resnet_helper.predict_static(image) + dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image) + predictor_pre = self.resnet_helper.predict_analysis_inference(image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -69,7 +68,7 @@ def test_in_static_mode_mkldnn(self): paddle.fluid.set_flags({'FLAGS_use_mkldnn': True}) try: if paddle.fluid.core.is_compiled_with_mkldnn(): - train(True, self.build_strategy) + self.resnet_helper.train(True, self.build_strategy) finally: paddle.fluid.set_flags({'FLAGS_use_mkldnn': False}) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py index 2c82f5c699087..74f4a895d1583 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_container.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
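
Aside: test_build_strategy.py above now routes training and all four prediction paths through a single ResNetHelper instance built in setUp, instead of importing free functions from test_resnet. A rough, self-contained sketch of that delegation pattern with a toy network follows; TrainerHelper, TinyNet and the three-step training loop are illustrative stand-ins, not code from this patch.

import os
import tempfile
import unittest

import numpy as np
import paddle


class TinyNet(paddle.nn.Layer):
    def __init__(self):
        super(TinyNet, self).__init__()
        self.linear = paddle.nn.Linear(10, 1)

    @paddle.jit.to_static
    def forward(self, x):
        return paddle.mean(self.linear(x))


class TrainerHelper(object):
    """Owns the scratch directory plus the shared train/save plumbing."""

    def __init__(self):
        self.temp_dir = tempfile.TemporaryDirectory()
        self.model_save_prefix = os.path.join(self.temp_dir.name, 'tiny_net')

    def __del__(self):
        # same idea as ResNetHelper: the scratch directory dies with the helper
        self.temp_dir.cleanup()

    def train(self, to_static):
        paddle.jit.ProgramTranslator().enable(to_static)
        paddle.seed(2022)
        net = TinyNet()
        opt = paddle.optimizer.SGD(learning_rate=0.1,
                                   parameters=net.parameters())
        x = paddle.ones([4, 10])
        for _ in range(3):
            loss = net(x)
            loss.backward()
            opt.step()
            opt.clear_grad()
        if to_static:
            paddle.jit.save(net, self.model_save_prefix)
        return loss.numpy()


class TestDelegatesToHelper(unittest.TestCase):
    def setUp(self):
        # one helper instance shared by every test method in this case
        self.helper = TrainerHelper()

    def test_train(self):
        static_loss = self.helper.train(to_static=True)
        dygraph_loss = self.helper.train(to_static=False)
        self.assertTrue(
            np.allclose(static_loss, dygraph_loss),
            msg="static_loss: {}, dygraph_loss: {}".format(
                static_loss, dygraph_loss))


if __name__ == '__main__':
    unittest.main()
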
+import os import paddle import unittest import numpy as np +import tempfile class BufferLayers(paddle.nn.Layer): @@ -66,11 +68,15 @@ class TestSequential(unittest.TestCase): def setUp(self): paddle.set_device('cpu') self.seed = 2021 + self.temp_dir = tempfile.TemporaryDirectory() self._init_config() def _init_config(self): self.net = SequentialNet(BufferLayers, 10, 3) - self.model_path = './sequential_net' + self.model_path = os.path.join(self.temp_dir.name, 'sequential_net') + + def tearDown(self): + self.temp_dir.cleanup() def _init_seed(self): paddle.seed(self.seed) @@ -108,7 +114,8 @@ def _test_load(self, net, x): class TestNestSequential(TestSequential): def _init_config(self): self.net = NestSequentialNet() - self.model_path = './nested_sequential_net' + self.model_path = os.path.join(self.temp_dir.name, + 'nested_sequential_net') if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py index fb918f4ae00ed..2e2918facf896 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py @@ -24,6 +24,7 @@ from paddle.fluid.dygraph import ProgramTranslator from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import CONVERSION_OPTIONS from test_program_translator import get_source_code +import paddle.jit.dy2static as _jst program_translator = ProgramTranslator() @@ -255,7 +256,7 @@ def _get_answer_code(self): return get_source_code(self.answer_func) def _get_transformed_code(self): - transformed_func = paddle.jit.dy2static.convert_call(self.func) + transformed_func = _jst.convert_call(self.func) return get_source_code(transformed_func) def test_code(self): @@ -275,7 +276,7 @@ def set_func(self): def set_answer_func(self): class StaticCode(): def func_convert_then_not_to_static(x): - y = paddle.jit.dy2static.convert_call(func_not_to_static)(x) + y = _jst.convert_call(func_not_to_static)(x) return y self.answer_func = StaticCode.func_convert_then_not_to_static diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index 67091f5fabb2e..35dfe550552a9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -14,7 +14,8 @@ import numpy as np import unittest - +import os +import tempfile import paddle import paddle.fluid as fluid from paddle.static import InputSpec @@ -100,7 +101,11 @@ def test_instance_same_class(self): class TestInputSpec(unittest.TestCase): def setUp(self): - pass + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, 'simple_net') + + def tearDown(self): + self.temp_dir.cleanup() def test_with_input_spec(self): with fluid.dygraph.guard(fluid.CPUPlace()): @@ -116,8 +121,8 @@ def test_with_input_spec(self): # 2. 
test save load net.inner_function(x) - jit.save(net, './simple_net') - infer_net = fluid.dygraph.jit.load('./simple_net') + jit.save(net, self.model_path) + infer_net = fluid.dygraph.jit.load(self.model_path) pred = infer_net(x) self.assertTrue(np.allclose(out.numpy(), pred.numpy())) @@ -438,12 +443,19 @@ def forward(self): class TestSetBuffers(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, 'SetBuffersNet1') + + def tearDown(self): + self.temp_dir.cleanup() + def test_set_buffers1(self): paddle.disable_static() net = SetBuffersNet1() out = net() self.assertEqual(out.numpy().tolist(), [2]) - paddle.jit.save(net, './SetBuffersNet1') + paddle.jit.save(net, self.model_path) paddle.enable_static() def test_set_buffers2(self): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py new file mode 100644 index 0000000000000..7383c834ba9a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle + + +def drop_path(x, training=False): + if not training: + return x + else: + return 2 * x + + +class DropPath(paddle.nn.Layer): + def __init__(self): + super(DropPath, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + return drop_path(x, self.training) + + +class TestTrainEval(unittest.TestCase): + def setUp(self): + self.model = DropPath() + + def tearDown(self): + pass + + def test_train_and_eval(self): + x = paddle.to_tensor([1, 2, 3]).astype("int64") + eval_out = x.numpy() + train_out = x.numpy() * 2 + self.model.train() + self.assertTrue(np.allclose(self.model(x).numpy(), train_out)) + self.model.eval() + self.assertTrue(np.allclose(self.model(x).numpy(), eval_out)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py index 750ed615e7109..337e9cd720229 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py @@ -16,6 +16,8 @@ import numpy as np import unittest +import os +import tempfile import paddle import paddle.fluid as fluid @@ -532,12 +534,20 @@ def test_transformed_result_compare(self): class TestForZip(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_for_zip_error(self): with self.assertRaises(RuntimeError): - paddle.jit.save(for_zip_error, './for_zip_error') + model_path = os.path.join(self.temp_dir.name, 'for_zip_error') + paddle.jit.save(for_zip_error, model_path) def test_for_zip(self): - 
paddle.jit.save(for_zip, './for_zip') + model_path = os.path.join(self.temp_dir.name, 'for_zip') + paddle.jit.save(for_zip, model_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py index 6c43796215848..b5160e210c1b4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py @@ -17,6 +17,8 @@ import numpy as np import paddle import unittest +import os +import tempfile class GradLayer(paddle.nn.Layer): @@ -88,8 +90,15 @@ def setUp(self): self.func = GradLinearLayer() self.x = paddle.ones(shape=[10, 2, 5], dtype='float32') self.x.stop_gradient = False - self.infer_model_path = "double_grad_infer_model" - self.train_model_path = "double_grad_train_model" + + self.temp_dir = tempfile.TemporaryDirectory() + self.infer_model_path = os.path.join(self.temp_dir.name, + 'double_grad_infer_model') + self.train_model_path = os.path.join(self.temp_dir.name, + 'double_grad_train_model') + + def tearDown(self): + self.temp_dir.cleanup() def test_save_infer_program(self): input_spec = [ @@ -113,7 +122,7 @@ def test_save_train_program(self): avg_loss = paddle.mean(paddle.abs(out - 1)) avg_loss.backward() optimizer.minimize(avg_loss) - print(self.x.grad.mean()) + self.func.clear_gradients() paddle.jit.save(self.func, self.train_model_path) @@ -129,8 +138,15 @@ def setUp(self): self.func = NoGradLinearLayer() self.x = paddle.ones(shape=[10, 2, 5], dtype='float32') self.x.stop_gradient = False - self.infer_model_path = "no_grad_infer_model" - self.train_model_path = "no_grad_train_model" + + self.temp_dir = tempfile.TemporaryDirectory() + self.infer_model_path = os.path.join(self.temp_dir.name, + 'no_grad_infer_model') + self.train_model_path = os.path.join(self.temp_dir.name, + 'no_grad_train_model') + + def tearDown(self): + self.temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 19965821e8750..e0a9a3ad2af07 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -19,6 +19,7 @@ import unittest import os +import tempfile os.environ["CUDA_VISIBLE_DEVICES"] = "2" import paddle @@ -406,11 +407,6 @@ class Args(object): base_learning_rate = 0.01 bigru_num = 2 print_steps = 1 - model_save_dir = "./inference" - model_save_prefix = "./inference/lac" - model_filename = "lac" + INFER_MODEL_SUFFIX - params_filename = "lac" + INFER_PARAMS_SUFFIX - dy_param_path = "./lac_dy_param" def get_random_input_data(batch_size, vocab_size, num_labels, max_seq_len=64): @@ -458,84 +454,86 @@ def create_dataloader(reader, place): return data_loader -def do_train(args, to_static): - program_translator.enable(to_static) - place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.dygraph.guard(place): - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - - reader = get_random_input_data(args.batch_size, args.vocab_size, - args.num_labels) - train_loader = create_dataloader(reader, place) - - model = LexNet(args) - optimizer = fluid.optimizer.AdamOptimizer( - learning_rate=args.base_learning_rate, - parameter_list=model.parameters()) - chunk_eval = ChunkEval( - int(math.ceil((args.num_labels - 1) / 2.0)), "IOB") - - 
step = 0 - chunk_evaluator = fluid.metrics.ChunkEvaluator() - chunk_evaluator.reset() - - loss_data = [] - for epoch_id in range(args.epoch): - for batch in train_loader(): - words, targets, length = batch - start_time = time.time() - avg_cost, crf_decode = model(words, targets, length) - loss_data.append(avg_cost.numpy()[0]) - - # backward and optimization - avg_cost.backward() - optimizer.minimize(avg_cost) - model.clear_gradients() - end_time = time.time() - - if step % args.print_steps == 0: - (precision, recall, f1_score, num_infer_chunks, - num_label_chunks, num_correct_chunks) = chunk_eval( - input=crf_decode, label=targets, seq_length=length) - outputs = [avg_cost, precision, recall, f1_score] - avg_cost, precision, recall, f1_score = [ - np.mean(x.numpy()) for x in outputs - ] - - print( - "[train] step = %d, loss = %f, P: %f, R: %f, F1: %f, elapsed time %f" - % (step, avg_cost, precision, recall, f1_score, - end_time - start_time)) - - step += 1 - # save inference model - if to_static: - fluid.dygraph.jit.save( - layer=model, - path=args.model_save_prefix, - input_spec=[input_specs[0], input_specs[-1]], - output_spec=[crf_decode]) - else: - fluid.dygraph.save_dygraph(model.state_dict(), args.dy_param_path) - - return np.array(loss_data) - - class TestLACModel(unittest.TestCase): def setUp(self): self.args = Args() self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() + self.temp_dir = tempfile.TemporaryDirectory() + self.model_save_dir = os.path.join(self.temp_dir.name, 'inference') + self.model_save_prefix = os.path.join(self.model_save_dir, 'lac') + self.model_filename = "lac" + INFER_MODEL_SUFFIX + self.params_filename = "lac" + INFER_PARAMS_SUFFIX + self.dy_param_path = os.path.join(self.temp_dir.name, 'lac_dy_param') + + def train(self, args, to_static): + program_translator.enable(to_static) + place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.dygraph.guard(place): + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + reader = get_random_input_data(args.batch_size, args.vocab_size, + args.num_labels) + train_loader = create_dataloader(reader, place) + + model = LexNet(args) + optimizer = fluid.optimizer.AdamOptimizer( + learning_rate=args.base_learning_rate, + parameter_list=model.parameters()) + chunk_eval = ChunkEval( + int(math.ceil((args.num_labels - 1) / 2.0)), "IOB") + + step = 0 + chunk_evaluator = fluid.metrics.ChunkEvaluator() + chunk_evaluator.reset() + + loss_data = [] + for epoch_id in range(args.epoch): + for batch in train_loader(): + words, targets, length = batch + start_time = time.time() + avg_cost, crf_decode = model(words, targets, length) + loss_data.append(avg_cost.numpy()[0]) + + # backward and optimization + avg_cost.backward() + optimizer.minimize(avg_cost) + model.clear_gradients() + end_time = time.time() + + if step % args.print_steps == 0: + (precision, recall, f1_score, num_infer_chunks, + num_label_chunks, num_correct_chunks) = chunk_eval( + input=crf_decode, label=targets, seq_length=length) + outputs = [avg_cost, precision, recall, f1_score] + avg_cost, precision, recall, f1_score = [ + np.mean(x.numpy()) for x in outputs + ] + + print( + "[train] step = %d, loss = %f, P: %f, R: %f, F1: %f, elapsed time %f" + % (step, avg_cost, precision, recall, f1_score, + end_time - start_time)) + + step += 1 + # save inference model + if to_static: + fluid.dygraph.jit.save( + layer=model, + path=self.model_save_prefix, + input_spec=[input_specs[0], 
input_specs[-1]], + output_spec=[crf_decode]) + else: + fluid.dygraph.save_dygraph(model.state_dict(), + self.dy_param_path) - def train(self, to_static): - out = do_train(self.args, to_static) - return out + return np.array(loss_data) def test_train(self): - st_out = self.train(to_static=True) - dy_out = self.train(to_static=False) + st_out = self.train(self.args, to_static=True) + dy_out = self.train(self.args, to_static=False) self.assertTrue( np.allclose(dy_out, st_out), msg="dygraph output:\n{},\nstatic output:\n {}.".format(dy_out, @@ -565,8 +563,7 @@ def predict_dygraph(self, batch): with fluid.dygraph.guard(self.place): model = LexNet(self.args) # load dygraph trained parameters - model_dict, _ = fluid.load_dygraph(self.args.dy_param_path + - ".pdparams") + model_dict, _ = fluid.load_dygraph(self.dy_param_path + ".pdparams") model.set_dict(model_dict) model.eval() @@ -585,10 +582,10 @@ def predict_static(self, batch): # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - self.args.model_save_dir, + self.model_save_dir, executor=exe, - model_filename=self.args.model_filename, - params_filename=self.args.params_filename) + model_filename=self.model_filename, + params_filename=self.params_filename) words, targets, length = batch pred_res = exe.run( @@ -601,7 +598,7 @@ def predict_static(self, batch): def predict_dygraph_jit(self, batch): words, targets, length = batch with fluid.dygraph.guard(self.place): - model = fluid.dygraph.jit.load(self.args.model_save_prefix) + model = fluid.dygraph.jit.load(self.model_save_prefix) model.eval() pred_res = model(to_variable(words), to_variable(length)) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py index dcb41cfc6aba7..357d9611053da 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py @@ -14,8 +14,9 @@ import unittest import paddle - +import os import numpy as np +import tempfile def forward_post_hook1(layer, input, output): @@ -54,7 +55,11 @@ class TestNestLayerHook(unittest.TestCase): def setUp(self): paddle.seed(2022) self.x = paddle.randn([4, 10]) - self.path = "./net_hook" + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join(self.temp_dir.name, 'net_hook') + + def tearDown(self): + self.temp_dir.cleanup() def train_net(self, to_static=False): paddle.seed(2022) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py index cce2a383dd8e9..8d54e199800cd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py @@ -16,6 +16,8 @@ import paddle import unittest from paddle import nn +import os +import tempfile class LSTMLayer(nn.Layer): @@ -40,6 +42,12 @@ def forward(self, x): class TestLstm(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def run_lstm(self, to_static): paddle.jit.ProgramTranslator().enable(to_static) @@ -78,11 +86,12 @@ def test_save_in_eval(self, with_training=True): x = paddle.randn((2, 10, 12)) net = paddle.jit.to_static( net, input_spec=[paddle.static.InputSpec(shape=[-1, 10, 12])]) - paddle.jit.save(net, 'simple_lstm') + model_path = 
os.path.join(self.temp_dir.name, 'simple_lstm') + paddle.jit.save(net, model_path) dygraph_out = net(x) # load saved model - load_net = paddle.jit.load('simple_lstm') + load_net = paddle.jit.load(model_path) static_out = load_net(x) self.assertTrue( @@ -115,6 +124,12 @@ def forward(self, x): class TestSaveInEvalMode(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_save_in_eval(self): paddle.jit.ProgramTranslator().enable(True) net = LinearNet() @@ -131,9 +146,11 @@ def test_save_in_eval(self): # save directly net = paddle.jit.to_static( net, input_spec=[paddle.static.InputSpec(shape=[-1, 10])]) - paddle.jit.save(net, 'linear_net') + + model_path = os.path.join(self.temp_dir.name, 'linear_net') + paddle.jit.save(net, model_path) # load saved model - load_net = paddle.jit.load('linear_net') + load_net = paddle.jit.load(model_path) x = paddle.randn((2, 10)) eval_out = net(x) @@ -146,6 +163,12 @@ def test_save_in_eval(self): class TestEvalAfterSave(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_eval_after_save(self): x = paddle.randn((2, 10, 12)).astype('float32') net = Net(12, 2) @@ -159,8 +182,9 @@ def test_eval_after_save(self): x = paddle.randn((2, 10, 12)).astype('float32') dy_out = net(x) # save model - paddle.jit.save(net, 'jit.save/lstm', input_spec=[x]) - load_net = paddle.jit.load('jit.save/lstm') + model_path = os.path.join(self.temp_dir.name, 'jit.save/lstm') + paddle.jit.save(net, model_path, input_spec=[x]) + load_net = paddle.jit.load(model_path) load_out = load_net(x) self.assertTrue(np.allclose(dy_out.numpy(), load_out.numpy())) # eval diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 2b8307461b8f5..2bb3879efb753 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -15,6 +15,8 @@ from __future__ import print_function import unittest +import os +import tempfile from time import time import numpy as np @@ -134,6 +136,10 @@ def setUp(self): paddle.dataset.mnist.train(), batch_size=self.batch_size, drop_last=True) + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() class TestMNISTWithToStatic(TestMNIST): @@ -227,9 +233,10 @@ def train(self, to_static=False): def check_jit_save_load(self, model, inputs, input_spec, to_static, gt_out): if to_static: - infer_model_path = "./test_mnist_inference_model_by_jit_save" - model_save_dir = "./inference" - model_save_prefix = "./inference/mnist" + infer_model_path = os.path.join( + self.temp_dir.name, 'test_mnist_inference_model_by_jit_save') + model_save_dir = os.path.join(self.temp_dir.name, 'inference') + model_save_prefix = os.path.join(model_save_dir, 'mnist') model_filename = "mnist" + INFER_MODEL_SUFFIX params_filename = "mnist" + INFER_PARAMS_SUFFIX fluid.dygraph.jit.save( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index f58041cbb6c8d..7b98ced95e22c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -12,6 +12,8 @@ # See the License for the specific 
language governing permissions and # limitations under the License. +import os +import tempfile import time import numpy as np import paddle @@ -439,11 +441,11 @@ class Args(object): train_step = 10 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() - model_save_dir = "./inference" - model_save_prefix = "./inference/" + model - model_filename = model + INFER_MODEL_SUFFIX - params_filename = model + INFER_PARAMS_SUFFIX - dy_state_dict_save_path = model + ".dygraph" + model_save_dir = None + model_save_prefix = None + model_filename = None + params_filename = None + dy_state_dict_save_path = None def train_mobilenet(args, to_static): @@ -571,13 +573,21 @@ def predict_analysis_inference(args, data): class TestMobileNet(unittest.TestCase): def setUp(self): self.args = Args() + self.temp_dir = tempfile.TemporaryDirectory() + self.args.model_save_dir = os.path.join(self.temp_dir.name, + "./inference") + + def tearDown(self): + self.temp_dir.cleanup() def train(self, model_name, to_static): self.args.model = model_name - self.args.model_save_prefix = "./inference/" + model_name + self.args.model_save_prefix = os.path.join(self.temp_dir.name, + "./inference/" + model_name) self.args.model_filename = model_name + INFER_MODEL_SUFFIX self.args.params_filename = model_name + INFER_PARAMS_SUFFIX - self.args.dy_state_dict_save_path = model_name + ".dygraph" + self.args.dy_state_dict_save_path = os.path.join( + self.temp_dir.name, model_name + ".dygraph") out = train_mobilenet(self.args, to_static) return out @@ -590,10 +600,12 @@ def assert_same_loss(self, model_name): def assert_same_predict(self, model_name): self.args.model = model_name - self.args.model_save_prefix = "./inference/" + model_name + self.args.model_save_prefix = os.path.join(self.temp_dir.name, + "./inference/" + model_name) self.args.model_filename = model_name + INFER_MODEL_SUFFIX self.args.params_filename = model_name + INFER_PARAMS_SUFFIX - self.args.dy_state_dict_save_path = model_name + ".dygraph" + self.args.dy_state_dict_save_path = os.path.join( + self.temp_dir.name, model_name + ".dygraph") local_random = np.random.RandomState(SEED) image = local_random.random_sample([1, 3, 224, 224]).astype('float32') dy_pre = predict_dygraph(self.args, image) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py index e3d34184a38fc..8dac888993590 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py @@ -65,7 +65,7 @@ def set_test_func(self): self.func = simple_func def set_static_lineno(self): - self.static_abs_lineno_list = [6, 7, 8] + self.static_abs_lineno_list = [7, 8, 9] def set_dygraph_info(self): self.line_num = 3 @@ -149,7 +149,7 @@ def set_test_func(self): self.func = nested_func def set_static_lineno(self): - self.static_abs_lineno_list = [6, 8, 9, 10, 11] + self.static_abs_lineno_list = [7, 9, 10, 11, 12] def set_dygraph_info(self): self.line_num = 5 @@ -174,7 +174,7 @@ def set_test_func(self): self.func = decorated_func def set_static_lineno(self): - self.static_abs_lineno_list = [6, 7] + self.static_abs_lineno_list = [7, 8] def set_dygraph_info(self): self.line_num = 2 @@ -208,7 +208,7 @@ def set_test_func(self): self.func = decorated_func2 def set_static_lineno(self): - self.static_abs_lineno_list = [6, 7] + self.static_abs_lineno_list = [7, 8] def set_dygraph_info(self): 
self.line_num = 2 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index 427e4c2252451..4f55dbd324c21 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -135,22 +135,23 @@ def test_switch_eval_and_train(self): x = fluid.dygraph.to_variable(x_data) linear_net(x) - _, partial_layer = linear_net.forward.program_cache.last()[-1] + _, train_partial_layer = linear_net.forward.program_cache.last()[-1] # check default mode is for training - self.assertEqual(partial_layer.program, - partial_layer._train_program) + self.assertEqual(train_partial_layer.program, + train_partial_layer._train_program) # switch to run test program after `eval()` linear_net.eval() linear_net(x) - self.assertEqual(partial_layer.program, - partial_layer._infer_program) + _, eval_partial_layer = linear_net.forward.program_cache.last()[-1] + self.assertEqual(eval_partial_layer.program, + eval_partial_layer._infer_program) # switch back into training linear_net.train() linear_net(x) - self.assertEqual(partial_layer.program, - partial_layer._train_program) + self.assertEqual(train_partial_layer.program, + train_partial_layer._train_program) class TestWithNoGrad(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index b0ffbac88fb42..4e90c73baa944 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.nn import Linear from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code +import paddle.jit.dy2static as _jst from ifelse_simple_func import dyfunc_with_if_else @@ -76,40 +77,38 @@ def false_fn_0(x_v): x_v = x_v + 1 return x_v - x_v = paddle.jit.dy2static.convert_ifelse( + x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, (x_v, ), (x_v, ), (x_v, )) - __return_0 = paddle.jit.dy2static.create_bool_as_type(label is not None, - False) + __return_0 = _jst.create_bool_as_type(label is not None, False) def true_fn_1(__return_0, __return_value_0, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) - __return_0 = paddle.jit.dy2static.create_bool_as_type( - label is not None, True) + __return_0 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = loss return __return_0, __return_value_0 def false_fn_1(__return_0, __return_value_0): return __return_0, __return_value_0 - __return_0, __return_value_0 = (paddle.jit.dy2static.convert_ifelse( + __return_0, __return_value_0 = _jst.convert_ifelse( label is not None, true_fn_1, false_fn_1, (__return_0, __return_value_0, label, x_v), - (__return_0, __return_value_0), (__return_0, __return_value_0))) + (__return_0, __return_value_0), (__return_0, __return_value_0)) def true_fn_2(__return_0, __return_value_0, x_v): - __return_1 = paddle.jit.dy2static.create_bool_as_type( - paddle.jit.dy2static.convert_logical_not(__return_0), True) + __return_1 = _jst.create_bool_as_type( + _jst.convert_logical_not(__return_0), True) __return_value_0 = x_v return __return_value_0 def false_fn_2(__return_value_0): return __return_value_0 - 
__return_value_0 = paddle.jit.dy2static.convert_ifelse( - paddle.jit.dy2static.convert_logical_not(__return_0), true_fn_2, - false_fn_2, (__return_0, __return_value_0, - x_v), (__return_value_0, ), (__return_value_0, )) + __return_value_0 = _jst.convert_ifelse( + _jst.convert_logical_not(__return_0), true_fn_2, false_fn_2, + (__return_0, __return_value_0, + x_v), (__return_value_0, ), (__return_value_0, )) return __return_value_0 @@ -128,40 +127,38 @@ def false_fn_3(x_v): x_v = x_v + 1 return x_v - x_v = paddle.jit.dy2static.convert_ifelse( + x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_3, false_fn_3, (x_v, ), (x_v, ), (x_v, )) - __return_2 = paddle.jit.dy2static.create_bool_as_type(label is not None, - False) + __return_2 = _jst.create_bool_as_type(label is not None, False) def true_fn_4(__return_2, __return_value_1, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) - __return_2 = paddle.jit.dy2static.create_bool_as_type( - label is not None, True) + __return_2 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = loss return __return_2, __return_value_1 def false_fn_4(__return_2, __return_value_1): return __return_2, __return_value_1 - __return_2, __return_value_1 = paddle.jit.dy2static.convert_ifelse( - label is not None, true_fn_4, false_fn_4, ( - __return_2, __return_value_1, label, x_v), + __return_2, __return_value_1 = _jst.convert_ifelse( + label is not None, true_fn_4, false_fn_4, + (__return_2, __return_value_1, label, x_v), (__return_2, __return_value_1), (__return_2, __return_value_1)) def true_fn_5(__return_2, __return_value_1, x_v): - __return_3 = paddle.jit.dy2static.create_bool_as_type( - paddle.jit.dy2static.convert_logical_not(__return_2), True) + __return_3 = _jst.create_bool_as_type( + _jst.convert_logical_not(__return_2), True) __return_value_1 = x_v return __return_value_1 def false_fn_5(__return_value_1): return __return_value_1 - __return_value_1 = paddle.jit.dy2static.convert_ifelse( - paddle.jit.dy2static.convert_logical_not(__return_2), true_fn_5, - false_fn_5, (__return_2, __return_value_1, - x_v), (__return_value_1, ), (__return_value_1, )) + __return_value_1 = _jst.convert_ifelse( + _jst.convert_logical_not(__return_2), true_fn_5, false_fn_5, + (__return_2, __return_value_1, + x_v), (__return_value_1, ), (__return_value_1, )) return __return_value_1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index efb69b530efc9..1a531c65bbf1e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -14,8 +14,10 @@ from __future__ import print_function +import os import math import time +import tempfile import unittest import numpy as np @@ -39,11 +41,6 @@ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ else fluid.CPUPlace() -MODEL_SAVE_DIR = "./inference" -MODEL_SAVE_PREFIX = "./inference/resnet" -MODEL_FILENAME = "resnet" + INFER_MODEL_SUFFIX -PARAMS_FILENAME = "resnet" + INFER_PARAMS_SUFFIX -DY_STATE_DICT_SAVE_PATH = "./resnet.dygraph" program_translator = ProgramTranslator() if fluid.is_compiled_with_cuda(): @@ -212,130 +209,148 @@ def __reader__(): return __reader__ -def train(to_static, build_strategy=None): - """ - Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. 
- """ - with fluid.dygraph.guard(place): - np.random.seed(SEED) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - - train_reader = paddle.batch( - reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), - batch_size=batch_size, - drop_last=True) - data_loader = fluid.io.DataLoader.from_generator( - capacity=5, iterable=True) - data_loader.set_sample_list_generator(train_reader) - - resnet = ResNet() - if to_static: - resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) - optimizer = optimizer_setting(parameter_list=resnet.parameters()) - - for epoch in range(epoch_num): - total_loss = 0.0 - total_acc1 = 0.0 - total_acc5 = 0.0 - total_sample = 0 - - for batch_id, data in enumerate(data_loader()): - start_time = time.time() - img, label = data - - pred = resnet(img) - loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(x=loss) - acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5) - - avg_loss.backward() - optimizer.minimize(avg_loss) - resnet.clear_gradients() - - total_loss += avg_loss - total_acc1 += acc_top1 - total_acc5 += acc_top5 - total_sample += 1 - - end_time = time.time() - if batch_id % 2 == 0: - print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \ - ( epoch, batch_id, total_loss.numpy() / total_sample, \ - total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) - if batch_id == 10: - if to_static: - fluid.dygraph.jit.save(resnet, MODEL_SAVE_PREFIX) - else: - fluid.dygraph.save_dygraph(resnet.state_dict(), - DY_STATE_DICT_SAVE_PATH) - # avoid dataloader throw abort signaal - data_loader._reset() - break - - return total_loss.numpy() - - -def predict_dygraph(data): - program_translator.enable(False) - with fluid.dygraph.guard(place): - resnet = ResNet() - - model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) - resnet.set_dict(model_dict) - resnet.eval() - - pred_res = resnet(fluid.dygraph.to_variable(data)) - - return pred_res.numpy() - - -def predict_static(data): - paddle.enable_static() - exe = fluid.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_DIR, - executor=exe, - model_filename=MODEL_FILENAME, - params_filename=PARAMS_FILENAME) - - pred_res = exe.run(inference_program, - feed={feed_target_names[0]: data}, - fetch_list=fetch_targets) - - return pred_res[0] - - -def predict_dygraph_jit(data): - with fluid.dygraph.guard(place): - resnet = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) - resnet.eval() - - pred_res = resnet(data) - - return pred_res.numpy() - - -def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, - [data]) - out = output() - return out +class ResNetHelper: + def __init__(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.model_save_dir = os.path.join(self.temp_dir.name, 'inference') + self.model_save_prefix = os.path.join(self.model_save_dir, 'resnet') + self.model_filename = 'resnet' + INFER_MODEL_SUFFIX + self.params_filename = 'resnet' + INFER_PARAMS_SUFFIX + self.dy_state_dict_save_path = os.path.join(self.temp_dir.name, + 'resnet.dygraph') + + def __del__(self): + self.temp_dir.cleanup() + + def train(self, to_static, build_strategy=None): + """ + Tests model decorated by `dygraph_to_static_output` in static mode. 
For users, the model is defined in dygraph mode and trained in static mode. + """ + with fluid.dygraph.guard(place): + np.random.seed(SEED) + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + train_reader = paddle.batch( + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + data_loader = fluid.io.DataLoader.from_generator( + capacity=5, iterable=True) + data_loader.set_sample_list_generator(train_reader) + + resnet = ResNet() + if to_static: + resnet = paddle.jit.to_static( + resnet, build_strategy=build_strategy) + optimizer = optimizer_setting(parameter_list=resnet.parameters()) + + for epoch in range(epoch_num): + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + + for batch_id, data in enumerate(data_loader()): + start_time = time.time() + img, label = data + + pred = resnet(img) + loss = fluid.layers.cross_entropy(input=pred, label=label) + avg_loss = fluid.layers.mean(x=loss) + acc_top1 = fluid.layers.accuracy( + input=pred, label=label, k=1) + acc_top5 = fluid.layers.accuracy( + input=pred, label=label, k=5) + + avg_loss.backward() + optimizer.minimize(avg_loss) + resnet.clear_gradients() + + total_loss += avg_loss + total_acc1 += acc_top1 + total_acc5 += acc_top5 + total_sample += 1 + + end_time = time.time() + if batch_id % 2 == 0: + print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \ + ( epoch, batch_id, total_loss.numpy() / total_sample, \ + total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) + if batch_id == 10: + if to_static: + fluid.dygraph.jit.save(resnet, + self.model_save_prefix) + else: + fluid.dygraph.save_dygraph( + resnet.state_dict(), + self.dy_state_dict_save_path) + # avoid dataloader throw abort signaal + data_loader._reset() + break + + return total_loss.numpy() + + def predict_dygraph(self, data): + program_translator.enable(False) + with fluid.dygraph.guard(place): + resnet = ResNet() + + model_dict, _ = fluid.dygraph.load_dygraph( + self.dy_state_dict_save_path) + resnet.set_dict(model_dict) + resnet.eval() + + pred_res = resnet(fluid.dygraph.to_variable(data)) + + return pred_res.numpy() + + def predict_static(self, data): + paddle.enable_static() + exe = fluid.Executor(place) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + self.model_save_dir, + executor=exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + + pred_res = exe.run(inference_program, + feed={feed_target_names[0]: data}, + fetch_list=fetch_targets) + + return pred_res[0] + + def predict_dygraph_jit(self, data): + with fluid.dygraph.guard(place): + resnet = fluid.dygraph.jit.load(self.model_save_prefix) + resnet.eval() + + pred_res = resnet(data) + + return pred_res.numpy() + + def predict_analysis_inference(self, data): + output = PredictorTools(self.model_save_dir, self.model_filename, + self.params_filename, [data]) + out = output() + return out class TestResnet(unittest.TestCase): + def setUp(self): + self.resnet_helper = ResNetHelper() + def train(self, to_static): program_translator.enable(to_static) - return train(to_static) + return self.resnet_helper.train(to_static) def verify_predict(self): image = np.random.random([1, 3, 224, 224]).astype('float32') - dy_pre = predict_dygraph(image) - st_pre = predict_static(image) - dy_jit_pre = predict_dygraph_jit(image) - predictor_pre = predict_analysis_inference(image) + dy_pre = 
self.resnet_helper.predict_dygraph(image) + st_pre = self.resnet_helper.predict_static(image) + dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image) + predictor_pre = self.resnet_helper.predict_analysis_inference(image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -360,7 +375,7 @@ def test_in_static_mode_mkldnn(self): fluid.set_flags({'FLAGS_use_mkldnn': True}) try: if paddle.fluid.core.is_compiled_with_mkldnn(): - train(to_static=True) + self.resnet_helper.train(to_static=True) finally: fluid.set_flags({'FLAGS_use_mkldnn': False}) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index 0cf96b7159579..c79a86015eb4e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -19,6 +19,7 @@ import math import time import unittest +import tempfile import numpy as np @@ -37,11 +38,6 @@ place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ else paddle.CPUPlace() -MODEL_SAVE_DIR = "./inference" -MODEL_SAVE_PREFIX = "./inference/resnet_v2" -MODEL_FILENAME = "resnet_v2" + paddle.fluid.dygraph.io.INFER_MODEL_SUFFIX -PARAMS_FILENAME = "resnet_v2" + paddle.fluid.dygraph.io.INFER_PARAMS_SUFFIX -DY_STATE_DICT_SAVE_PATH = "./resnet_v2.dygraph" program_translator = paddle.jit.ProgramTranslator() if paddle.is_compiled_with_cuda(): @@ -210,133 +206,145 @@ def __reader__(): return __reader__ -def train(to_static): - """ - Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. - """ - paddle.disable_static(place) - np.random.seed(SEED) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - - train_reader = paddle.batch( - reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), - batch_size=batch_size, - drop_last=True) - data_loader = paddle.io.DataLoader.from_generator(capacity=5, iterable=True) - data_loader.set_sample_list_generator(train_reader) - - resnet = ResNet() - optimizer = optimizer_setting(parameter_list=resnet.parameters()) - - for epoch in range(epoch_num): - total_loss = 0.0 - total_acc1 = 0.0 - total_acc5 = 0.0 - total_sample = 0 - - for batch_id, data in enumerate(data_loader()): - start_time = time.time() - img, label = data - - pred = resnet(img) - loss = paddle.nn.functional.cross_entropy(input=pred, label=label) - avg_loss = paddle.mean(x=loss) - acc_top1 = paddle.metric.accuracy(input=pred, label=label, k=1) - acc_top5 = paddle.metric.accuracy(input=pred, label=label, k=5) - - avg_loss.backward() - optimizer.minimize(avg_loss) - resnet.clear_gradients() - - total_loss += avg_loss - total_acc1 += acc_top1 - total_acc5 += acc_top5 - total_sample += 1 - - end_time = time.time() - if batch_id % 2 == 0: - print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \ - ( epoch, batch_id, total_loss.numpy() / total_sample, \ - total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) - if batch_id == 10: - if to_static: - paddle.jit.save(resnet, MODEL_SAVE_PREFIX) - else: - paddle.fluid.dygraph.save_dygraph(resnet.state_dict(), - DY_STATE_DICT_SAVE_PATH) - # avoid dataloader throw abort signaal - data_loader._reset() - break - paddle.enable_static() - - return total_loss.numpy() - - -def predict_dygraph(data): - 
program_translator.enable(False) - paddle.disable_static(place) - resnet = ResNet() - - model_dict, _ = paddle.fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) - resnet.set_dict(model_dict) - resnet.eval() - - pred_res = resnet( - paddle.to_tensor( - data=data, dtype=None, place=None, stop_gradient=True)) - - ret = pred_res.numpy() - paddle.enable_static() - return ret - - -def predict_static(data): - exe = paddle.static.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = paddle.static.load_inference_model( - MODEL_SAVE_DIR, - executor=exe, - model_filename=MODEL_FILENAME, - params_filename=PARAMS_FILENAME) - - pred_res = exe.run(inference_program, - feed={feed_target_names[0]: data}, - fetch_list=fetch_targets) - - return pred_res[0] - - -def predict_dygraph_jit(data): - paddle.disable_static(place) - resnet = paddle.jit.load(MODEL_SAVE_PREFIX) - resnet.eval() - - pred_res = resnet(data) - - ret = pred_res.numpy() - paddle.enable_static() - return ret - - -def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, - [data]) - out = output() - return out - - class TestResnet(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + self.model_save_dir = os.path.join(self.temp_dir.name, "./inference") + self.model_save_prefix = os.path.join(self.temp_dir.name, + "./inference/resnet_v2") + self.model_filename = "resnet_v2" + paddle.fluid.dygraph.io.INFER_MODEL_SUFFIX + self.params_filename = "resnet_v2" + paddle.fluid.dygraph.io.INFER_PARAMS_SUFFIX + self.dy_state_dict_save_path = os.path.join(self.temp_dir.name, + "./resnet_v2.dygraph") + + def tearDown(self): + self.temp_dir.cleanup() + + def do_train(self, to_static): + """ + Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. 
+ """ + paddle.disable_static(place) + np.random.seed(SEED) + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + train_reader = paddle.batch( + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + data_loader = paddle.io.DataLoader.from_generator( + capacity=5, iterable=True) + data_loader.set_sample_list_generator(train_reader) + + resnet = ResNet() + optimizer = optimizer_setting(parameter_list=resnet.parameters()) + + for epoch in range(epoch_num): + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + + for batch_id, data in enumerate(data_loader()): + start_time = time.time() + img, label = data + + pred = resnet(img) + loss = paddle.nn.functional.cross_entropy( + input=pred, label=label) + avg_loss = paddle.mean(x=loss) + acc_top1 = paddle.metric.accuracy(input=pred, label=label, k=1) + acc_top5 = paddle.metric.accuracy(input=pred, label=label, k=5) + + avg_loss.backward() + optimizer.minimize(avg_loss) + resnet.clear_gradients() + + total_loss += avg_loss + total_acc1 += acc_top1 + total_acc5 += acc_top5 + total_sample += 1 + + end_time = time.time() + if batch_id % 2 == 0: + print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \ + ( epoch, batch_id, total_loss.numpy() / total_sample, \ + total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) + if batch_id == 10: + if to_static: + paddle.jit.save(resnet, self.model_save_prefix) + else: + paddle.fluid.dygraph.save_dygraph( + resnet.state_dict(), self.dy_state_dict_save_path) + # avoid dataloader throw abort signaal + data_loader._reset() + break + paddle.enable_static() + + return total_loss.numpy() + + def predict_dygraph(self, data): + program_translator.enable(False) + paddle.disable_static(place) + resnet = ResNet() + + model_dict, _ = paddle.fluid.dygraph.load_dygraph( + self.dy_state_dict_save_path) + resnet.set_dict(model_dict) + resnet.eval() + + pred_res = resnet( + paddle.to_tensor( + data=data, dtype=None, place=None, stop_gradient=True)) + + ret = pred_res.numpy() + paddle.enable_static() + return ret + + def predict_static(self, data): + exe = paddle.static.Executor(place) + [inference_program, feed_target_names, + fetch_targets] = paddle.static.load_inference_model( + self.model_save_dir, + executor=exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + + pred_res = exe.run(inference_program, + feed={feed_target_names[0]: data}, + fetch_list=fetch_targets) + + return pred_res[0] + + def predict_dygraph_jit(self, data): + paddle.disable_static(place) + resnet = paddle.jit.load(self.model_save_prefix) + resnet.eval() + + pred_res = resnet(data) + + ret = pred_res.numpy() + paddle.enable_static() + return ret + + def predict_analysis_inference(self, data): + output = PredictorTools(self.model_save_dir, self.model_filename, + self.params_filename, [data]) + out = output() + return out + def train(self, to_static): program_translator.enable(to_static) - return train(to_static) + return self.do_train(to_static) def verify_predict(self): image = np.random.random([1, 3, 224, 224]).astype('float32') - dy_pre = predict_dygraph(image) - st_pre = predict_static(image) - dy_jit_pre = predict_dygraph_jit(image) - predictor_pre = predict_analysis_inference(image) + dy_pre = self.predict_dygraph(image) + st_pre = self.predict_static(image) + dy_jit_pre = self.predict_dygraph_jit(image) + predictor_pre = self.predict_analysis_inference(image) 
self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -361,7 +369,7 @@ def test_in_static_mode_mkldnn(self): paddle.fluid.set_flags({'FLAGS_use_mkldnn': True}) try: if paddle.fluid.core.is_compiled_with_mkldnn(): - train(to_static=True) + self.train(to_static=True) finally: paddle.fluid.set_flags({'FLAGS_use_mkldnn': False}) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py index b431d5ae048a9..794aa17038cd6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py @@ -15,6 +15,7 @@ from __future__ import print_function import os +import tempfile import unittest import numpy as np @@ -48,6 +49,12 @@ def forward(self, x): class TestDyToStaticSaveInferenceModel(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_save_inference_model(self): fc_size = 20 x_data = np.random.random((fc_size, fc_size)).astype('float32') @@ -66,8 +73,10 @@ def test_save_inference_model(self): adam.minimize(loss) layer.clear_gradients() # test for saving model in dygraph.guard - infer_model_prefix = "./test_dy2stat_inference_in_guard/model" - infer_model_dir = "./test_dy2stat_inference_in_guard" + infer_model_prefix = os.path.join( + self.temp_dir.name, "test_dy2stat_inference_in_guard/model") + infer_model_dir = os.path.join(self.temp_dir.name, + "test_dy2stat_inference_in_guard") fluid.dygraph.jit.save( layer=layer, path=infer_model_prefix, @@ -90,8 +99,10 @@ def check_save_inference_model(self, expected_persistable_vars = set([p.name for p in model.parameters()]) - infer_model_prefix = "./test_dy2stat_inference/model" - infer_model_dir = "./test_dy2stat_inference" + infer_model_prefix = os.path.join(self.temp_dir.name, + "test_dy2stat_inference/model") + infer_model_dir = os.path.join(self.temp_dir.name, + "test_dy2stat_inference") model_filename = "model" + INFER_MODEL_SUFFIX params_filename = "model" + INFER_PARAMS_SUFFIX fluid.dygraph.jit.save( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_load.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_load.py index 54eefe7c4f21d..c5677756f501d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_load.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_load.py @@ -15,6 +15,8 @@ from __future__ import print_function import unittest +import os +import tempfile import numpy as np import paddle.fluid as fluid @@ -30,6 +32,14 @@ class TestDyToStaticSaveLoad(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, + "test_dy2stat_save_load") + + def tearDown(self): + self.temp_dir.cleanup() + def test_save_load_same_result(self): program_translator = ProgramTranslator() x_data = np.random.randn(30, 10, 32).astype('float32') @@ -50,7 +60,8 @@ def test_save_load_same_result(self): adam.minimize(static_loss) net.clear_gradients() # Save parameters - fluid.save_dygraph(net.state_dict(), "./test_dy2stat_save_load") + + fluid.save_dygraph(net.state_dict(), self.model_path) # minimize() will update parameter, call net() to get output and avg_loss. # Switch into eval mode. 
net.eval() @@ -61,7 +72,7 @@ def test_save_load_same_result(self): dygraph_net = Linear(32, 64) # Load parameters - model_dict, _ = fluid.load_dygraph("./test_dy2stat_save_load") + model_dict, _ = fluid.load_dygraph(self.model_path) dygraph_net.set_dict(model_dict) # Switch into eval mode. dygraph_net.eval() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index eba7e238bb590..7ac1f40de99eb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -16,6 +16,8 @@ import math import time import unittest +import os +import tempfile import numpy as np import paddle @@ -35,11 +37,6 @@ EPOCH_NUM = 1 PRINT_STEP = 2 STEP_NUM = 10 -MODEL_SAVE_DIR = "./inference" -MODEL_SAVE_PREFIX = "./inference/se_resnet" -MODEL_FILENAME = "se_resnet" + INFER_MODEL_SUFFIX -PARAMS_FILENAME = "se_resnet" + INFER_PARAMS_SUFFIX -DY_STATE_DICT_SAVE_PATH = "./se_resnet.dygraph" place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ else fluid.CPUPlace() @@ -327,129 +324,6 @@ def forward(self, inputs, label): return out, avg_loss, acc_top1, acc_top5 -def train(train_reader, to_static): - program_translator = ProgramTranslator() - program_translator.enable(to_static) - - np.random.seed(SEED) - - with fluid.dygraph.guard(place): - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - se_resnext = SeResNeXt() - optimizer = optimizer_setting(train_parameters, se_resnext.parameters()) - - for epoch_id in range(EPOCH_NUM): - total_loss = 0.0 - total_acc1 = 0.0 - total_acc5 = 0.0 - total_sample = 0 - step_idx = 0 - speed_list = [] - for step_id, data in enumerate(train_reader()): - dy_x_data = np.array([x[0].reshape(3, 224, 224) - for x in data]).astype('float32') - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - BATCH_SIZE, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) - label.stop_gradient = True - - pred, avg_loss, acc_top1, acc_top5 = se_resnext(img, label) - - dy_out = avg_loss.numpy() - avg_loss.backward() - - optimizer.minimize(avg_loss) - se_resnext.clear_gradients() - - lr = optimizer._global_learning_rate().numpy() - total_loss += dy_out - total_acc1 += acc_top1.numpy() - total_acc5 += acc_top5.numpy() - total_sample += 1 - if step_id % PRINT_STEP == 0: - if step_id == 0: - logging.info( "epoch %d | step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f" % \ - ( epoch_id, step_id, total_loss / total_sample, \ - total_acc1 / total_sample, total_acc5 / total_sample)) - avg_batch_time = time.time() - else: - speed = PRINT_STEP / (time.time() - avg_batch_time) - speed_list.append(speed) - logging.info( "epoch %d | step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, speed %.3f steps/s" % \ - ( epoch_id, step_id, total_loss / total_sample, \ - total_acc1 / total_sample, total_acc5 / total_sample, speed)) - avg_batch_time = time.time() - - step_idx += 1 - if step_idx == STEP_NUM: - if to_static: - fluid.dygraph.jit.save( - se_resnext, - MODEL_SAVE_PREFIX, [img], - output_spec=[pred]) - else: - fluid.dygraph.save_dygraph(se_resnext.state_dict(), - DY_STATE_DICT_SAVE_PATH) - break - return pred.numpy(), avg_loss.numpy(), acc_top1.numpy(), acc_top5.numpy( - ) - - -def predict_dygraph(data): - program_translator = ProgramTranslator() - program_translator.enable(False) - with fluid.dygraph.guard(place): - se_resnext = SeResNeXt() - - model_dict, _ = 
fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) - se_resnext.set_dict(model_dict) - se_resnext.eval() - - label = np.random.random([1, 1]).astype("int64") - img = fluid.dygraph.to_variable(data) - label = fluid.dygraph.to_variable(label) - pred_res, _, _, _ = se_resnext(img, label) - - return pred_res.numpy() - - -def predict_static(data): - paddle.enable_static() - exe = fluid.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_DIR, - executor=exe, - model_filename=MODEL_FILENAME, - params_filename=PARAMS_FILENAME) - - pred_res = exe.run(inference_program, - feed={feed_target_names[0]: data}, - fetch_list=fetch_targets) - - return pred_res[0] - - -def predict_dygraph_jit(data): - with fluid.dygraph.guard(place): - se_resnext = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) - se_resnext.eval() - - pred_res = se_resnext(data) - - return pred_res.numpy() - - -def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, - [data]) - out = output() - return out - - class TestSeResnet(unittest.TestCase): def setUp(self): self.train_reader = paddle.batch( @@ -457,13 +331,148 @@ def setUp(self): use_xmap=False, cycle=True), batch_size=BATCH_SIZE, drop_last=True) + self.temp_dir = tempfile.TemporaryDirectory() + + self.model_save_dir = os.path.join(self.temp_dir.name, "inference") + self.model_save_prefix = os.path.join(self.temp_dir.name, + "inference/se_resnet") + self.model_filename = "se_resnet" + INFER_MODEL_SUFFIX + self.params_filename = "se_resnet" + INFER_PARAMS_SUFFIX + self.dy_state_dict_save_path = os.path.join(self.temp_dir.name, + "se_resnet.dygraph") + + def tearDown(self): + self.temp_dir.cleanup() + + def train(self, train_reader, to_static): + program_translator = ProgramTranslator() + program_translator.enable(to_static) + + np.random.seed(SEED) + + with fluid.dygraph.guard(place): + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + se_resnext = SeResNeXt() + optimizer = optimizer_setting(train_parameters, + se_resnext.parameters()) + + for epoch_id in range(EPOCH_NUM): + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + step_idx = 0 + speed_list = [] + for step_id, data in enumerate(train_reader()): + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape( + BATCH_SIZE, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label.stop_gradient = True + + pred, avg_loss, acc_top1, acc_top5 = se_resnext(img, label) + + dy_out = avg_loss.numpy() + avg_loss.backward() + + optimizer.minimize(avg_loss) + se_resnext.clear_gradients() + + lr = optimizer._global_learning_rate().numpy() + total_loss += dy_out + total_acc1 += acc_top1.numpy() + total_acc5 += acc_top5.numpy() + total_sample += 1 + if step_id % PRINT_STEP == 0: + if step_id == 0: + logging.info( "epoch %d | step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f" % \ + ( epoch_id, step_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + avg_batch_time = time.time() + else: + speed = PRINT_STEP / (time.time() - avg_batch_time) + speed_list.append(speed) + logging.info( "epoch %d | step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, speed %.3f steps/s" % \ + ( epoch_id, step_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample, speed)) + avg_batch_time = time.time() + + step_idx += 1 + if 
step_idx == STEP_NUM: + if to_static: + fluid.dygraph.jit.save( + se_resnext, + self.model_save_prefix, [img], + output_spec=[pred]) + else: + fluid.dygraph.save_dygraph( + se_resnext.state_dict(), + self.dy_state_dict_save_path) + break + return pred.numpy(), avg_loss.numpy(), acc_top1.numpy( + ), acc_top5.numpy() + + def predict_dygraph(self, data): + program_translator = ProgramTranslator() + program_translator.enable(False) + with fluid.dygraph.guard(place): + se_resnext = SeResNeXt() + + model_dict, _ = fluid.dygraph.load_dygraph( + self.dy_state_dict_save_path) + se_resnext.set_dict(model_dict) + se_resnext.eval() + + label = np.random.random([1, 1]).astype("int64") + img = fluid.dygraph.to_variable(data) + label = fluid.dygraph.to_variable(label) + pred_res, _, _, _ = se_resnext(img, label) + + return pred_res.numpy() + + def predict_static(self, data): + paddle.enable_static() + exe = fluid.Executor(place) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + self.model_save_dir, + executor=exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + + pred_res = exe.run(inference_program, + feed={feed_target_names[0]: data}, + fetch_list=fetch_targets) + + return pred_res[0] + + def predict_dygraph_jit(self, data): + with fluid.dygraph.guard(place): + se_resnext = fluid.dygraph.jit.load(self.model_save_prefix) + se_resnext.eval() + + pred_res = se_resnext(data) + + return pred_res.numpy() + + def predict_analysis_inference(self, data): + output = PredictorTools(self.model_save_dir, self.model_filename, + self.params_filename, [data]) + out = output() + return out def verify_predict(self): image = np.random.random([1, 3, 224, 224]).astype('float32') - dy_pre = predict_dygraph(image) - st_pre = predict_static(image) - dy_jit_pre = predict_dygraph_jit(image) - predictor_pre = predict_analysis_inference(image) + dy_pre = self.predict_dygraph(image) + st_pre = self.predict_static(image) + dy_jit_pre = self.predict_dygraph_jit(image) + predictor_pre = self.predict_analysis_inference(image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -483,9 +492,9 @@ def verify_predict(self): flat_predictor_pre[i], flat_st_pre[i])) def test_check_result(self): - pred_1, loss_1, acc1_1, acc5_1 = train( + pred_1, loss_1, acc1_1, acc5_1 = self.train( self.train_reader, to_static=False) - pred_2, loss_2, acc1_2, acc5_2 = train( + pred_2, loss_2, acc1_2, acc5_2 = self.train( self.train_reader, to_static=True) self.assertTrue( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py index a363526530d11..bc462ab8c95fa 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import tempfile import time import unittest @@ -22,7 +23,7 @@ from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from seq2seq_dygraph_model import BaseModel, AttentionModel -from seq2seq_utils import Seq2SeqModelHyperParams as args +from seq2seq_utils import Seq2SeqModelHyperParams from seq2seq_utils import get_data_iter place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace( ) @@ -43,7 +44,7 @@ def prepare_input(batch): return inputs, np.sum(tar_mask) -def train(attn_model=False): +def train(args, attn_model=False): with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = 2020 fluid.default_main_program().random_seed = 2020 @@ -117,7 +118,7 @@ def train(attn_model=False): return loss.numpy() -def infer(attn_model=False): +def infer(args, attn_model=False): with fluid.dygraph.guard(place): if attn_model: @@ -160,19 +161,32 @@ def infer(attn_model=False): class TestSeq2seq(unittest.TestCase): + def setUp(self): + self.args = Seq2SeqModelHyperParams + self.temp_dir = tempfile.TemporaryDirectory() + self.args.base_model_path = os.path.join(self.temp_dir.name, + self.args.base_model_path) + self.args.attn_model_path = os.path.join(self.temp_dir.name, + self.args.attn_model_path) + self.args.reload_model = os.path.join(self.temp_dir.name, + self.args.reload_model) + + def tearDown(self): + self.temp_dir.cleanup() + def run_dygraph(self, mode="train", attn_model=False): program_translator.enable(False) if mode == "train": - return train(attn_model) + return train(self.args, attn_model) else: - return infer(attn_model) + return infer(self.args, attn_model) def run_static(self, mode="train", attn_model=False): program_translator.enable(True) if mode == "train": - return train(attn_model) + return train(self.args, attn_model) else: - return infer(attn_model) + return infer(self.args, attn_model) def _test_train(self, attn_model=False): dygraph_loss = self.run_dygraph(mode="train", attn_model=attn_model) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py index f486cbc27dca5..eecb6d8b75842 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py @@ -13,7 +13,8 @@ # limitations under the License. 
from __future__ import print_function - +import os +import tempfile import unittest import numpy as np @@ -166,15 +167,20 @@ def init_dygraph_func(self): class TestSetValueWithLayerAndSave(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, + "layer_use_set_value") + + def tearDown(self): + self.temp_dir.cleanup() + def test_set_value_with_save(self): prog_trans.enable(True) model = LayerWithSetValue(input_dim=10, hidden=1) x = paddle.full(shape=[5, 10], fill_value=5.0, dtype="float32") paddle.jit.save( - layer=model, - path="./layer_use_set_value", - input_spec=[x], - output_spec=None) + layer=model, path=self.model_path, input_spec=[x], output_spec=None) class TestSliceSupplementSpecialCase(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py index bafc4707c4ad9..361fcbf9c73f5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py @@ -16,8 +16,6 @@ from paddle.nn import Layer import numpy as np import unittest -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() class Net(Layer): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index d05be03bbfb19..5cf9d7749c358 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -597,9 +597,11 @@ def test(self): class TestPaddleShape(unittest.TestCase): def test_paddle_shape(self): func = paddle.jit.to_static(dyfunc_len_paddle_shape) - self.assertEqual('paddle.shape(x)' in func.code, True) + func_code = func.code.replace("\n", "").replace(" ", "") + self.assertEqual('paddle.shape(x)' in func_code, True) func = paddle.jit.to_static(dyfunc_dict_assign_shape) - self.assertEqual("__static_convert_var_shape_suffix" in func.code, True) + func_code = func.code.replace("\n", "").replace(" ", "") + self.assertEqual("__static_convert_var_shape_suffix" in func_code, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py index 06f2c60dfae9f..c8fe3e3932914 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py @@ -15,6 +15,7 @@ import logging import os import time +import tempfile import unittest import numpy as np @@ -371,8 +372,21 @@ def predict_static(args, batch_generator): class TestTransformer(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def prepare(self, mode='train'): args = util.ModelHyperParams() + args.save_dygraph_model_path = os.path.join( + self.temp_dir.name, args.save_dygraph_model_path) + args.save_static_model_path = os.path.join(self.temp_dir.name, + args.save_static_model_path) + args.inference_model_dir = os.path.join(self.temp_dir.name, + args.inference_model_dir) + args.output_file = os.path.join(self.temp_dir.name, args.output_file) batch_generator = util.get_feed_data_reader(args, mode) return args, batch_generator
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typing.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typing.py index c3c0453bde3f4..7017cdda9cd23 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typing.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typing.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import os +import tempfile import paddle import unittest import numpy as np @@ -72,11 +73,16 @@ def setUp(self): self.x = paddle.randn([4, 16]) self.spec = [paddle.static.InputSpec(shape=[None, 16], dtype='float32')] + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def build_net(self): return LinearNetWithTuple(self.in_num, self.out_num) def save_and_load(self, suffix=''): - path = './layer_typing_' + suffix + path = os.path.join(self.temp_dir.name, 'layer_typing_' + suffix) paddle.jit.save(self.net, path, input_spec=self.spec) return paddle.jit.load(path) diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 2583d9409a0a7..ad11083b67773 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -128,6 +128,11 @@ def setUpClass(cls): cls.fetch_list: List[str] = None cls.output_dict: Optional[Dict] = {} + def tearDown(self): + # Manual reset when using ipumodel + if self.use_ipumodel(): + paddle.framework.core.IpuBackend.get_instance().reset() + @property def fp16_enabled(self): return True diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index 45f75f1b4df81..21a6655406729 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -27,12 +27,13 @@ def test_set_options(self): ipu_strategy = paddle.static.IpuStrategy() all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() skip_options = [] + skip_options.append( + 'mean_accumulation_and_replication_reduction_strategy') skip_options.append('random_seed') for option_name in all_option_names: if option_name in skip_options: continue - option = ipu_strategy._ipu_strategy.get_option(option_name) option_type = option['type'] option_value = option['value'] @@ -67,7 +68,7 @@ def test_set_string_options(self): def test_set_other_options(self): ipu_strategy = paddle.static.IpuStrategy() options = {} - options['dot_checks'] = ['0', '1', '2', '3'] + options['dot_checks'] = ['Fwd0', 'Fwd1', 'Bwd0', 'PreAlias', "Final"] options['engine_options'] = { 'debug.allowOutOfMemory': 'true', 'autoReport.directory': 'path', @@ -76,7 +77,12 @@ def test_set_other_options(self): options['random_seed'] = 1234 for k, v in options.items(): ipu_strategy.set_options({k: v}) - assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed " + if (isinstance(v, list)): + assert v.sort() == ipu_strategy.get_option(k).sort( + ), f"set {k} to {v} failed " + else: + assert v == ipu_strategy.get_option( + k), f"set {k} to {v} failed " # The custom logger need 2 int as inputs logger = lambda progress, total: print(f"compile progrss: {progress}/{total}") diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py index 792b88849faf3..884162d336f35 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py @@ -148,6 +148,36 @@ def set_data_feed(self): } +class TestReplicaCollectiveInference(TestBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "accumulate_outer_fragment": { + 0: [] + }, + "replicated_collectives_settings": { + "prepare_schedule_for_merging_collectives": True, + "merge_all_reduce_collectives": True + } + } + self.cpu_bs = 1 + self.ipu_bs = 1 + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np_image} + self.feed_ipu = { + "image": + np.tile(np_image, + [self.ipu_options['replicated_graph_count'], 1, 1, 1]) + } + + class TestPipelineInference(TestBase): def set_attrs(self): self.ipu_options = { @@ -190,6 +220,36 @@ def set_attrs(self): class TestReplicaTrain(TestTrainBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2 + } + self.cpu_bs = 2 + self.ipu_bs = 1 + self.optimizer = 'sgd' + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np.tile(np_image, [self.cpu_bs, 1, 1, 1])} + self.feed_ipu = { + "image": + np.tile(np_image, + [self.ipu_options['replicated_graph_count'], 1, 1, 1]) + } + + def test(self): + cpu_outputs = self._test_base(False) + ipu_outputs = self._test_base(True)[::2] + + self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=self.atol)) + + +class TestReplicaCollectiveTrain(TestTrainBase): def set_attrs(self): self.ipu_options = { "batches_per_step": 1, @@ -198,6 +258,13 @@ def set_attrs(self): "accumulation_factor": 1, "enable_replicated_graphs": True, "replicated_graph_count": 2, + "accumulate_outer_fragment": { + 0: [] + }, + "replicated_collectives_settings": { + "prepare_schedule_for_merging_collectives": True, + "merge_all_reduce_collectives": True + } } self.cpu_bs = 2 self.ipu_bs = 1 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 661fbbc7759c6..4717dfa1eab52 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -143,5 +143,6 @@ if (WITH_MKLDNN) set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_fc_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_fc_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass PROPERTIES TIMEOUT 60) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index bb8c6e73fdefa..161c785ef8565 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -388,7 +388,7 @@ def run_test(prog_config): used_time = time.time() - 
start_time if max_duration > 0 and used_time > max_duration: logging.error( - "The duration exceeds {} seconds, if this is neccessary, try to set a larger number for parameter `max_duration`.". + "The duration exceeds {} seconds, if this is necessary, try to set a larger number for parameter `max_duration`.". format(max_duration)) assert False diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py new file mode 100644 index 0000000000000..8cacb6d29af0d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_identity_scale_clean_pass.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import paddle.inference as paddle_infer +import unittest +import hypothesis.strategies as st + + +class TestIdentityScaleCleanPass(PassAutoScanTest): + def sample_predictor_configs(self, program_config): + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=8, + workspace_size=0, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Float32, + use_static=False, + use_calib_mode=False) + yield config, ['relu'], (1e-5, 1e-5) + + def sample_program_config(self, draw): + bias_after_scale = draw(st.booleans()) + n = draw(st.integers(min_value=1, max_value=4)) + c = draw(st.integers(min_value=1, max_value=20)) + h = draw(st.integers(min_value=1, max_value=20)) + w = draw(st.integers(min_value=1, max_value=20)) + + relu_op = OpConfig( + "relu", inputs={"X": ["relu_x"]}, outputs={"Out": ["relu_out"]}) + scale_op = OpConfig( + "scale", + inputs={"X": ["relu_out"]}, + outputs={"Out": ["scale_out"]}, + bias=0., + scale=1., + bias_after_scale=True) + program_config = ProgramConfig( + ops=[relu_op, scale_op], + weights={}, + inputs={"relu_x": TensorConfig(shape=[n, c, h, w])}, + outputs=["scale_out"]) + return program_config + + def test(self): + self.run_and_statis( + max_examples=25, passes=["identity_scale_op_clean_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py new file mode 100644 index 0000000000000..a35b75e69f812 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestConvAffineChannelFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.integers(min_value=1, max_value=3)) + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + axis = draw(st.sampled_from([1])) + filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 + filter_size = draw(st.integers(min_value=1, max_value=4)) + in_channel = groups * filter_channel + out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 + out_channel = groups * out_channel_factor + batch_size = draw(st.integers(min_value=1, max_value=4)) + dilations = draw( + st.lists( + st.integers( + min_value=1, max_value=2), min_size=2, max_size=2)) + paddings = draw( + st.lists( + st.integers( + min_value=0, max_value=2), min_size=2, max_size=2)) + strides = draw( + st.lists( + st.integers( + min_value=1, max_value=2), min_size=2, max_size=2)) + has_bias = draw(st.booleans()) + + x_shape = [ + batch_size, in_channel, 64, 64 + ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] + w_shape = [out_channel, filter_channel, filter_size, filter_size] + scale_shape = [out_channel] + bias_shape = [out_channel] + + def generate_input(): + return np.random.random(x_shape).astype(np.float32) + + def generate_weight(): + return np.random.random(w_shape).astype(np.float32) + + def generate_bias(): + return np.random.random(bias_shape).astype(np.float32) + + def generate_scale_bias(): + return np.random.random(bias_shape).astype(np.float32) + + conv2d_op = OpConfig( + "conv2d", + inputs={ + "Input": ["input_data"], + "Filter": ["conv2d_weight"], + }, + outputs={"Output": ["conv_output"]}, + data_format=data_format, + dilations=dilations, + padding_algorithm=padding_algorithm, + groups=groups, + paddings=paddings, + strides=strides, + has_bias=has_bias, + is_test=True) + ac_op = OpConfig( + "affine_channel", + inputs={ + "X": ["conv_output"], + "Scale": ["affine_channel_scale"], + "Bias": ["affine_channel_bias"] + }, + outputs={"Out": ["affine_channel_ouput"]}, + data_layout=data_format) + if has_bias == True: + conv2d_op.inputs["Bias"] = ["conv2d_bias"] + ops = [conv2d_op, ac_op] + + program_config = ProgramConfig( + ops=ops, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)), + }, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial(generate_weight)), + "conv2d_bias": TensorConfig(data_gen=partial(generate_bias)), + "affine_channel_scale": + TensorConfig(data_gen=partial(generate_scale_bias)), + "affine_channel_bias": + 
TensorConfig(data_gen=partial(generate_scale_bias)), + }, + outputs=["affine_channel_ouput"]) + if has_bias == True: + program_config.weights["conv2d_bias"] = TensorConfig( + data_gen=partial(generate_bias)) + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) + + def add_ignore_pass_case(self): + # If the problem has been fixed, the judgment + # in is_program_valid needs to be deleted!!! + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs['data_format'] == "NHWC": + return True + return False + + # mkldnn Output has diff with bias! + def teller2(program_config, predictor_config): + return predictor_config.mkldnn_enabled() and program_config.ops[ + 0].attrs['has_bias'] == True + + self.add_ignore_check_case( + teller1, IgnoreReasons.PASS_ACCURACY_ERROR, + "The output format of conv2d is wrong when data_format attribute is NHWC, \ + because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)." + ) + + self.add_ignore_check_case( + teller2, IgnoreReasons.PASS_ACCURACY_ERROR, + "Currently mkldnn Output has diff with bias!") + + def test(self): + self.run_and_statis( + quant=False, + passes=["conv_affine_channel_mkldnn_fuse_pass"], ) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py new file mode 100644 index 0000000000000..31415f6472587 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py @@ -0,0 +1,146 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import unittest + +import hypothesis.strategies as st + + +class TestInt8ScaleCalculationMkldnnPass(PassAutoScanTest): + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_gpu=False) + config.pass_builder().append_pass("int8_scale_calculation_mkldnn_pass") + yield config, ["conv2d"], (1e-4, 1e-5) + + def is_program_valid(self, prog_config): + paddings = prog_config.ops[0].attrs["paddings"] + strides = prog_config.ops[0].attrs["strides"] + groups = prog_config.ops[0].attrs["groups"] + padding_algorithm = prog_config.ops[0].attrs["padding_algorithm"] + dilations = prog_config.ops[0].attrs["dilations"] + data_format = prog_config.ops[0].attrs["data_format"] + filter_shape = prog_config.weights["filter"].shape + input_shape = prog_config.inputs["input_x"].shape + if padding_algorithm == "VALID": + if ((input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \ + ((input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1: + return False + if padding_algorithm == "EXPLICIT": + if ((input_shape[2] + paddings[0] + paddings[1] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \ + ((input_shape[3] + paddings[2] + paddings[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1: + return False + if data_format == "NCHW": + if input_shape[1] != filter_shape[1] * groups: + return False + if filter_shape[0] % groups != 0: + return False + else: + if input_shape[3] != filter_shape[1] * groups: + return False + if filter_shape[0] % groups != 0: + return False + return True + + def sample_program_config(self, draw): + x_shape = draw( + st.lists( + st.integers( + min_value=5, max_value=100), min_size=4, max_size=4)) + x_shape[1] = draw(st.integers(min_value=5, max_value=10)) + + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + + f_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=4, max_size=4)) + if data_format == "NCHW": + f_shape[1] = x_shape[1] + else: + f_shape[1] = x_shape[3] + + strides = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=2)) + + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + + padding = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=4, max_size=4)) + + groups = draw(st.integers(min_value=1, max_value=3)) + + dilations = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=2)) + + bias_shape = [f_shape[0]] + inputs = dict() + weights = dict() + use_mkldnn = True + + has_bias = draw(st.booleans()) + if has_bias: + inputs = { + "Input": ["input_x"], + "Filter": ["filter"], + } + weights = { + "filter": TensorConfig(shape=f_shape), + "bias": TensorConfig(shape=bias_shape), + } + else: + inputs = { + "Input": ["input_x"], + "Filter": ["filter"], + } + weights = {"filter": TensorConfig(shape=f_shape), } + + conv2d_op = OpConfig( + "conv2d", + inputs=inputs, + outputs={"Output": ["conv2d_out"]}, + strides=strides, + padding_algorithm=padding_algorithm, + paddings=padding, + groups=groups, + dilations=dilations, + data_format=data_format, + use_mkldnn=use_mkldnn, + mkldnn_data_type="int8") + + ops = [conv2d_op] + + program_config = ProgramConfig( + ops=ops, + weights=weights, + inputs={"input_x": TensorConfig(shape=x_shape)}, + outputs=["conv2d_out"]) + return program_config + + def test(self): 
+ self.run_and_statis( + quant=False, + max_examples=100, + passes=["int8_scale_calculation_mkldnn_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py index a864e2fe5a1c8..1781eb5048347 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py @@ -62,13 +62,13 @@ def generate_reshape2_Input(): "transpose2", inputs={"X": ["reshape2_output1"], }, outputs={ - "Out": ["transpose2_ouput"], + "Out": ["transpose2_output"], "XShape": ["transpose2_xshape"] }, axis=axis_v) reshape2_op2 = OpConfig( "reshape2", - inputs={"X": ["transpose2_ouput"], }, + inputs={"X": ["transpose2_output"], }, outputs={ "Out": ["reshape2_output2"], "XShape": ["reshape2_xshape2"] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py new file mode 100644 index 0000000000000..719e448856995 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py @@ -0,0 +1,85 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List + + +class TrtConvertArgMaxTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + input_shape = program_config.inputs["arg_max_input"].shape + axis = program_config.ops[0].attrs["axis"] + if axis < 0: + axis += len(input_shape) + if len(input_shape) <= axis or axis == 0: + return False + return True + + def sample_program_configs(self): + def generate_input(rank, batch): + dims = [batch] + for i in range(rank - 1): + dims.append((i + 1) * 8) + size = np.prod(dims) + return (np.arange(size) % 10 - 5).astype("float32").reshape(dims) + + for rank in [3, 4]: + for batch in [1, 4]: + for axis in [-1, 0, 1, 2, 3]: + for keepdims in [True, False]: + flatten = False + dtype = 2 + ops_config = [{ + "op_type": "arg_max", + "op_inputs": { + "X": ["arg_max_input"] + }, + "op_outputs": { + "Out": ["arg_max_out"] + }, + "op_attrs": { + "axis": axis, + "keepdims": keepdims, + "flatten": flatten, + "dtype": dtype + } + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "arg_max_input": TensorConfig(data_gen=partial( + generate_input, rank, batch)) + }, + outputs=["arg_max_out"]) + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + self.trt_param.workspace_size = 1024000 + yield self.create_inference_config(), [1, 2], 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index ec02a357a48b6..27d8247aded5a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -150,7 +150,7 @@ def generate_input(shape): for shape in [[4], [4, 32], [2, 64, 32], [1, 8, 16, 32]]: for op_type in [ "elementwise_add", "elementwise_mul", "elementwise_sub", - "elementwise_div" + "elementwise_div", "elementwise_pow" ]: for axis in [0, -1]: self.dims = len(shape) @@ -309,7 +309,7 @@ def generate_input(shape): input2_shape = input2_shape_list[j][i] for op_type in [ "elementwise_add", "elementwise_mul", "elementwise_sub", - "elementwise_div" + "elementwise_div", "elementwise_pow" ]: for axis in axis_list[j][i]: self.shape1 = input1_shape @@ -411,7 +411,7 @@ def generate_weight(): [batch, 32, 16, 32]]: for op_type in [ "elementwise_add", "elementwise_mul", "elementwise_sub", - "elementwise_div" + "elementwise_div", "elementwise_pow" ]: for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) @@ -511,18 +511,11 @@ def teller1(program_config, predictor_config): for weight_name in program_config.weights: if weight_name in input_x_names: return True - op_type = program_config.ops[0].type - if op_type in ["elementwise_sub", "elementwise_div"]: - input_y_names = program_config.ops[0].inputs["Y"] - for weight_name in program_config.weights: - if weight_name in input_y_names: - return True return False self.add_skip_case( teller1, SkipReasons.TRT_NOT_SUPPORT, - "Input X should not be parameters in 
elementwise op and Input Y should not be parameters in elementwise_sub or elementwise_div op" - ) + "Input X should not be parameters in elementwise op.") def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py index d607a43739eb7..75783450e86bf 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py @@ -42,7 +42,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]): for num_input in [0, 1]: for dims in [1, 2, 3, 4]: for batch in [1, 2]: - for scale in [0.1, 1.0]: + for scale in [0.1, -1.0]: for bias in [0.0, 1.2]: for bias_after_scale in [False, True]: self.num_input = num_input diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py index 04eb3ab10ba7a..8bc48047c1397 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py @@ -103,6 +103,9 @@ def generate_trt_nodes_num(attrs, dynamic_shape): for x in attrs[0]["axes"]: if x == 0: return 0, 3 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return 0, 3 return 1, 2 attrs = [ @@ -110,6 +113,12 @@ def generate_trt_nodes_num(attrs, dynamic_shape): for i in range(len(program_config.ops)) ] + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 @@ -118,3 +127,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): def test(self): self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py new file mode 100644 index 0000000000000..2abf0a1acda67 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py @@ -0,0 +1,132 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertActivationTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.ones([32]).astype(np.float32) + elif dims == 2: + return np.ones([3, 32]).astype(np.float32) + elif dims == 3: + return np.ones([3, 32, 32]).astype(np.float32) + else: + return np.ones([batch, 3, 32, 32]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + for batch in [1, 4]: + for op_type in ["exp", "log"]: + self.dims = dims + dics = [{}] + + ops_config = [{ + "op_type": op_type, + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial( + generate_input1, dims, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 16, 16] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py new file mode 100644 index 0000000000000..ece2d187fb9da --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box_head.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List, Dict, Any +import unittest + + +class TrtConvertYoloBoxHeadTest(TrtLayerAutoScanTest): + def sample_program_configs(self): + def generate_input(attrs: List[Dict[str, Any]], batch, shape): + gen_shape = shape.copy() + gen_shape.insert(0, batch) + return np.random.uniform(0, 1, gen_shape).astype("float32") + + input_shape = [[255, 19, 19], [255, 38, 38], [255, 76, 76]] + anchors = [[116, 90, 156, 198, 373, 326], [30, 61, 62, 45, 59, 119], + [10, 13, 16, 30, 33, 23]] + class_num = 80 + for batch in [1, 4]: + for i in range(len(anchors)): + attrs_dict = { + "anchors": anchors[i], + "class_num": class_num, + } + ops_config = [{ + "op_type": "yolo_box_head", + "op_inputs": { + "X": ["yolo_box_head_input"], + }, + "op_outputs": { + "Out": ["yolo_box_head_output"], + }, + "op_attrs": attrs_dict + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "yolo_box_head_input": TensorConfig(data_gen=partial( + generate_input, attrs_dict, batch, input_shape[i])) + }, + outputs=["yolo_box_head_output"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + # for static_shape + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), [1, 2], 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py b/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py new file mode 100644 index 0000000000000..2fb83fb039215 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_yolo_box_post.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +paddle.enable_static() + + +def yolo_box_post(box0, + box1, + box2, + im_shape, + im_scale, + anchors0=[116, 90, 156, 198, 373, 326], + anchors1=[30, 61, 62, 45, 59, 119], + anchors2=[10, 13, 16, 30, 33, 23], + class_num=80, + conf_thresh=0.005, + downsample_ratio0=32, + downsample_ratio1=16, + downsample_ratio2=8, + clip_bbox=True, + scale_x_y=1., + nms_threshold=0.45): + helper = LayerHelper('yolo_box_post', **locals()) + output = helper.create_variable_for_type_inference(dtype=box0.dtype) + nms_rois_num = helper.create_variable_for_type_inference(dtype='int32') + inputs = { + 'Boxes0': box0, + 'Boxes1': box1, + 'Boxes2': box2, + "ImageShape": im_shape, + "ImageScale": im_scale + } + outputs = {'Out': output, 'NmsRoisNum': nms_rois_num} + + helper.append_op( + type="yolo_box_post", + inputs=inputs, + attrs={ + 'anchors0': anchors0, + 'anchors1': anchors1, + 'anchors2': anchors2, + 'class_num': class_num, + 'conf_thresh': conf_thresh, + 'downsample_ratio0': downsample_ratio0, + 'downsample_ratio1': downsample_ratio1, + 'downsample_ratio2': downsample_ratio2, + 'clip_bbox': clip_bbox, + 'scale_x_y': scale_x_y, + 'nms_threshold': nms_threshold + }, + outputs=outputs) + output.stop_gradient = True + nms_rois_num.stop_gradient = True + return output, nms_rois_num + + +@unittest.skipIf(not paddle.is_compiled_with_cuda(), + "only support cuda kernel.") +class TestYoloBoxPost(unittest.TestCase): + def test_yolo_box_post(self): + place = paddle.CUDAPlace(0) + program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(program, startup_program): + box0 = paddle.static.data("box0", [1, 255, 19, 19]) + box1 = paddle.static.data("box1", [1, 255, 38, 38]) + box2 = paddle.static.data("box2", [1, 255, 76, 76]) + im_shape = paddle.static.data("im_shape", [1, 2]) + im_scale = paddle.static.data("im_scale", [1, 2]) + out, rois_num = yolo_box_post(box0, box1, box2, im_shape, im_scale) + exe = paddle.static.Executor(place) + exe.run(startup_program) + feed = { + "box0": np.random.uniform(size=[1, 255, 19, 19]).astype("float32"), + "box1": np.random.uniform(size=[1, 255, 38, 38]).astype("float32"), + "box2": np.random.uniform(size=[1, 255, 76, 76]).astype("float32"), + "im_shape": np.array([[608., 608.]], "float32"), + "im_scale": np.array([[1., 1.]], "float32") + } + outs = exe.run(program, feed=feed, fetch_list=[out.name, rois_num.name]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py index aae1cc65c9220..e92821387aed4 100644 --- a/python/paddle/fluid/tests/unittests/ir/pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py @@ -167,7 +167,7 @@ def check_output_with_place(self, place, startup_on_cpu=False, atol=1e-5): def _check_fused_ops(self, program): ''' - Check the number of specified fused op is equal to the the expected + Check the number of specified fused op is equal to the expected number. 
''' if self.fused_op_type is None or self.num_fused_ops < 0: diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py new file mode 100644 index 0000000000000..02fb890220431 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_yolo_box_pass.py @@ -0,0 +1,92 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +paddle.enable_static() + + +def multiclass_nms(bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + nms_threshold=0.3, + normalized=True, + nms_eta=1., + background_label=-1): + helper = LayerHelper('multiclass_nms3', **locals()) + output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) + index = helper.create_variable_for_type_inference(dtype='int32') + nms_rois_num = helper.create_variable_for_type_inference(dtype='int32') + inputs = {'BBoxes': bboxes, 'Scores': scores} + outputs = {'Out': output, 'Index': index, 'NmsRoisNum': nms_rois_num} + + helper.append_op( + type="multiclass_nms3", + inputs=inputs, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'nms_top_k': nms_top_k, + 'nms_threshold': nms_threshold, + 'keep_top_k': keep_top_k, + 'nms_eta': nms_eta, + 'normalized': normalized + }, + outputs=outputs) + output.stop_gradient = True + index.stop_gradient = True + + return output, index, nms_rois_num + + +class TestYoloBoxPass(unittest.TestCase): + def test_yolo_box_pass(self): + program = paddle.static.Program() + with paddle.static.program_guard(program): + im_shape = paddle.static.data("im_shape", [1, 2]) + im_scale = paddle.static.data("im_scale", [1, 2]) + yolo_box0_x = paddle.static.data("yolo_box0_x", [1, 255, 19, 19]) + yolo_box1_x = paddle.static.data("yolo_box1_x", [1, 255, 38, 38]) + yolo_box2_x = paddle.static.data("yolo_box2_x", [1, 255, 76, 76]) + div = paddle.divide(im_shape, im_scale) + cast = paddle.cast(div, "int32") + boxes0, scores0 = paddle.vision.ops.yolo_box( + yolo_box0_x, cast, [116, 90, 156, 198, 373, 326], 80, 0.005, 32) + boxes1, scores1 = paddle.vision.ops.yolo_box( + yolo_box1_x, cast, [30, 61, 62, 45, 59, 119], 80, 0.005, 16) + boxes2, scores2 = paddle.vision.ops.yolo_box( + yolo_box2_x, cast, [10, 13, 16, 30, 33, 23], 80, 0.005, 8) + transpose0 = paddle.transpose(scores0, [0, 2, 1]) + transpose1 = paddle.transpose(scores1, [0, 2, 1]) + transpose2 = paddle.transpose(scores2, [0, 2, 1]) + concat0 = paddle.concat([boxes0, boxes1, boxes2], 1) + concat1 = paddle.concat([transpose0, transpose1, transpose2], 2) + out0, out1, out2 = multiclass_nms(concat0, concat1, 0.01, 1000, 100, + 0.45, True, 1., 80) + graph = core.Graph(program.desc) + core.get_pass("yolo_box_fuse_pass").apply(graph) + graph = paddle.fluid.framework.IrGraph(graph) + op_nodes = graph.all_op_nodes() + for op_node in op_nodes: + 
op_type = op_node.op().type() + self.assertTrue(op_type in ["yolo_box_head", "yolo_box_post"]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py index 90614ccb3bc15..11b8858b6b195 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py @@ -20,6 +20,8 @@ import os from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.framework import _global_flags +from paddle.fluid.framework import _enable_legacy_dygraph +_enable_legacy_dygraph() def check(): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py index a0836c959c84b..fae52ab833b9d 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle class TestDeQuantizeOp(OpTest): @@ -110,19 +111,6 @@ def set_data_type(self): self.data_type = 'uint16' -class TestDeQuantizeOp_ZeroScale(TestDeQuantizeOp): - def set_scale(self): - self.scale = 0.0 - - def prepare_output_int8(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Dequantization scale cannot be 0.0') - - # 2-dim input # P - positive input, with shift class TestDeQuantizeOpShift_2_P(TestDeQuantizeOp): @@ -177,28 +165,6 @@ def set_input_size(self): self.input_size = [2, 3, 4, 5] -class TestDeQuantizeOp_NegativeShift(TestDeQuantizeOp): - def set_shift(self): - self.shift = -10.0 - - def prepare_output_int8(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Dequantization shift must be nonnegative.') - - -class TestDeQuantizeOp_TooBigShift(TestDeQuantizeOp_NegativeShift): - def set_shift(self): - self.shift = 300.0 - - def test_check_output(self): - self.assertRaises( - AttributeError, self.check_raise_error, - 'Dequantization shift must be less than or equal to 255.') - - if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py new file mode 100644 index 0000000000000..d729efbb0fb60 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool +import paddle + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestFillConstant2DOneDNNOp(OpTest): + def setUp(self): + self.op_type = "fill_constant" + self.dtype = np.float32 + + self.shape_tensor_list = None + self.shape_tensor = None + self.str_value = "" + real_shape = [] + self.value = 0.1 + + self.set_inputs() + self.set_attrs() + + if 'value' in self.attrs: + self.value = self.attrs['value'] + if self.str_value != "": + self.value = float(self.str_value) + if 'ValueTensor' in self.inputs: + self.value = self.inputs['ValueTensor'] + + if 'shape' in self.attrs: + real_shape = self.attrs['shape'] + if 'ShapeTensor' in self.inputs: + real_shape = list(self.inputs['ShapeTensor']) + if 'ShapeTensorList' in self.inputs: + real_shape = [] + for shape_tensor in self.inputs['ShapeTensorList']: + real_shape.append(shape_tensor[1].item()) + + self.outputs = {'Out': np.full(real_shape, self.value)} + + def set_inputs(self): + self.inputs = {} + + def set_attrs(self): + self.attrs = {'shape': (3, 5), 'use_mkldnn': True, 'value': self.value} + + def test_check_output(self): + self.check_output() + + +class TestFillZerosLike4DShapeTensorPriorityOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_inputs(self): + self.inputs = {'ShapeTensor': np.array([5, 6, 7, 8]).astype("int32")} + + +class TestFillZerosLike4DShapeTensorListPriorityOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_inputs(self): + shape = (4, 5, 6, 7) + self.shape_tensor_list = [] + for index, elem in enumerate(shape): + self.shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * elem)) + + self.inputs = {'ShapeTensorList': self.shape_tensor_list} + + +class TestFillZerosLike2DStringValueInfOneDNNOp(TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "inf" + self.attrs = {'shape': (10, 13), 'use_mkldnn': True, 'str_value': "inf"} + + +class TestFillZerosLike2DStringValueMinusInfOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "-inf" + self.attrs = { + 'shape': (10, 13), + 'use_mkldnn': True, + 'str_value': "-inf" + } + + +class TestFillZerosLike2DStringValueFloatOneDNNOp(TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "0.123" + self.attrs = { + 'shape': (10, 13), + 'use_mkldnn': True, + 'str_value': "0.123" + } + + +class TestFillZerosLike2DValueTensorPriorityOneDNNOp( + TestFillZerosLike2DStringValueFloatOneDNNOp): + def set_inputs(self): + self.inputs = {'ValueTensor': np.atleast_1d(2.25).astype("float32")} + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py index d13012ee33847..634288c3e875b 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_mkldnn_op.py @@ -519,43 +519,6 @@ def init_data_type(self): self.data_type_ = np.int8 -class TestMatMulOpTransposeReshapeTransposeAxisNotSupportedException( - TestMatMulOpTransposeReshapeBasicFloat): - def init_params_and_out(self): - self.transpose_out = [0, 1, 2, 3] - self.reshape_out = [0, 0, self.x.shape[1] * self.y.shape[-1]] - self.out = np.matmul(self.x, self.y) - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'supported transpose axis ' 
- 'for the fuse are {0, 2, 1, 3}') - - -class TestMatMulOpTransposeReshapeTransposeRankNotSupportedException( - TestMatMulOpTransposeReshapeBasicFloat): - def init_params_and_out(self): - self.transpose_out = [0, 2, 1] - self.reshape_out = [0, 0, self.x.shape[1] * self.y.shape[-1]] - self.out = np.matmul(self.x, self.y) - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'transpose_out supported rank is 4') - - -class TestMatMulOpTransposeReshapeRankOfReshapeNotSupportedException( - TestMatMulOpTransposeReshapeBasicFloat): - def init_params_and_out(self): - self.transpose_out = [0, 2, 1, 3] - self.reshape_out = [0, 0] - self.out = np.matmul(self.x, self.y) - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'reshape_out supported rank is 3') - - if __name__ == "__main__": from paddle import enable_static enable_static() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 25701b797ec4a..69cee49c3ec61 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -26,14 +26,11 @@ from paddle.fluid.tests.unittests.mkldnn.test_matmul_mkldnn_op import ( TestMatMulOpTransposeReshapeEmptyFloat, TestMatMulOpTransposeReshapeBasicFloat, - TestMatMulOpTransposeReshapeOtherDimFloat, - TestMatMulOpTransposeReshapeTransposeAxisNotSupportedException, - TestMatMulOpTransposeReshapeTransposeRankNotSupportedException, - TestMatMulOpTransposeReshapeRankOfReshapeNotSupportedException, - TestReshapeTransposeMatMulOp, TestReshapeTransposeMatMulOp4DXFloat, - TestReshapeTransposeMatMulOp4DYFloat, TestReshapeTransposeMatMulOp4DXYFloat, - TestReshapeTransposeMatMulOp2DXFloat, TestReshapeTransposeMatMulOp2DYFloat, - TestReshapeTransposeMatMulOp3DXFloat, TestReshapeTransposeMatMulOp3DYFloat) + TestMatMulOpTransposeReshapeOtherDimFloat, TestReshapeTransposeMatMulOp, + TestReshapeTransposeMatMulOp4DXFloat, TestReshapeTransposeMatMulOp4DYFloat, + TestReshapeTransposeMatMulOp4DXYFloat, TestReshapeTransposeMatMulOp2DXFloat, + TestReshapeTransposeMatMulOp2DYFloat, TestReshapeTransposeMatMulOp3DXFloat, + TestReshapeTransposeMatMulOp3DYFloat) def reference_matmul(X, Y, transpose_x=False, transpose_y=False): @@ -67,6 +64,8 @@ def config(self): self.y_shape = (100, ) self.trans_x = False self.trans_y = False + self._cpu_only = True + self.use_mkldnn = True def set_inputs(self, x, y): self.inputs = {'X': x, 'Y': y} @@ -455,24 +454,6 @@ def set_op_type(self): self.op_type = "matmul_v2" -class TestMatMulV2OpTransposeReshapeTransposeAxisNotSupportedException( - TestMatMulOpTransposeReshapeTransposeAxisNotSupportedException): - def set_op_type(self): - self.op_type = "matmul_v2" - - -class TestMatMulV2OpTransposeReshapeRankOfReshapeNotSupportedException( - TestMatMulOpTransposeReshapeRankOfReshapeNotSupportedException): - def set_op_type(self): - self.op_type = "matmul_v2" - - -class TestMatMulV2OpTransposeReshapeTransposeRankNotSupportedException( - TestMatMulOpTransposeReshapeTransposeRankNotSupportedException): - def set_op_type(self): - self.op_type = "matmul_v2" - - class TestMatMulV2OpReshapeTranspose(TestReshapeTransposeMatMulOp): def set_op_type_and_transpose_y_name(self): self.op_type = "matmul_v2" diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py 
b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py index a7acc5f3f9bf3..c92d870565fbc 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from paddle.fluid.tests.unittests.op_test import OpTest +import paddle class TestQuantizeOp(OpTest): @@ -104,19 +105,6 @@ def set_is_negative(self): self.is_nagative = False -class TestQuantizeOp_ZeroScale(TestQuantizeOp): - def set_scale(self): - self.scale = 0.0 - - def prepare_output(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Quantization scale cannot be 0.0') - - # 2-dim input # P - positive input class TestQuantizeOpShift_NCHW_2_P(TestQuantizeOp): @@ -201,34 +189,6 @@ def set_output_format(self): self.output_format = 'NHWC' -class TestQuantizeOp_NegativeShift(TestQuantizeOp): - def set_is_negative(self): - self.is_nagative = False - - def set_scale(self): - self.scale = 100.0 - - def set_shift(self): - self.shift = -10.0 - - def prepare_output(self): - self.output = np.zeros(self.input_size) - self.outputs = {'Output': self.output} - - def test_check_output(self): - self.assertRaises(AttributeError, self.check_raise_error, - 'Quantization shift must be nonnegative.') - - -class TestQuantizeOp_TooBigShift(TestQuantizeOp_NegativeShift): - def set_shift(self): - self.shift = 300.0 - - def test_check_output(self): - self.assertRaises( - AttributeError, self.check_raise_error, - 'Quantization shift must be less than or equal to 255.') - - if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index 1da2fb8b14f75..229a2c1792c25 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -7,12 +7,14 @@ if (WITH_MLU) foreach(TEST_OP ${TEST_DIST_OPS}) LIST(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach(TEST_OP) + LIST(REMOVE_ITEM TEST_OPS "test_spawn_mlu") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) if(WITH_CNCL) + LIST(APPEND TEST_DIST_OPS "test_spawn_mlu") foreach(TEST_OP ${TEST_DIST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py new file mode 100644 index 0000000000000..f30a391f65385 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py @@ -0,0 +1,303 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
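The MLU Adam tests in this file check their outputs against adam_step imported from test_adam_op. As a rough reference only, the conventional Adam update such a helper is expected to compute looks like the NumPy sketch below; the function name and argument order here are illustrative, not the helper's actual signature.

    import numpy as np

    def adam_reference(param, grad, m1, m2, lr, beta1, beta2,
                       beta1_pow, beta2_pow, epsilon):
        # biased first/second moment estimates
        m1_out = beta1 * m1 + (1.0 - beta1) * grad
        m2_out = beta2 * m2 + (1.0 - beta2) * np.square(grad)
        # bias-corrected step size, then the parameter update
        lr_t = lr * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow)
        param_out = param - lr_t * m1_out / (np.sqrt(m2_out) + epsilon)
        return param_out, m1_out, m2_out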
+ +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from test_adam_op import adam_step + +paddle.enable_static() +SEED = 2022 + + +class TestAdam(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamWithEpsilonTensor(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + } + + self.attrs = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithSkipUpdate(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = 
np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithGlobalBetaPow(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + } + + attributes = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.attrs = {'use_global_beta_pow': True} + + # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty. 
+ self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([]), + 'Beta2PowOut': np.array([]) + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = fluid.optimizer.Adam(learning_rate=0.01) + adam.minimize(loss) + + if run_mlu: + place = paddle.device.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + mlu_pred, mlu_loss = self._test(True) + cpu_pred, cpu_loss = self._test(False) + self.assertTrue(np.allclose(mlu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss, rtol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py new file mode 100644 index 0000000000000..d2827725a2058 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py @@ -0,0 +1,250 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
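The AdamW tests in this file likewise compare against adamw_step from test_adam_op. A minimal sketch of decoupled weight decay in its usual formulation follows; whether the decay is applied before or after the moment update is an assumption here, not taken from the helper.

    def apply_decoupled_weight_decay(param, lr, coeff, with_decay=True):
        # AdamW shrinks the parameter by lr * coeff independently of the
        # adaptive moment estimates; the plain Adam step then follows
        return param * (1.0 - lr * coeff) if with_decay else param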
+ +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from test_adam_op import adamw_step + +paddle.enable_static() +SEED = 2022 + + +class TestAdamW(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (105, 102)).astype("float32") + grad = np.random.uniform(-1, 1, (105, 102)).astype("float32") + moment1 = np.random.uniform(-1, 1, (105, 102)).astype("float32") + # The second moment is positive + moment2 = np.random.random((105, 102)).astype("float32") + + learning_rate = 0.5 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "coeff": 0.9, + "with_decay": True + } + + param_out, moment1_out, \ + moment2_out = adamw_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithSkipUpdate(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon, "coeff": 0.02, "with_decay": True} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithoutDecay(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = 
np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon, "coeff": 0.02, "with_decay": False} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = paddle.optimizer.AdamW(learning_rate=0.01, weight_decay=0.02) + adam.minimize(loss) + + if run_mlu: + place = paddle.device.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + mlu_pred, mlu_loss = self._test(True) + cpu_pred, cpu_loss = self._test(False) + self.assertTrue(np.allclose(mlu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss, rtol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py new file mode 100644 index 0000000000000..85302ad76da8b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py @@ -0,0 +1,52 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +class TestAssign(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "assign" + self.init_dtype() + + x = np.random.random([3, 3]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + self.outputs = {'Out': x} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py new file mode 100644 index 0000000000000..5ee9d369e0fd9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
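The assign_value tests that follow flatten a constant NumPy array into a dtype-specific values attribute alongside its shape. A minimal sketch of that packing is shown below; the attribute keys mirror the ones used in the test, while the helper itself is illustrative.

    import numpy as np

    def pack_assign_value_attrs(value):
        # pick the attribute key that matches the array's dtype
        key = {
            np.dtype("float32"): "fp32_values",
            np.dtype("int32"): "int32_values",
            np.dtype("int64"): "int64_values",
            np.dtype("bool"): "bool_values",
        }[value.dtype]
        cast = float if key == "fp32_values" else int
        return {"shape": value.shape, key: [cast(v) for v in value.flat]}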
+ +from __future__ import print_function + +import unittest +import numpy +import sys +sys.path.append("..") + +import op_test +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers + +paddle.enable_static() +numpy.random.seed(2022) + + +class TestAssignValueMLUOp(op_test.OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "assign_value" + self.inputs = {} + self.attrs = {} + self.init_data() + + self.attrs["shape"] = self.value.shape + self.attrs["dtype"] = framework.convert_np_dtype_to_dtype_( + self.value.dtype) + self.outputs = {"Out": self.value} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.float32) + self.attrs["fp32_values"] = [float(v) for v in self.value.flat] + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestAssignValueMLUOp2(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.attrs["int32_values"] = [int(v) for v in self.value.flat] + + +class TestAssignValueMLUOp3(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.attrs["int64_values"] = [int(v) for v in self.value.flat] + + +class TestAssignValueMLUOp4(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.choice( + a=[False, True], size=(2, 5)).astype(numpy.bool) + self.attrs["bool_values"] = [int(v) for v in self.value.flat] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py new file mode 100644 index 0000000000000..854ac0b6826cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
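The coalesce_tensor tests below fuse several variables into one contiguous buffer: each input is flattened, padded out to an aligned length, and the padded chunks are concatenated into FusedOutput. A schematic sketch of that layout is given here; the round-up-to-multiple padding rule is an assumption for illustration, and the test computes its own aligned length.

    import numpy as np

    def fuse_chunks(arrays, alignment=256):
        chunks = []
        for a in arrays:
            flat = a.ravel()
            # pad each flattened chunk so the next one starts on an aligned offset
            padded_len = int(np.ceil(flat.size / alignment)) * alignment
            chunk = np.zeros(padded_len, dtype=flat.dtype)
            chunk[:flat.size] = flat
            chunks.append(chunk)
        return np.concatenate(chunks)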
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest +from paddle.fluid import core +import paddle + +alignment = 256 +paddle.enable_static() + + +class TestAllocContinuousSpace(OpTest): + def setUp(self): + self.op_type = "coalesce_tensor" + self.dtype, self.fluid_dtype = self.init_dtype() + attrs = self.init_attr() + self.copy_data = attrs["copy_data"] + self.constant = attrs["constant"] + self.set_constant = attrs["set_constant"] + self.Inputs = self.init_input() + self.Outputs, self.FusedOutput = self.init_output( + self.Inputs, self.set_constant, self.constant) + self.inputs = {'Input': self.Inputs} + self.attrs = attrs + self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput} + + def init_dtype(self): + return np.float32, int(core.VarDesc.VarType.FP32) + + def init_input(self): + inputs = [] + inputs.append(("x1", np.random.random([20, 3]).astype(self.dtype))) + inputs.append(("x2", np.random.random([20]).astype(self.dtype))) + inputs.append(("x3", np.random.random([1]).astype(self.dtype))) + inputs.append(("x4", np.random.random([200, 30]).astype(self.dtype))) + inputs.append(("x5", np.random.random([30]).astype(self.dtype))) + inputs.append(("x6", np.random.random([1]).astype(self.dtype))) + return inputs + + def init_attr(self): + return { + "copy_data": True, + "set_constant": False, + "constant": 0.0, + "dtype": self.fluid_dtype + } + + def init_output(self, input_list, set_constant, constant): + inputs = [] + outputs = input_list + + for input in input_list: + length = len(input[1].flatten()) + aligned_len = (length + alignment) / alignment * alignment + out = np.zeros(int(aligned_len)) + out[0:length] = input[1].flatten() + inputs.append(out) + + coalesce_tensor_var = np.concatenate([input for input in inputs]) + if set_constant: + coalesce_tensor_var = np.ones((len(coalesce_tensor_var))) * constant + outputs = [(out[0], + np.ones(out[1].shape).astype(self.dtype) * constant) + for out in outputs] + return outputs, coalesce_tensor_var + + def test_check_output(self): + self.check_output_with_place( + place=paddle.device.MLUPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) + + +class TestAllocContinuousSpace2(TestAllocContinuousSpace): + def init_attr(self): + return { + "copy_data": False, + "set_constant": True, + "constant": 5, + "dtype": self.fluid_dtype, + "user_defined_size_of_dtype": 2 + } + + def test_check_output(self): + self.check_output_with_place( + place=paddle.device.MLUPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py new file mode 100644 index 0000000000000..8b32692020cbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py @@ -0,0 +1,309 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from operator import mul +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.nn.functional as F +from functools import reduce +import sys +sys.path.append('..') +from op_test import _set_use_system_allocator +from paddle.fluid import Program, program_guard +from paddle.fluid.contrib.mixed_precision.fp16_utils import _keep_layer_norm_scale_bias_to_fp32 +from test_layer_norm_op import _reference_layer_norm_naive, _reference_layer_norm_grad + +paddle.enable_static() + +np.random.random(123) + +_set_use_system_allocator(True) + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + self.use_cudnn = True + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def check_forward_backward(self, + shape, + begin_norm_axis, + has_scale=True, + has_bias=True, + y_grad_scale=1.0, + use_mkldnn=False): + def test_with_place(place, + shape, + begin_norm_axis, + use_mkldnn=use_mkldnn): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype( + np.float32) if has_scale else None + bias = np.random.random_sample(scale_shape).astype( + np.float32) if has_bias else None + y_grad = (np.random.random_sample(x_shape) * + y_grad_scale).astype(np.float32) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, begin_norm_axis) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, bias, mean, variance, begin_norm_axis) + + var_dict = locals() + var_dict['y@GRAD'] = y_grad + var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD'] + if has_scale: + var_names += ['scale'] + if has_bias: + var_names += ['bias'] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + inputs = {"X": block.var('x')} + fetch_list = [ + 'y', + 'mean', + 'variance', + 'x@GRAD', + ] + if has_scale: + inputs["Scale"] = block.var('scale') + fetch_list += ['scale@GRAD'] + if has_bias: + inputs["Bias"] = block.var('bias') + fetch_list += ['bias@GRAD'] + layer_norm_op = block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var('y'), + "Mean": block.var('mean'), # share the same memory + "Variance": + block.var('variance'), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn + }) + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = 
block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + program._sync_with_cpp() + exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in ['x', 'scale', 'bias', 'y@GRAD'] + }, + fetch_list=fetch_list) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(1 / np.sqrt(variance), out[2], "variance", + 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + if has_scale: + self.__assert_close(scale_grad, + out[fetch_list.index('scale@GRAD')], + "scale_grad", 1e-3) + if has_bias: + self.__assert_close(bias_grad, + out[fetch_list.index('bias@GRAD')], + "bias_grad") + + test_with_place(self.place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=False, + has_bias=True) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=True, + has_bias=False) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=False, + has_bias=False) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + self.check_forward_backward( + shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1) + self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2) + self.check_forward_backward( + shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=True, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=True, + has_bias=False, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=False, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True) + + +class TestLayerNormAPI(unittest.TestCase): + def test_case(self): + x = fluid.layers.data( + name='x', + shape=[64, 32, 256], + dtype='float32', + append_batch_size=False) + x = fluid.layers.layer_norm( + x, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None) + x = fluid.layers.layer_norm( + x, + scale=False, + shift=False, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None) + x = fluid.layers.layer_norm( + x, + scale=False, + shift=False, + begin_norm_axis=1, + epsilon=1e-05, + param_attr="scale", + bias_attr="shift") + + +class TestDygraphLayerNormAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + paddle.enable_static() + + layer_norm = fluid.LayerNorm([32, 32]) + # the input of LayerNorm must be Variable. 
+ x1 = np.random.random((3, 32, 32)).astype('float32') + self.assertRaises(TypeError, layer_norm, x1) + + # the input dtype of LayerNorm must be float32 or float16 + x2 = fluid.layers.data(name='x2', shape=[3, 32, 32], dtype="int32") + self.assertRaises(TypeError, layer_norm, x2) + + +class TestFP16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + weight_np = weight_np.astype(dtype) + bias_np = bias_np.astype(dtype) + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + y_np = y.numpy().astype('float32') + x_g_np = x_g.numpy().astype('float32') + w_g_np = w_g.numpy().astype('float16') + b_g_np = b_g.numpy().astype('float32') + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + x_np = np.random.random([10, 20]).astype('float16') + weight_np = np.random.random([20]).astype('float16') + bias_np = np.random.random([20]).astype('float16') + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, 'float16') + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, 'float32') + + def assert_equal(x, y): + self.assertTrue(np.array_equal(x, y)) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + +class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): + def test_main(self): + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(False) + self.assertFalse(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(True) + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py new file mode 100644 index 0000000000000..f9a08ba4c9b14 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
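The lookup_table_v2 tests in this file build their expected output as a plain gather from the embedding table, with the rows selected by padding_idx zeroed out. A self-contained NumPy sketch of that reference behaviour follows; the names are illustrative.

    import numpy as np

    def embedding_lookup(weight, ids, padding_idx=-1):
        # gather one embedding row per id
        out = weight[ids]
        if padding_idx != -1:
            # positions holding the padding id map to an all-zero vector
            out[ids == padding_idx] = 0.0
        return out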
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2022 + + +class TestLookupTableV2(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "lookup_table_v2" + + self.init_dtype() + self.init_dims() + self.init_padding_idx() + np.random.seed(SEED) + w = np.random.random([self.vocab, self.dim]).astype(self.dtype) + x = np.random.randint( + 0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype) + out = w[x] + if self.padding_idx != -1: + out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim) + + self.inputs = { + 'W': OpTest.np_dtype_to_fluid_dtype(w), + 'Ids': OpTest.np_dtype_to_fluid_dtype(x) + } + self.attrs = { + 'is_sparse': False, + 'is_distributed': False, + 'remote_prefetch': False, + 'padding_idx': self.padding_idx + } + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + self.ids_dtype = np.int32 + + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + # embedding_dim is not multiple of 32 + self.dim = 20 + + def init_padding_idx(self): + self.padding_idx = -1 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['W'], 'Out', max_relative_error=0.01) + else: + self.check_grad_with_place(self.place, ['W'], 'Out') + + +class TestLookupTableV2FP16(TestLookupTableV2): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + self.ids_dtype = np.int32 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + +class TestLookupTableV2Dim32(TestLookupTableV2): + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + # embedding_dim is multiple of 32 + self.dim = 64 + + +class TestLookupTableV2Dim32FP16(TestLookupTableV2): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + self.ids_dtype = np.int64 + + def init_dims(self): + self.bsz = 6 + self.seqlen = 8 + self.vocab = 10 + self.dim = 64 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + +class TestLookupTableV2WithPadding(TestLookupTableV2): + def init_padding_idx(self): + self.padding_idx = np.random.randint(0, self.vocab) + + +class TestLookupTableV2WithPadding1(TestLookupTableV2): + def init_padding_idx(self): + self.padding_idx = np.random.randint(0, self.vocab) + + def init_dtype(self): + self.dtype = np.float32 + self.ids_dtype = np.int64 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py new file mode 100644 index 0000000000000..44532ddceb765 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py @@ -0,0 +1,631 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +import sys +sys.path.append('..') +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle + +paddle.enable_static() + + +# Situation 1: starts(list, no tensor), ends(list, no tensor) +# 1.1 without attr(decrease) +class TestSliceOp(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + +class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, :, 2:-1] + + +# 1.2 with attr(decrease) +class TestSliceOp_decs_dim(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") 
+ self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + +class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype("float32") + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + +class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +# Situation 2: starts(list, have tensor), ends(list, no tensor) +# without attr(decrease) +class TestSliceOp_starts_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int64') * ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, 1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + self.starts_infer = [-1, 0, -1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +# Situation 2: starts(list, have tensor), ends(list, no tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, -1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + self.starts_infer = [1, -1, 2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class 
TestSliceOp_decs_dim_5_starts_ListTensor( + TestSliceOp_decs_dim_starts_ListTensor): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [-1] + self.out = self.input[:, :, :, -1] + + self.starts_infer = [-1] + + +# Situation 3: starts(tensor), ends(list, no tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 4: starts(tensor), ends(tensor) +# without attr(decrease) +class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int64"), + "EndsTensor": np.array( + self.ends, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends_infer, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 5: starts(tensor), ends(tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int32"), + "EndsTensor": np.array( + self.ends, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 6: starts(tensor), ends(list, have tensor) +# without 
attr(decrease) +class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + + ends_tensor = [] + for index, ele in enumerate(self.ends): + ends_tensor.append(("y" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int32"), + 'EndsTensorList': ends_tensor + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends_infer, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + self.ends_infer = [-1, 3, 4] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Test float16 +class TestFP16(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.out = self.input[-3:3, 0:100, :, 2:-1] + self.infer_flags = [1, 1, 1] + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +class TestFP16_2(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 10]).astype(self.dtype) + self.starts = [0] + self.ends = [1] + self.axes = [1] + self.out = self.input[:, 0:1, :] + self.infer_flags = [1] + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + max_relative_error=0.006, + numeric_grad_delta=0.5) + + +class TestSliceApiWithTensor(unittest.TestCase): + def test_starts_ends_is_tensor(self): + with paddle.fluid.dygraph.guard(): + a = paddle.rand(shape=[4, 5, 6], dtype='float32') + axes = [0, 1, 2] + starts = [-3, 0, 2] + ends = [3, 2, 4] + a_1 = paddle.slice( + a, + axes=axes, + starts=paddle.to_tensor( + starts, dtype='int32'), + ends=paddle.to_tensor( + ends, dtype='int32')) + a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends) + + self.assertTrue(np.array_equal(a_1.numpy(), a_2.numpy())) + + def test_bool_tensor(self): + with paddle.fluid.dygraph.guard(): + array = (np.arange(60).reshape([3, 4, 5]) % 3).astype('bool') + tt = paddle.to_tensor(array) + tt.stop_gradient = False + + starts = [0, 1, 2] + ends = [3, 5, 4] + axes = [0, 1, 2] + + 
y_paddle = paddle.slice(tt, axes, starts, ends) + y_np = tt[0:3, 1:5, 2:4] + + self.assertTrue(paddle.bool == y_paddle.dtype) + self.assertTrue(np.array_equal(y_paddle.numpy(), y_np)) + + +class TestImperativeVarBaseGetItem(unittest.TestCase): + def test_getitem_with_long(self): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = fluid.dygraph.to_variable(data) + sliced = var[:, 10:, :var.shape[1]] # var.shape[1] is 80L here + self.assertEqual(sliced.shape, [2, 70, 80]) + + sliced = var[:, var.shape[0]:, var.shape[0]:var.shape[1]] + self.assertEqual(sliced.shape, [2, 78, 78]) + + def test_getitem_with_float(self): + def test_float_in_slice_item(): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = fluid.dygraph.to_variable(data) + sliced = var[:, 1.1:, :var.shape[1]] + + self.assertRaises(Exception, test_float_in_slice_item) + + def test_float_in_index(): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = fluid.dygraph.to_variable(data) + sliced = var[1.1] + + self.assertRaises(Exception, test_float_in_index) + + +class TestInferShape(unittest.TestCase): + def test(self): + x = paddle.ones(shape=[3, 4, 5]) + x.desc.set_shape([3, -1, 5]) + self.assertEqual(x.shape, (3, -1, 5)) + + out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) + self.assertEqual(out0.shape, (3, 3, 5)) + + def test_axis_less_than_zero(self): + + # Using paddle.disable_static will make other unittests fail. + with fluid.dygraph.guard(): + x_arr = np.arange(0, 24, dtype=np.float32).reshape([2, 3, 4]) + x = paddle.to_tensor(x_arr) + + pp_slice = paddle.slice(x, [100, ], [0], [1]) + np_slice = x_arr[:, :, 0:1] + self.assertTrue(np.array_equal(pp_slice, np_slice)) + + pp_slice = paddle.slice(x, (-100, ), [0], [1]) + np_slice = x_arr[0:1] + self.assertTrue(np.array_equal(pp_slice, np_slice)) + + x_arr = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0))) + + starts = paddle.to_tensor( + np.reshape( + np.array( + [], dtype=np.int32), (0, ))) + ends = paddle.to_tensor( + np.reshape( + np.array( + [], dtype=np.int32), (0, ))) + + with self.assertRaises(ValueError): + paddle.slice(x, [-1000000], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, [1000000], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, [], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, 0, starts, ends) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py new file mode 100644 index 0000000000000..773063c7a8ac9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
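Reference note for the slice tests above: every expected output is a plain numpy slice, and the sketch below is illustrative only (the helper name slice_reference and the standalone example are not part of this patch). It shows how the op attributes axes/starts/ends/decrease_axis map onto that indexing, assuming starts and ends are already plain Python ints and ignoring infer_flags.

import numpy as np

def slice_reference(x, axes, starts, ends, decrease_axis=None):
    # build a full-dimensional slice, then narrow only the named axes
    slices = [slice(None)] * x.ndim
    for axis, start, end in zip(axes, starts, ends):
        slices[axis] = slice(start, end)
    out = x[tuple(slices)]
    if decrease_axis:
        # decrease_axis drops the size-1 dimensions left by single-element slices
        out = np.squeeze(out, axis=tuple(decrease_axis))
    return out

x = np.random.random([3, 4, 5, 6]).astype("float32")
# mirrors TestSliceOp: axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4]
assert np.array_equal(slice_reference(x, [0, 1, 2], [1, 0, 2], [3, 3, 4]),
                      x[1:3, 0:3, 2:4, :])
# mirrors TestSliceOp_decs_dim: decrease_axis=[0] removes the leading dimension
assert np.array_equal(slice_reference(x, [0, 1, 2], [1, 0, 2], [2, 3, 4], [0]),
                      x[1, 0:3, 2:4, :])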
+ +from __future__ import print_function +import unittest +import os + +import paddle +import paddle.nn as nn +import paddle.optimizer as opt +import paddle.distributed as dist +from paddle.distributed.spawn import _get_subprocess_env_list, _options_valid_check, _get_default_nprocs +from paddle.fluid import core + + +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + +def train(print_result=False): + # 1. initialize parallel environment + dist.init_parallel_env() + + # 2. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters()) + + # 3. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + if print_result is True: + print("Rank:", int(os.getenv("PADDLE_TRAINER_ID"))) + + loss.backward() + adam.step() + adam.clear_grad() + + return int(os.getenv("PADDLE_TRAINER_ID")) + + +class TestSpawn(unittest.TestCase): + def test_nprocs_greater_than_device_num_error(self): + with self.assertRaises(RuntimeError): + _get_subprocess_env_list(nprocs=100, options=dict()) + + def test_selected_devices_error(self): + with self.assertRaises(ValueError): + options = dict() + options['selected_devices'] = "100,101" + _get_subprocess_env_list(nprocs=2, options=options) + + def test_get_correct_env(self): + options = dict() + options['print_config'] = True + env_dict = _get_subprocess_env_list(nprocs=1, options=options)[0] + self.assertEqual(env_dict['PADDLE_TRAINER_ID'], '0') + self.assertEqual(env_dict['PADDLE_TRAINERS_NUM'], '1') + + def test_nprocs_not_equal_to_selected_devices(self): + with self.assertRaises(ValueError): + options = dict() + options['selected_devices'] = "100,101,102" + _get_subprocess_env_list(nprocs=2, options=options) + + def test_options_valid_check(self): + options = dict() + options['selected_devices'] = "100,101,102" + _options_valid_check(options) + + with self.assertRaises(ValueError): + options['error'] = "error" + _options_valid_check(options) + + def test_get_default_nprocs(self): + paddle.set_device('mlu') + nprocs = _get_default_nprocs() + self.assertEqual(nprocs, core.get_mlu_device_count()) + + def test_spawn(self): + context = dist.spawn(train, backend='cncl', nprocs=4) + rank_list = [] + for i in range(4): + rank_list.append(context.return_queues[i].get()) + rank_list.sort() + self.assertEqual(rank_list, list(range(4))) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py new file mode 100644 index 0000000000000..a75a6aa1dfcb9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_unstack_op_mlu.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import unittest +import paddle + +paddle.enable_static() + + +class TestUnStackOpBase(OpTest): + def initDefaultParameters(self): + self.input_dim = (5, 6, 7) + self.axis = 0 + + def initParameters(self): + pass + + def get_y_names(self): + y_names = [] + for i in range(self.input_dim[self.axis]): + y_names.append('y{}'.format(i)) + return y_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'unstack' + self.set_mlu() + self.init_dtype() + + self.x = np.random.random(size=self.input_dim).astype(self.dtype) + + outs = np.split(self.x, self.input_dim[self.axis], self.axis) + new_shape = list(self.input_dim) + del new_shape[self.axis] + y_names = self.get_y_names() + tmp = [] + for i in range(self.input_dim[self.axis]): + tmp.append((y_names[i], np.reshape(outs[i], new_shape))) + + self.inputs = {'X': self.x} + self.outputs = {'Y': tmp} + self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], self.get_y_names()) + + +class TestStackOp3(TestUnStackOpBase): + def initParameters(self): + self.axis = -1 + + +class TestStackOp4(TestUnStackOpBase): + def initParameters(self): + self.axis = -3 + + +class TestStackOp5(TestUnStackOpBase): + def initParameters(self): + self.axis = 1 + + +class TestStackOp6(TestUnStackOpBase): + def initParameters(self): + self.axis = 2 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py new file mode 100644 index 0000000000000..28833a7dc1dcc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_multinomial_op_npu.py @@ -0,0 +1,235 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
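Reference note for the multinomial test below: the checks are statistical rather than exact. The test normalizes the input weights, draws a large number of samples, converts the drawn indices into empirical frequencies with np.unique(return_counts=True), and asserts that those frequencies stay within atol=0.01 of the true probabilities. A numpy-only sketch of that idea follows; np.random.choice is used here purely as a stand-in for paddle.multinomial and is not what the op under test runs.

import numpy as np

probs = np.random.rand(4)
probs /= probs.sum()  # normalize, as verify_output does with the op input

num_samples = 100000
samples = np.random.choice(len(probs), size=num_samples, p=probs, replace=True)

# count how often each category was drawn and convert counts to frequencies
categories, counts = np.unique(samples, return_counts=True)
sample_prob = np.zeros_like(probs)
sample_prob[categories] = counts
sample_prob /= sample_prob.sum()

# with 1e5 draws the empirical distribution sits well within atol=0.01
assert np.allclose(sample_prob, probs, rtol=0, atol=0.01)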
+ +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +import sys +sys.path.append("..") +from op_test import OpTest +import numpy as np +import os + +paddle.enable_static() + + +def sample_output_one_dimension(out, dim): + # count numbers of different categories + sample_prob = np.zeros(dim).astype("float32") + sample_index_prob = np.unique(out, return_counts=True) + sample_prob[sample_index_prob[0]] = sample_index_prob[1] + sample_prob /= sample_prob.sum() + return sample_prob + + +def sample_output_two_dimension(out, shape): + num_dist = shape[0] + out_list = np.split(out, num_dist, axis=0) + sample_prob = np.zeros(shape).astype("float32") + for i in range(num_dist): + sample_index_prob = np.unique(out_list[i], return_counts=True) + sample_prob[i][sample_index_prob[0]] = sample_index_prob[1] + sample_prob /= sample_prob.sum(axis=-1, keepdims=True) + return sample_prob + + +class TestMultinomialOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "multinomial" + self.init_data() + self.inputs = {"X": self.input_np} + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_data(self): + # input probability is a vector, and replacement is True + self.input_np = np.random.rand(4) + self.outputs = {"Out": np.zeros(100000).astype("int64")} + self.attrs = {"num_samples": 100000, "replacement": True} + + def test_check_output(self): + self.check_output_customized( + self.verify_output, custom_place=self.place) + + def sample_output(self, out): + return sample_output_one_dimension(out, 4) + + def verify_output(self, outs): + # normalize the input to get the probability + prob = self.input_np / self.input_np.sum(axis=-1, keepdims=True) + sample_prob = self.sample_output(np.array(outs[0])) + self.assertTrue( + np.allclose( + sample_prob, prob, rtol=0, atol=0.01), + "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + + +class TestMultinomialOp2(TestMultinomialOp): + def init_data(self): + # input probability is a matrix + self.input_np = np.random.rand(3, 4) + self.outputs = {"Out": np.zeros((3, 100000)).astype("int64")} + self.attrs = {"num_samples": 100000, "replacement": True} + + def sample_output(self, out): + return sample_output_two_dimension(out, [3, 4]) + + +class TestMultinomialOp3(TestMultinomialOp): + def init_data(self): + # replacement is False. number of samples must be less than number of categories. + self.input_np = np.random.rand(1000) + self.outputs = {"Out": np.zeros(100).astype("int64")} + self.attrs = {"num_samples": 100, "replacement": False} + + def verify_output(self, outs): + out = np.array(outs[0]) + unique_out = np.unique(out) + self.assertEqual( + len(unique_out), 100, + "replacement is False. 
categories can't be sampled repeatedly") + + +class TestMultinomialApi(unittest.TestCase): + def test_dygraph(self): + # input probability is a vector, and replacement is True + paddle.set_device('npu:0') + paddle.disable_static() + x_numpy = np.random.rand(4) + x = paddle.to_tensor(x_numpy) + out = paddle.multinomial(x, num_samples=100000, replacement=True) + + sample_prob = sample_output_one_dimension(out.numpy(), 4) + prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True) + self.assertTrue( + np.allclose( + sample_prob, prob, rtol=0, atol=0.01), + "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + paddle.enable_static() + + def test_dygraph2(self): + # input probability is a matrix, and replacement is True + paddle.set_device('npu:0') + paddle.disable_static() + x_numpy = np.random.rand(3, 4) + x = paddle.to_tensor(x_numpy) + out = paddle.multinomial(x, num_samples=100000, replacement=True) + + sample_prob = sample_output_two_dimension(out.numpy(), [3, 4]) + prob = x_numpy / x_numpy.sum(axis=-1, keepdims=True) + self.assertTrue( + np.allclose( + sample_prob, prob, rtol=0, atol=0.01), + "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + paddle.enable_static() + + def test_dygraph3(self): + # replacement is False. number of samples must be less than number of categories. + paddle.set_device('npu:0') + paddle.disable_static() + x_numpy = np.random.rand(1000) + x = paddle.to_tensor(x_numpy) + out = paddle.multinomial(x, num_samples=100, replacement=False) + + unique_out = np.unique(out.numpy()) + self.assertEqual( + len(unique_out), 100, + "replacement is False. categories can't be sampled repeatedly") + paddle.enable_static() + + def test_dygraph4(self): + paddle.set_device('npu:0') + paddle.disable_static() + logits = -1 * paddle.ones([2800]) + # Categorical.sample API will call multinomial op with replacement=True + cat = paddle.distribution.Categorical(logits.exp()) + cat.sample([1]) + paddle.enable_static() + + def test_static(self): + paddle.set_device('npu:0') + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + x = fluid.data('x', shape=[4], dtype='float32') + out = paddle.multinomial(x, num_samples=100000, replacement=True) + + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + + exe.run(startup_program) + x_np = np.random.rand(4).astype('float32') + out = exe.run(train_program, feed={'x': x_np}, fetch_list=[out]) + + sample_prob = sample_output_one_dimension(out, 4) + prob = x_np / x_np.sum(axis=-1, keepdims=True) + self.assertTrue( + np.allclose( + sample_prob, prob, rtol=0, atol=0.01), + "sample_prob: " + str(sample_prob) + "\nprob: " + str(prob)) + + +class TestMultinomialAlias(unittest.TestCase): + def test_alias(self): + paddle.set_device('npu:0') + x = paddle.rand([4]) + out1 = paddle.multinomial(x, num_samples=10, replacement=True) + out2 = paddle.tensor.multinomial(x, num_samples=10, replacement=True) + out3 = paddle.tensor.random.multinomial( + x, num_samples=10, replacement=True) + + +class TestMultinomialError(unittest.TestCase): + def setUp(self): + paddle.set_device('npu:0') + paddle.disable_static() + + def tearDown(self): + paddle.enable_static() + + def test_num_sample(self): + def test_num_sample_less_than_0(): + x = paddle.rand([4]) + out = paddle.multinomial(x, num_samples=-2) + + self.assertRaises(ValueError, test_num_sample_less_than_0) + + def test_input_probs_dim(self): + def test_dim_larger_than_2(): + x = paddle.rand([2, 3, 3]) + out = 
paddle.multinomial(x) + + self.assertRaises(ValueError, test_dim_larger_than_2) + + def test_dim_less_than_1(): + x_np = np.random.random([]) + x = paddle.to_tensor(x_np) + out = paddle.multinomial(x) + + self.assertRaises(ValueError, test_dim_less_than_1) + + with self.assertRaises(ValueError): + prob = paddle.rand([20, 1000]) + prob[1:0] = 0 + out = paddle.multinomial(prob) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_take_along_axis_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_take_along_axis_op_npu.py new file mode 100644 index 0000000000000..4aad02f7df06e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_take_along_axis_op_npu.py @@ -0,0 +1,130 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.framework import core + +paddle.enable_static() + + +@unittest.skip(reason="Skip unsupported ut, need paddle support cann 5.0.4+") +class TestTakeAlongAxisOp(OpTest): + def setUp(self): + self.set_npu() + self.init_data() + self.op_type = "take_along_axis" + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + self.target = np.take_along_axis(self.xnp, self.index, self.axis) + broadcast_shape_list = list(self.x_shape) + broadcast_shape_list[self.axis] = 1 + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index_broadcast, + } + self.attrs = {'Axis': self.axis} + self.outputs = {'Result': self.target} + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['Input'], 'Result') + + def init_data(self): + self.x_type = "float64" + self.x_shape = (5, 5, 5) + self.index_type = "int32" + self.index = np.array( + [[[1]], [[1]], [[2]], [[4]], [[3]]]).astype(self.index_type) + self.axis = 2 + self.axis_type = "int64" + + +class TestCase1(TestTakeAlongAxisOp): + def init_data(self): + self.x_type = "float64" + self.x_shape = (5, 5, 5) + self.index_type = "int32" + self.index = np.array([[[0, 1, 2, 1, 4]]]).astype(self.index_type) + self.axis = 0 + self.axis_type = "int64" + + +@unittest.skip(reason="Skip unsupported ut, need paddle support cann 5.0.4+") +class TestTakeAlongAxisAPI(unittest.TestCase): + def setUp(self): + np.random.seed(0) + self.shape = [3, 3] + self.index_shape = [1, 3] + self.index_np = np.array([[0, 1, 2]]).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + self.place = paddle.NPUPlace(0) + self.axis = 0 + + def test_api_static(self): + paddle.enable_static() + with
paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.shape) + index = paddle.fluid.data('Index', self.index_shape, "int64") + out = paddle.take_along_axis(x, index, self.axis) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np, + 'Index': self.index_np}, + fetch_list=[out]) + out_ref = np.array( + np.take_along_axis(self.x_np, self.index_np, self.axis)) + for out in res: + self.assertEqual(np.allclose(out, out_ref, rtol=1e-03), True) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + x_tensor = paddle.to_tensor(self.x_np) + self.index = paddle.to_tensor(self.index_np) + out = paddle.take_along_axis(x_tensor, self.index, self.axis) + out_ref = np.array( + np.take_along_axis(self.x_np, self.index_np, self.axis)) + self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-03), True) + paddle.enable_static() + + +@unittest.skip(reason="Skip unsupported ut, need paddle support cann 5.0.4+") +class TestTakeAlongAxisAPICase1(TestTakeAlongAxisAPI): + def setUp(self): + np.random.seed(0) + self.shape = [2, 2] + self.index_shape = [4, 2] + self.index_np = np.array( + [[0, 0], [1, 0], [0, 0], [1, 0]]).astype('int64') + self.x_np = np.random.random(self.shape).astype(np.float32) + self.place = paddle.NPUPlace(0) + self.axis = 0 + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 13c72bedefa8e..f7a3dfa1102b2 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -872,7 +872,7 @@ def cal_python_api(python_api, args, kernel_sig): eager_tensor_outputs = egr_oups if egr_oups else self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: @@ -906,7 +906,7 @@ def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: @@ -2016,7 +2016,7 @@ def _get_dygraph_grad(self, outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py index cc40b89b585cb..bf33adcf48655 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py +++ b/python/paddle/fluid/tests/unittests/seresnext_test_base.py @@ -25,7 +25,7 @@ def _compare_result_with_origin_model(self, check_func, use_device, delta2=1e-5, - compare_seperately=True): + compare_separately=True): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return @@ -45,7 +45,7 @@ def _compare_result_with_origin_model(self, batch_size=seresnext_net.batch_size(use_device), use_device=use_device) - if compare_seperately: + if compare_separately: for loss in zip(func_1_first_loss, func_2_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(func_1_last_loss, func_2_last_loss): diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py 
b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py index b57f26776234e..4dc3fe6eab6be 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py @@ -69,9 +69,9 @@ def __init__(self, super(ParallelFusedMultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py index 5f467da6a6465..ad570fc0acfb3 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py @@ -172,10 +172,10 @@ def __init__(self, name=None): super(ParallelFusedFeedForward, self).__init__() assert d_model > 0, ( - "Expected d_model to be greater than 0, but recieved {}".format( + "Expected d_model to be greater than 0, but received {}".format( d_model)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". format(dim_feedforward)) self._dtype = self._helper.get_default_dtype() diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 570551e82646f..919ae52447128 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -135,6 +135,34 @@ def test_grad(self): self.func(p) +class TestAbsDoubleGradCheck(unittest.TestCase): + def abs_wrapper(self, x): + return paddle.abs(x[0]) + + @prog_scope() + def func(self, place): + shape = [2, 3, 7, 9] + eps = 0.0005 + dtype = np.float64 + x = layers.data('x', shape, False, dtype=dtype) + x.persistable = True + y = paddle.abs(x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.002 + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.abs_wrapper, [x], y, x_init=x_arr, place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestReluDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): @@ -225,6 +253,9 @@ def test_grad(self): class TestCELUDoubleGradCheck(unittest.TestCase): + def celu_wrapper(self, x): + return paddle.nn.functional.celu(x[0], alpha=0.2) + @prog_scope() def func(self, place): shape = [2, 4, 4, 4] @@ -241,6 +272,8 @@ def func(self, place): x_arr = np.random.uniform(-1, 1, shape).astype(dtype) gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.celu_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -252,6 +285,9 @@ def test_grad(self): class TestSqrtDoubleGradCheck(unittest.TestCase): + def sqrt_wrapper(self, 
x): + return paddle.sqrt(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -266,6 +302,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.sqrt_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -277,6 +315,9 @@ def test_grad(self): class TestRsqrtDoubleGradCheck(unittest.TestCase): + def rsqrt_wrapper(self, x): + return paddle.rsqrt(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -291,6 +332,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.rsqrt_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -302,6 +345,9 @@ def test_grad(self): class TestSquareDoubleGradCheck(unittest.TestCase): + def square_wrapper(self, x): + return paddle.square(x[0]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. @@ -316,6 +362,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.square_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 58d8610ee352d..7be3b300d55a1 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2003,6 +2003,7 @@ def setUp(self): self.op_type = "celu" self.init_dtype() + self.python_api = paddle.nn.functional.celu np.random.seed(1024) x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) alpha = 1.5 @@ -2014,7 +2015,7 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestCELUAPI(unittest.TestCase): @@ -2080,6 +2081,11 @@ def test_errors(self): name='x_fp16', shape=[10, 12], dtype='float16') self.celu(x_fp16) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_dygraph_api() + self.test_errors() + class TestReciprocal(TestActivation): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index d2eef785f6e07..3e2f112e964bb 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -54,8 +54,8 @@ def adamw_step(inputs, attributes): moment1_out = beta1 * moment1 + (1 - beta1) * grad moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) - lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) - param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + denom = (np.sqrt(moment2_out) / np.sqrt(1.0 - beta2_pow)) + epsilon + param_out = param + ((moment1_out / denom) * (-(lr / (1.0 - beta1_pow)))) return param_out, moment1_out, moment2_out @@ -314,16 +314,16 @@ def simple_lr_setting(param, decay_rate, n_layers): "core is not compiled with CUDA") class TestAdamWOpLayerwiseLR(TestAdamWOp): def setUp(self): - random.seed(2021) - np.random.seed(2021) - paddle.seed(2021) + random.seed(2022) + np.random.seed(2022) + paddle.seed(2022) def test_adamw_op_dygraph(self): paddle.disable_static() - value = 
np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) - linear1 = paddle.nn.Linear(13, 8) - linear2 = paddle.nn.Linear(8, 5) + linear1 = paddle.nn.Linear( + 13, 8, bias_attr=paddle.nn.initializer.Constant(value=1.0)) + linear2 = paddle.nn.Linear( + 8, 5, bias_attr=paddle.nn.initializer.Constant(value=1.0)) # fix the linear name, simple_lr_setting function will use the name linear1.weight.name = "linear_1.w_0" @@ -331,33 +331,103 @@ def test_adamw_op_dygraph(self): linear2.weight.name = "linear_2.w_0" linear2.bias.name = "linear_2.b_0" + fc1_w = np.array(linear1.weight) + fc1_w_mon1 = np.zeros_like(fc1_w) + fc1_w_mon2 = np.zeros_like(fc1_w) + fc1_b = np.array(linear1.bias) + fc1_b_mon1 = np.zeros_like(fc1_b) + fc1_b_mon2 = np.zeros_like(fc1_b) + + fc2_w = np.array(linear2.weight) + fc2_w_mon1 = np.zeros_like(fc2_w) + fc2_w_mon2 = np.zeros_like(fc2_w) + fc2_b = np.array(linear2.bias) + fc2_b_mon1 = np.zeros_like(fc2_b) + fc2_b_mon2 = np.zeros_like(fc2_b) + simple_lr_fun = partial(simple_lr_setting, decay_rate=0.8, n_layers=2) + learning_rate = 0.001 + weight_decay = 0.01 + beta1 = 0.9 + beta2 = 0.999 - adam = paddle.optimizer.AdamW( - learning_rate=0.01, + opt = paddle.optimizer.AdamW( + learning_rate=learning_rate, parameters=[{ 'params': linear1.parameters() }, { 'params': linear2.parameters(), }], apply_decay_param_fun=lambda name: True, - weight_decay=0.01, + weight_decay=weight_decay, lr_ratio=simple_lr_fun) - loss_ref = np.array( - [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625]) + def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t): + np_inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1**t]).astype("float32"), + 'Beta2Pow': np.array([beta2**t]).astype("float32") + } + + np_attrs = { + 'epsilon': 1e-8, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": lr_ratio, + "coeff": weight_decay, + "with_decay": True + } + param_out, moment1_out, moment2_out = adamw_step(np_inputs, + np_attrs) + return param_out, moment1_out, moment2_out + for i in range(5): + a = paddle.to_tensor( + np.random.uniform(-1, 1, (2, 13)).astype("float32")) a1 = linear1(a) out = linear2(a1) out = paddle.mean(out) out.backward() - adam.step() - adam.clear_gradients() - np.testing.assert_allclose(out[0].numpy(), loss_ref[i], rtol=1e-6) + + fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output( + fc1_w, + np.array(linear1.weight.grad), fc1_w_mon1, fc1_w_mon2, + simple_lr_fun(linear1.weight), i + 1) + fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output( + fc1_b, + np.array(linear1.bias.grad), fc1_b_mon1, fc1_b_mon2, + simple_lr_fun(linear1.bias), i + 1) + fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output( + fc2_w, + np.array(linear2.weight.grad), fc2_w_mon1, fc2_w_mon2, + simple_lr_fun(linear2.weight), i + 1) + fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output( + fc2_b, + np.array(linear2.bias.grad), fc2_b_mon1, fc2_b_mon2, + simple_lr_fun(linear2.bias), i + 1) + + opt.step() + opt.clear_gradients() + + np.testing.assert_allclose(linear1.weight.numpy(), fc1_w, rtol=1e-6) + np.testing.assert_allclose(linear1.bias.numpy(), fc1_b, rtol=1e-6) + np.testing.assert_allclose(linear2.weight.numpy(), fc2_w, rtol=1e-6) + np.testing.assert_allclose(linear2.bias.numpy(), fc2_b, rtol=1e-6) def test_adamw_op(self): paddle.enable_static() place = fluid.CUDAPlace(0) + + learning_rate = 0.0001 + beta1 = 0.85 + beta2 = 0.95 + weight_decay = 0.01 + epsilon = 1e-8 + train_prog = 
fluid.Program() startup = fluid.Program() with fluid.program_guard(train_prog, startup): @@ -365,42 +435,121 @@ def test_adamw_op(self): x = fluid.data(name='x', shape=[None, 10], dtype='float32') y = fluid.data(name='y', shape=[None, 1], dtype='float32') - fc1 = fluid.layers.fc(input=x, size=32, act=None) - prediction = fluid.layers.fc(input=fc1, size=1, act=None) - cost = fluid.layers.square_error_cost(input=prediction, label=y) + weight_attr1 = paddle.framework.ParamAttr(name="linear_0.w_0") + bias_attr1 = paddle.framework.ParamAttr( + name="linear_0.b_0", + initializer=paddle.nn.initializer.Constant(value=1.0)) + weight_attr2 = paddle.framework.ParamAttr(name="linear_1.w_0") + bias_attr2 = paddle.framework.ParamAttr( + name="linear_1.b_0", + initializer=paddle.nn.initializer.Constant(value=1.0)) + linear1 = paddle.nn.Linear( + 10, 32, weight_attr=weight_attr1, bias_attr=bias_attr1) + linear2 = paddle.nn.Linear( + 32, 1, weight_attr=weight_attr2, bias_attr=bias_attr2) + + out = linear1(x) + out = linear2(out) + + fc1_w_mon1 = np.zeros((linear1.weight.shape)).astype("float32") + fc1_w_mon2 = np.zeros((linear1.weight.shape)).astype("float32") + fc1_b_mon1 = np.zeros((linear1.bias.shape)).astype("float32") + fc1_b_mon2 = np.zeros((linear1.bias.shape)).astype("float32") + fc2_w_mon1 = np.zeros((linear2.weight.shape)).astype("float32") + fc2_w_mon2 = np.zeros((linear2.weight.shape)).astype("float32") + fc2_b_mon1 = np.zeros((linear2.bias.shape)).astype("float32") + fc2_b_mon2 = np.zeros((linear2.bias.shape)).astype("float32") + + cost = fluid.layers.square_error_cost(input=out, label=y) avg_cost = fluid.layers.mean(cost) simple_lr_fun = partial( simple_lr_setting, decay_rate=0.8, n_layers=2) - beta1 = fluid.layers.create_global_var( - shape=[1], value=0.85, dtype='float32', persistable=True) - beta2 = fluid.layers.create_global_var( - shape=[1], value=0.95, dtype='float32', persistable=True) - betas = [beta1, beta2] opt = paddle.optimizer.AdamW( - learning_rate=1e-5, + learning_rate=learning_rate, beta1=beta1, beta2=beta2, - weight_decay=0.01, - epsilon=1e-8, + weight_decay=weight_decay, + epsilon=epsilon, lr_ratio=simple_lr_fun) opt.minimize(avg_cost) + def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t): + np_inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1**t]).astype("float32"), + 'Beta2Pow': np.array([beta2**t]).astype("float32") + } + + np_attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": lr_ratio, + "coeff": weight_decay, + "with_decay": True + } + param_out, moment1_out, moment2_out = adamw_step(np_inputs, + np_attrs) + return param_out, moment1_out, moment2_out + + fetch_list1 = [ + "linear_0.w_0", "linear_0.b_0", "linear_1.w_0", "linear_1.b_0" + ] + fetch_list2 = [ + "linear_0.w_0", "linear_0.w_0@GRAD", "linear_0.b_0", + "linear_0.b_0@GRAD", "linear_1.w_0", "linear_1.w_0@GRAD", + "linear_1.b_0", "linear_1.b_0@GRAD" + ] + exe = fluid.Executor(place) exe.run(startup) + test_prog = train_prog.clone(for_test=True) - loss_ref = np.array( - [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702]) for i in range(5): inputs = np.random.random(size=[8, 10]).astype('float32') outputs = np.random.random(size=[8, 1]).astype('float32') - rets = exe.run(train_prog, - feed={"x": inputs, - "y": outputs}, - fetch_list=[avg_cost]) - assert rets[0] is not None - np.testing.assert_allclose(rets[0], loss_ref[i], rtol=1e-6) + 
+ param = exe.run(test_prog, + feed={"x": inputs, + "y": outputs}, + fetch_list=fetch_list1) + params_and_gras = exe.run(train_prog, + feed={"x": inputs, + "y": outputs}, + fetch_list=fetch_list2) + + fc1_w = param[0] + fc1_w_grad = params_and_gras[1] + fc1_b = param[1] + fc1_b_grad = params_and_gras[3] + fc2_w = param[2] + fc2_w_grad = params_and_gras[5] + fc2_b = param[3] + fc2_b_grad = params_and_gras[7] + + fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output( + fc1_w, fc1_w_grad, fc1_w_mon1, fc1_w_mon2, + simple_lr_fun(linear1.weight), i + 1) + fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output( + fc1_b, fc1_b_grad, fc1_b_mon1, fc1_b_mon2, + simple_lr_fun(linear1.bias), i + 1) + fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output( + fc2_w, fc2_w_grad, fc2_w_mon1, fc2_w_mon2, + simple_lr_fun(linear2.weight), i + 1) + fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output( + fc2_b, fc2_b_grad, fc2_b_mon1, fc2_b_mon2, + simple_lr_fun(linear2.bias), i + 1) + + np.testing.assert_allclose(params_and_gras[0], fc1_w, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[2], fc1_b, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[4], fc2_w, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[6], fc2_b, rtol=1e-6) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_bfgs.py b/python/paddle/fluid/tests/unittests/test_bfgs.py index 1a12913bc72e9..8a9f9f72aa068 100644 --- a/python/paddle/fluid/tests/unittests/test_bfgs.py +++ b/python/paddle/fluid/tests/unittests/test_bfgs.py @@ -22,9 +22,6 @@ from paddle.incubate.optimizer.functional.bfgs import minimize_bfgs from paddle.fluid.framework import _test_eager_guard -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() - np.random.seed(123) diff --git a/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py b/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py index 40bacaf59d2f3..d3bcd0a7e6985 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py +++ b/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py @@ -29,6 +29,10 @@ def _setup_config(self): def test_sendrecv(self): self.check_with_place("collective_sendrecv_op.py", "sendrecv") + def test_sendrecv_dynamic_shape(self): + self.check_with_place("collective_sendrecv_op_dynamic_shape.py", + "sendrecv_dynamic_shape") + def test_sendrecv_array(self): self.check_with_place("collective_sendrecv_op_array.py", "sendrecv_array") diff --git a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py index a4ef15b1f0db3..b9e9224b9e402 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py @@ -27,6 +27,9 @@ class TestConvTransposeDoubleGradCheck(unittest.TestCase): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose(x[0], x[1], groups=1) + @prog_scope() def func(self, place): shape = [2, 4, 3, 3] @@ -55,6 +58,11 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) def test_grad(self): places = [] @@ -67,6 +75,10 @@ def test_grad(self): class TestConvTranspose2DoubleGradCheck_AsyPadding( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return 
paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding=[1, 0, 0, 1]) + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -100,10 +112,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_PaddingSAME( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding="SAME") + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -137,10 +158,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_PaddingVALID( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding="VALID") + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -174,10 +204,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_ChannelLast( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding=[1, 1], data_format="NHWC") + @prog_scope() def func(self, place): shape = [2, 3, 3, 2] @@ -213,6 +252,11 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_corr.py b/python/paddle/fluid/tests/unittests/test_corr.py index 99fd21c047b07..1e1dd3b369584 100644 --- a/python/paddle/fluid/tests/unittests/test_corr.py +++ b/python/paddle/fluid/tests/unittests/test_corr.py @@ -18,9 +18,15 @@ import six import paddle import warnings +import sys + +np_minor_version = int((np.__version__).split('.')[1]) def numpy_corr(np_arr, rowvar=True, dtype='float64'): + # np.corrcoef support parameter 'dtype' since 1.20 + if np_minor_version < 20: + return np.corrcoef(np_arr, rowvar=rowvar) return np.corrcoef(np_arr, rowvar=rowvar, dtype=dtype) diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 6033b809f218d..14a91b0c2c5fe 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -25,6 +25,8 @@ import paddle.fluid.core as core +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Only test cuda Random Generator") class TestGeneratorSeed(unittest.TestCase): """ Test cases for cpu generator seed. 
@@ -70,15 +72,13 @@ def test_generator_gaussian_random_dygraph(self): """Test Generator seed.""" fluid.enable_dygraph() - paddle.seed(12312321111) - x = fluid.layers.gaussian_random([120], dtype="float32") - st1 = paddle.get_cuda_rng_state() - x1 = fluid.layers.gaussian_random([120], dtype="float32") - paddle.set_cuda_rng_state(st1) - x2 = fluid.layers.gaussian_random([120], dtype="float32") - paddle.seed(12312321111) - x3 = fluid.layers.gaussian_random([120], dtype="float32") - x_np = x.numpy() + st = paddle.get_cuda_rng_state() + x1 = paddle.randn([120], dtype="float32") + paddle.set_cuda_rng_state(st) + x2 = paddle.randn([120], dtype="float32") + paddle.set_cuda_rng_state(st) + x3 = paddle.randn([120], dtype="float32") + x1_np = x1.numpy() x2_np = x2.numpy() x3_np = x3.numpy() @@ -86,7 +86,7 @@ def test_generator_gaussian_random_dygraph(self): if core.is_compiled_with_cuda(): print(">>>>>>> gaussian random dygraph >>>>>>>") self.assertTrue(np.allclose(x1_np, x2_np)) - self.assertTrue(np.allclose(x_np, x3_np)) + self.assertTrue(np.allclose(x2_np, x3_np)) def test_generator_randint_dygraph(self): """Test Generator seed.""" diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 348945b73e1a4..5ef5a1016cc8b 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -24,6 +24,7 @@ import numpy as np import os import shutil +import tempfile import unittest @@ -82,12 +83,17 @@ def test_run_with_dump(self): """ Testcase for InMemoryDataset from create to run. """ - with open("test_run_with_dump_a.txt", "w") as f: + + temp_dir = tempfile.TemporaryDirectory() + dump_a_path = os.path.join(temp_dir.name, 'test_run_with_dump_a.txt') + dump_b_path = os.path.join(temp_dir.name, 'test_run_with_dump_b.txt') + + with open(dump_a_path, "w") as f: data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_run_with_dump_b.txt", "w") as f: + with open(dump_b_path, "w") as f: data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -110,8 +116,7 @@ def test_run_with_dump(self): parse_content=True, fea_eval=True, candidate_size=10000) - dataset.set_filelist( - ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"]) + dataset.set_filelist([dump_a_path, dump_b_path]) dataset.load_into_memory() dataset.local_shuffle() @@ -129,8 +134,7 @@ def test_run_with_dump(self): except Exception as e: self.assertTrue(False) - os.remove("./test_run_with_dump_a.txt") - os.remove("./test_run_with_dump_b.txt") + temp_dir.cleanup() def test_dataset_config(self): """ Testcase for dataset configuration. """ diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index 5911ada1817b6..911bee69e8b77 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -25,6 +25,7 @@ import math import os import shutil +import tempfile import unittest import paddle.fluid.incubate.data_generator as dg @@ -282,7 +283,11 @@ def test_var_consistency_insepection(self): """ Testcase for InMemoryDataset of consistency insepection of use_var_list and data_generator. 
""" - with open("test_run_with_dump_a.txt", "w") as f: + + temp_dir = tempfile.TemporaryDirectory() + dump_a_path = os.path.join(temp_dir.name, 'test_run_with_dump_a.txt') + + with open(dump_a_path, "w") as f: # data = "\n" # data += "\n" data = "2 1;1 9;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;0;40000001;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000200;10000200;10063938;10000008;10000177;20002001 20001240 20001860 20003611 20010833 20000210 20000500 20000401 20000251 20012198 20001023 20000157;20002001 20001240 20001860 20003611 20012396 20000500 20002513 20012198 20001023 20000157;10000123;30000004;0.623 0.233 0.290 0.208 0.354 49.000 0.000 0.000 0.000 -1.000 0.569 0.679 0.733 53 17 2 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000047;30000004;0.067 0.000 0.161 0.005 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.378 0.043 0 6 0 0;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;10000200;30000001;0.407 0.111 0.196 0.095 0.181 49.000 0.000 0.000 0.000 -1.000 0.306 0.538 0.355 48 8 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;10000200;30000001;0.226 0.029 0.149 0.031 0.074 49.000 0.000 0.000 0.000 -1.000 0.220 0.531 0.286 26 6 0 0;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;10063938;30000001;0.250 0.019 0.138 0.012 0.027 49.000 0.000 0.000 0.000 -1.000 0.370 0.449 0.327 7 2 0 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000003;30000002;0.056 0.000 0.139 0.003 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.346 0.059 15 3 0 0;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;10000008;30000001;0.166 0.004 0.127 0.001 0.004 49.000 0.000 0.000 0.000 -1.000 0.103 0.417 0.394 10 3 0 0;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000177;30000001;0.094 0.008 0.157 0.012 0.059 49.000 0.000 0.000 0.000 -1.000 0.051 0.382 0.142 21 0 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;20002001 20001240 20001860 20003611 20000157 20001776 20000070 
20000157;10000134;30000001;0.220 0.016 0.181 0.037 0.098 49.000 0.000 0.000 0.000 -1.000 0.192 0.453 0.199 17 1 0 0;20002001 20001240 20001860 20003611 20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000638;30000001;0.000 0.000 0.000 0.000 0.000 49.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0 0 0 0;\n" @@ -348,7 +353,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=0) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) print("case 1: check passed!") except Exception as e: print("warning: catch expected error") @@ -360,7 +365,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=2) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 2 catch expected error") print(e) @@ -371,7 +376,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=3) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 3 catch expected error") print(e) @@ -382,7 +387,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=4) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 4 catch expected error") print(e) @@ -393,13 +398,13 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=5) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 5 catch expected error") print(e) print("========================================") - os.remove("./test_run_with_dump_a.txt") + temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 11972059c832c..4f21b3220a9d3 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -1461,6 +1461,7 @@ def check_with_place(self, need_envs={}, log_name=""): if self._dygraph and (self._gloo_mode or self._nccl2_mode): + need_envs.update({"FLAGS_enable_eager_mode": "1"}) with _test_eager_guard(): self.check_with_place_func( model_file=model_file, @@ -1468,6 +1469,7 @@ def check_with_place(self, check_error_log=check_error_log, need_envs=need_envs, log_name=log_name) + need_envs.update({"FLAGS_enable_eager_mode": "0"}) self.check_with_place_func( model_file=model_file, delta=delta, diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py index e664face0483a..0a51045dee5e1 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -24,9 +25,10 @@ class 
TestDygraphGroupSharded(TestMultipleGpus): # check group sharded logic as well as the accuracy with single mode def test_dygraph_group_sharded(self): - self.run_mnist_2gpu('dygraph_group_sharded_api.py') + self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False) self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py index deb180a2fe179..50e1985138610 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py @@ -24,7 +24,8 @@ class TestDygraphShardingOptimizerStage2(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_optimizer_stage2(self): - self.run_mnist_2gpu('dygraph_sharding_optimizer_stage2.py') + self.run_mnist_2gpu( + 'dygraph_sharding_optimizer_stage2.py', eager_mode=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py index b7a5f9c9701c1..866577ea7aa8c 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -25,12 +26,14 @@ class TestDygraphShardingStage2(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage2(self): self.run_mnist_2gpu('dygraph_group_sharded_stage2.py') - self.run_mnist_2gpu('dygraph_sharding_stage2.py') + self.run_mnist_2gpu('dygraph_sharding_stage2.py', eager_mode=False) def test_dygraph_sharding_stage2_offload(self): self.run_mnist_2gpu('dygraph_group_sharded_stage2_offload.py') - self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py') + self.run_mnist_2gpu( + 'dygraph_sharding_stage2_offload.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py index f69b52cae528a..c1f5e06f42b53 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -25,12 +26,14 @@ class TestDygraphShardingStage3(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage3(self): self.run_mnist_2gpu('dygraph_group_sharded_stage3.py') - self.run_mnist_2gpu('dygraph_sharding_stage3.py') + self.run_mnist_2gpu('dygraph_sharding_stage3.py', eager_mode=False) def test_dygraph_sharding_stage3_offload(self): self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py') - self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py') + self.run_mnist_2gpu( + 'dygraph_sharding_stage3_offload.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py 
b/python/paddle/fluid/tests/unittests/test_einsum.py index 43b5ce96a3901..26aaf0f44f1d2 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum.py +++ b/python/paddle/fluid/tests/unittests/test_einsum.py @@ -18,6 +18,9 @@ import paddle from paddle.fluid import core +import os +os.environ['FLAGS_new_einsum'] = "0" + class TestErrors(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py index 565e43214ea32..1a4ae54afefe2 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py @@ -34,7 +34,11 @@ def setUp(self): self.operands.append(("x" + str(idx), inp)) self.inputs = {"Operands": self.operands} self.attrs = {"equation": self.equation} - self.outputs = {'Out': out} + self.outputs = { + 'Out': out, + "InnerCache": [('cache_' + str(i), np.array([1.0])) + for i in range(len(self.operands))] + } def init_input(self): self.inputs = [] @@ -49,7 +53,7 @@ def set_mandatory(self): def test_check_output(self): if not self.disable: - self.check_output() + self.check_output(no_check_set=["InnerCache"]) def test_grad(self): if not self.disable: diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index 63acaf6396913..c58d46edde753 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -464,5 +464,19 @@ def test_static_graph(self): self.check_output_equal(a, e) +class TestStaticGraphShape(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def test_shape(self): + A = paddle.static.data(name='x', shape=[-1]) + B = paddle.static.data(name='y', shape=[384]) + C = paddle.einsum('i,d->id', A, B) + self.assertEqual(C.shape, (-1, 384)) + + if __name__ == "__main__": - u + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py new file mode 100644 index 0000000000000..8a8e74e28ec72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py @@ -0,0 +1,169 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest +import paddle + + +class TestElementwiseOp(OpTest): + def setUp(self): + self.op_type = "elementwise_heaviside" + x = np.random.random((13, 17)).astype("float64") + y = np.random.random((13, 17)).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.heaviside(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + +class TestHeavisideBroadcast(unittest.TestCase): + def setUp(self): + self.input_1 = np.random.rand(2, 100, 13, 17).astype("float32") + self.input_2 = np.random.rand(100, 13, 17).astype("float32") + self.input_3 = np.random.rand(100, 13, 1).astype("float32") + self.input_4 = np.random.rand(13, 17).astype("float32") + self.input_5 = np.random.rand(1).astype("float32") + + self.np_expected1 = np.heaviside(self.input_1, self.input_2) + self.np_expected2 = np.heaviside(self.input_2, self.input_3) + self.np_expected3 = np.heaviside(self.input_2, self.input_4) + self.np_expected4 = np.heaviside(self.input_4, self.input_5) + + def test_broadcast(self): + paddle.disable_static() + self.tensor_1 = paddle.to_tensor(self.input_1) + self.tensor_2 = paddle.to_tensor(self.input_2) + self.tensor_3 = paddle.to_tensor(self.input_3) + self.tensor_4 = paddle.to_tensor(self.input_4) + self.tensor_5 = paddle.to_tensor(self.input_5) + + res = paddle.heaviside(self.tensor_1, self.tensor_2) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected1)) + + res = paddle.heaviside(self.tensor_2, self.tensor_3) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected2)) + + res = paddle.heaviside(self.tensor_2, self.tensor_4) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected3)) + + res = paddle.heaviside(self.tensor_4, self.tensor_5) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected4)) + + +class TestHeavisideAPI_float64(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("float64") + self.y_np = np.random.random((13, 17)).astype("float64") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "float64" + + def test_static(self): + for use_cuda in ([False, True] + if paddle.device.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.static.data( + name=f"x_{self.dtype}", shape=[13, 17], dtype=self.dtype) + y = paddle.static.data( + name=f"y_{self.dtype}", shape=[13, 17], dtype=self.dtype) + out = paddle.heaviside(x, y) + + exe = paddle.static.Executor(place=place) + res = exe.run(prog, + feed={ + f"x_{self.dtype}": self.x_np, + f"y_{self.dtype}": self.y_np + }, + fetch_list=out, + use_prune=True) + + self.assertTrue(np.allclose(res, self.out_np)) + + def test_dygraph(self): + for use_cuda in ([False, True] + if paddle.device.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + result = paddle.heaviside( + paddle.to_tensor(self.x_np), paddle.to_tensor(self.y_np)) + + self.assertTrue(np.allclose(result.numpy(), self.out_np)) + + +class 
TestHeavisideAPI_float32(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("float32") + self.y_np = np.random.random((13, 17)).astype("float32") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "float32" + + +class TestHeavisideAPI_int64(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("int64") + self.y_np = np.random.random((13, 17)).astype("int64") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "int64" + + +class TestHeavisideAPI_int32(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("int32") + self.y_np = np.random.random((13, 17)).astype("int32") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "int32" + + +class TestHeavisideError(unittest.TestCase): + def test_input(self): + paddle.disable_static() + + def test_input_x(): + paddle.heaviside(1, paddle.randn([100])) + + self.assertRaises(ValueError, test_input_x) + + def test_input_y(): + paddle.heaviside(paddle.randn([100]), 1) + + self.assertRaises(ValueError, test_input_y) + + def test_input_xy(): + paddle.heaviside( + paddle.randn([100], 'float32'), paddle.randn([100], 'float64')) + + self.assertRaises(ValueError, test_input_xy) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_empty_op.py b/python/paddle/fluid/tests/unittests/test_empty_op.py index b8ff66a910ece..371c59a1b8cce 100644 --- a/python/paddle/fluid/tests/unittests/test_empty_op.py +++ b/python/paddle/fluid/tests/unittests/test_empty_op.py @@ -232,28 +232,33 @@ def test_static_graph(self): name="shape_tensor_int32", shape=[2], dtype="int32") shape_tensor_int64 = fluid.data( name="shape_tensor_int64", shape=[2], dtype="int64") + shape_tensor_unknown = fluid.data( + name="shape_tensor_unknown", shape=[-1], dtype="int64") out_1 = paddle.empty(shape=[200, 3], dtype=dtype) out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype) out_3 = paddle.empty(shape=shape_tensor_int64, dtype=dtype) out_4 = paddle.empty(shape=[200, positive_2_int32], dtype=dtype) out_5 = paddle.empty(shape=[200, positive_2_int64], dtype=dtype) + out_6 = paddle.empty(shape=shape_tensor_unknown, dtype=dtype) place = paddle.CPUPlace() exe = paddle.static.Executor(place) - res_1, res_2, res_3, res_4, res_5 = exe.run( + res_1, res_2, res_3, res_4, res_5, res_6 = exe.run( fluid.default_main_program(), feed={ "shape_tensor_int32": np.array([200, 3]).astype("int32"), "shape_tensor_int64": np.array([200, 3]).astype("int64"), + "shape_tensor_unknown": np.array([200, 3]).astype("int64"), }, - fetch_list=[out_1, out_2, out_3, out_4, out_5]) + fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6]) self.__check_out__(res_1, dtype) self.__check_out__(res_2, dtype) self.__check_out__(res_3, dtype) self.__check_out__(res_4, dtype) self.__check_out__(res_5, dtype) + self.__check_out__(res_6, dtype) class TestEmptyError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py index 87c4656cfa809..a460c5f252777 100755 --- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -22,8 +22,7 @@ import paddle import paddle.nn as nn from paddle.dataset.common import DATA_HOME -from paddle.fluid.framework import core, _non_static_mode, _enable_legacy_dygraph -_enable_legacy_dygraph() +from 
paddle.fluid.framework import core, _non_static_mode, _test_eager_guard from paddle.fluid.layer_helper import LayerHelper from paddle import _C_ops @@ -151,13 +150,12 @@ def predict(self, data): class TestBertTokenizerOp(unittest.TestCase): def setUp(self): self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") - self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab) - self.init_data() self.save_path = os.path.join(DATA_HOME, "fast_tokenizer") self.param_path = os.path.join(self.save_path, "model.pdparams") self.inference_path = os.path.join(self.save_path, "inference") def init_data(self): + self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab) self.text = [ '选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。' '酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,' @@ -179,8 +177,8 @@ def init_data(self): self.texts_tensor = to_string_tensor(self.texts, "texts") self.text_pairs_tensor = to_string_tensor(self.text_pairs, "text_pairs") - def test_padding(self): - + def run_padding(self): + self.init_data() self.max_seq_len = 128 self.pad_to_max_seq_len = True self.is_split_into_words = False @@ -283,7 +281,13 @@ def test_padding(self): np.allclose( token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) - def test_no_padding(self): + def test_padding(self): + with _test_eager_guard(): + self.run_padding() + self.run_padding() + + def run_no_padding(self): + self.init_data() self.max_seq_len = 128 self.pad_to_max_seq_len = False self.is_split_into_words = False @@ -336,7 +340,13 @@ def test_no_padding(self): np.allclose( token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) - def test_is_split_into_words(self): + def test_no_padding(self): + with _test_eager_guard(): + self.run_no_padding() + self.run_no_padding() + + def run_is_split_into_words(self): + self.init_data() self.is_split_into_words = True input_ids, token_type_ids = self.faster_tokenizer( @@ -355,7 +365,13 @@ def test_is_split_into_words(self): np.allclose( token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + def test_is_split_into_words(self): + with _test_eager_guard(): + self.run_is_split_into_words() + self.run_is_split_into_words() + def test_inference(self): + self.init_data() if not os.path.exists(self.save_path): os.makedirs(self.save_path, exist_ok=True) paddle.save(self.faster_tokenizer.state_dict(), self.param_path) @@ -383,6 +399,7 @@ def test_inference(self): token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) def test_feed_string_var(self): + self.init_data() paddle.enable_static() x = paddle.static.data( name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py index 4655b628dab4d..f382d61c63743 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py @@ -52,7 +52,7 @@ def test_ps_rolemaker(self): self.assertTrue(ro.is_server()) self.assertEqual(ro.worker_num(), 2) - def test_traing_role(self): + def test_training_role(self): """Test training role.""" os.environ["TRAINING_ROLE"] = "TEST" ro = role_maker.PaddleCloudRoleMaker(is_collective=False) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index 5e8be9a852273..86ee0db30ef8c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py +++ 
b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -116,7 +116,7 @@ def test_ps_rolemaker(self): self.assertEqual(ro._all_gather(1, "worker"), 1) self.assertEqual(ro._all_reduce(1, "sum", "worker"), 1) - def test_traing_role(self): + def test_training_role(self): """Test training role.""" os.environ["TRAINING_ROLE"] = "TEST" diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 0ae005430e03b..28e03fdfd70e1 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py index 108469cf8a732..723c4609bc96b 100644 --- a/python/paddle/fluid/tests/unittests/test_full_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_op.py @@ -80,8 +80,10 @@ def test_api_eager(self): with fluid.dygraph.base.guard(): with _test_eager_guard(): positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2) - positive_2_int64 = fluid.layers.fill_constant([1], "int64", 2) + positive_4_int64 = fluid.layers.fill_constant([1], "int64", 4, + True) + out_1 = paddle.full( shape=[1, 2], dtype="float32", fill_value=1.1) @@ -108,8 +110,19 @@ def test_api_eager(self): shape=[1], dtype=np.float32, value=1.1) out_7 = paddle.full( shape=[1, 2], dtype=np.float32, fill_value=val) + + out_8 = paddle.full( + shape=positive_2_int32, dtype="float32", fill_value=1.1) + + out_9 = paddle.full( + shape=[ + positive_2_int32, positive_2_int64, positive_4_int64 + ], + dtype="float32", + fill_value=1.1) + # test for numpy.float64 as fill_value - out_8 = paddle.full_like( + out_10 = paddle.full_like( out_7, dtype=np.float32, fill_value=np.abs(1.1)) assert np.array_equal( @@ -133,8 +146,12 @@ def test_api_eager(self): assert np.array_equal( out_7, np.full( [1, 2], 1.1, dtype="float32")) + assert np.array_equal(out_8, np.full([2], 1.1, dtype="float32")) + assert np.array_equal( + out_9, np.full( + [2, 2, 4], 1.1, dtype="float32")) assert np.array_equal( - out_8, np.full( + out_10, np.full( [1, 2], 1.1, dtype="float32")) diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index a3ae2a20dba23..67160f59952ef 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -26,7 +26,8 @@ from paddle.fluid import layers import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph +_enable_legacy_dygraph() default_main_program().random_seed = 42 diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index a533b5d87a5a9..8c68eb243aea8 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -23,7 +23,8 @@ from paddle.nn.layer.common import Linear, Dropout 
import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph +_enable_legacy_dygraph() class TestFusedFFNOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index 8f77972de8656..67f382a439d8c 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -109,6 +109,7 @@ def config(self): self.x_type = np.float32 self.attn_mask_type = np.float64 + #self.attn_mask_type = np.bool self.pre_layer_norm = True self.has_attn_mask = True @@ -168,6 +169,11 @@ def generate_input_data(self): self.attn_mask = (self.attn_mask - 1.0) * 1e4 else: self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e4 + elif self.attn_mask_type == np.bool: + if self.has_cache_kv and not self.gen_cache_kv: + self.attn_mask[:, :, :, -2] = 0 + else: + self.attn_mask = np.tril(self.attn_mask) else: raise ValueError( "'attn_mask_type' should be 'int64' or 'float64'.") @@ -394,7 +400,7 @@ def GetFusedMultiTransformerOut(self): epsilon = 1e-05 ln2_epsilon = 1e-05 - if attn_mask is not None: + if attn_mask is not None and self.attn_mask_type != np.bool: attn_mask = _convert_attention_mask(attn_mask, x.dtype) qkv_weights, qkv_biases = [], [] diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 7984ca5571658..20a55af15c441 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -162,7 +162,7 @@ def check_clip_result(self, out, out_clip): "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}". 
format(u, v, u - v)) - # test whether the ouput is right when use 'set_gradient_clip' + # test whether the output is right when use 'set_gradient_clip' def test_old_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -172,7 +172,7 @@ def func(params_grads): self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_new_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -181,7 +181,7 @@ def func(params_grads): self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) - # test whether the ouput is right when use grad_clip under float64 + # test whether the output is right when use grad_clip under float64 def test_new_gradient_clip_fp64(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -267,7 +267,7 @@ def check_clip_result(self, out, out_clip): a=u, b=v, rtol=1e-5, atol=1e-8), "gradient clip by norm has wrong results!") - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) @@ -311,7 +311,7 @@ def check_clip_result(self, out, out_clip): a=u, b=v, rtol=1e-6, atol=1e-8), "gradient clip by value has wrong results!") - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) @@ -397,7 +397,7 @@ def check_clip_result(self, loss, optimizer): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expected:%f, but received:%f" % (a, b)) @@ -426,7 +426,7 @@ def check_clip_result(self, loss, optimizer): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by norm has wrong results, expected:%f, but received:%f" % (a, b)) @@ -517,7 +517,7 @@ def test_gradient_clip(self): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-3, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expected:%f, but received:%f" % (a, b)) @@ -563,7 +563,7 @@ def test_gradient_clip(self): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expected:%f, but received:%f" % (a, b)) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 965ae65614a40..51ff8ec943d01 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -198,7 +198,7 @@ def test_check_grad(self): @skip_check_grad_ci( - reason="For 'TestHSigmoidOpSparse', check_grad is is separately calculated by 'TestHSigmoidOpWithSparseGrad'." + reason="For 'TestHSigmoidOpSparse', check_grad is separately calculated by 'TestHSigmoidOpWithSparseGrad'."
) class TestHSigmoidOpSparse(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 60dd4948f996e..96a818549e700 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -321,6 +321,70 @@ def step_lr(epoch_num, learning_rate, step_size, gamma=0.1, verbose=False): return learning_rate * math.pow(gamma, epoch_num // step_size) +def one_cycle_lr(epoch_num, + max_learning_rate, + total_steps, + divide_factor=25, + end_learning_rate=0.0001, + phase_pct=0.3, + anneal_strategy='cos', + three_phase=False, + verbose=False): + initial_lr = max_learning_rate / divide_factor + if three_phase: + _end_steps = [ + float(phase_pct * total_steps) - 1, + float(2 * phase_pct * total_steps) - 2, total_steps - 1 + ] + _schedule_phases = [ + { + 'start_lr': initial_lr, + 'end_lr': max_learning_rate, + }, + { + 'start_lr': max_learning_rate, + 'end_lr': initial_lr, + }, + { + 'start_lr': initial_lr, + 'end_lr': end_learning_rate, + }, + ] + else: + _end_steps = [float(phase_pct * total_steps) - 1, total_steps - 1] + _schedule_phases = [ + { + 'start_lr': initial_lr, + 'end_lr': max_learning_rate, + }, + { + 'start_lr': max_learning_rate, + 'end_lr': end_learning_rate, + }, + ] + + if anneal_strategy == 'cos': + + def anneal_func(start, end, pct): + cos_out = math.cos(math.pi * pct) + 1 + return end + (start - end) / 2.0 * cos_out + else: + + def anneal_func(start, end, pct): + return (end - start) * pct + start + + start_step = 0 + for i, phase in enumerate(_schedule_phases): + end_step = _end_steps[i] + if epoch_num <= end_step or i == len(_schedule_phases) - 1: + pct = (epoch_num - start_step) / (end_step - start_step) + computed_lr = anneal_func(phase['start_lr'], phase['end_lr'], pct) + break + start_step = end_step + + return computed_lr + + class TestLRScheduler(unittest.TestCase): def _test_static(self, python_func, paddle_api, kwarg, place): scheduler = paddle_api(**kwarg) @@ -467,6 +531,33 @@ def test_scheduler(self): with self.assertRaises(ValueError): paddle.optimizer.lr.MultiStepDecay( learning_rate=0.5, milestones=[1, 2, 3], gamma=2) + with self.assertRaises(TypeError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate='test', total_steps=20) + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=-1.5, total_steps=20) + with self.assertRaises(TypeError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps=20, end_learning_rate='test') + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps=20, end_learning_rate=-1) + with self.assertRaises(TypeError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps='test') + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps=-10) + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, total_steps=20, anneal_strategy='test') + with self.assertRaises(ValueError): + paddle.optimizer.lr.OneCycleLR( + max_learning_rate=0.1, + total_steps=20, + phase_pct=0.6, + three_phase=True) func_api_kwargs = [(noam_lr, paddle.optimizer.lr.NoamDecay, { "d_model": 0.01, @@ -527,6 +618,38 @@ def test_scheduler(self): "learning_rate": 0.5, "T_max": 10, "verbose": False + }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, { + "max_learning_rate": 0.1, + "total_steps": 20, + 
"divide_factor": 5, + "end_learning_rate": 0.0001, + "anneal_strategy": 'cos', + "phase_pct": 0.3, + "three_phase": False, + }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, { + "max_learning_rate": 0.5, + "total_steps": 20, + "divide_factor": 10, + "end_learning_rate": 0.001, + "anneal_strategy": 'linear', + "phase_pct": 0.4, + "three_phase": False, + }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, { + "max_learning_rate": 1.0, + "total_steps": 20, + "divide_factor": 9, + "end_learning_rate": 0.0001, + "anneal_strategy": 'cos', + "phase_pct": 0.3, + "three_phase": True, + }), (one_cycle_lr, paddle.optimizer.lr.OneCycleLR, { + "max_learning_rate": 0.3, + "total_steps": 20, + "divide_factor": 25, + "end_learning_rate": 0.0005, + "anneal_strategy": 'linear', + "phase_pct": 0.2, + "three_phase": True, })] for python_func, paddle_api, kwarg in func_api_kwargs: diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 3e06b69278d34..f6f62045b19f9 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -495,6 +495,58 @@ def test_dygraph_fp16(self): y = paddle.to_tensor(input_y) result = paddle.matmul(x, y) + def test_compute_type_fp32(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + with fluid.dygraph.guard(place): + paddle.set_flags({ + 'FLAGS_gemm_use_half_precision_compute_type': False + }) + input_x = np.random.random([2, 8, 16]).astype("float16") + input_y = np.random.random([2, 16, 8]).astype("float16") + for i in range(0, 16, 2): + input_x[:, :, i] += 60000 + input_x[:, :, i + 1] -= 60000 + input_y[:, :, :] = 1.5 + + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + result_np = np.matmul(input_x, input_y) + self.assertTrue(paddle.isfinite(result)[0, 0, 0]) + self.assertTrue(np.isfinite(result_np)[0, 0, 0]) + self.assertTrue(np.array_equal(result_np, result.numpy())) + paddle.set_flags({ + 'FLAGS_gemm_use_half_precision_compute_type': True + }) + + def test_compute_type_fp16_nan(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + with fluid.dygraph.guard(place): + paddle.set_flags({ + 'FLAGS_gemm_use_half_precision_compute_type': True + }) + input_x = np.random.random([2, 8, 16]).astype("float16") + input_y = np.random.random([2, 16, 8]).astype("float16") + for i in range(0, 16, 2): + input_x[:, :, i] += 60000 + input_x[:, :, i + 1] -= 60000 + input_y[:, :, :] = 1.5 + + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + result_np = np.matmul(input_x, input_y) + self.assertFalse( + paddle.isfinite(result)[0, 0, 0]) # contains nan/inf + self.assertTrue(np.isfinite(result_np)[0, 0, 0]) + paddle.set_flags({ + 'FLAGS_gemm_use_half_precision_compute_type': False + }) + def test_api_eager_dygraph(self): with _test_eager_guard(): self.test_dygraph() diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 55f87540c1b8a..4685b00b394b7 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -157,6 +157,9 @@ def test_grad(self): class TestTileDoubleGradCheck(unittest.TestCase): + def tile_wrapper(self, x): + return paddle.tile(x[0], [4, 9]) + @prog_scope() def func(self, place): x_shape = [3, 12] @@ 
-171,6 +174,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.tile_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -181,6 +186,9 @@ def test_grad(self): class TestExpandV2DoubleGradCheck(unittest.TestCase): + def expand_wrapper(self, x): + return paddle.expand(x[0], [4, 12]) + @prog_scope() def func(self, place): x_shape = [1, 12] @@ -195,6 +203,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.expand_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -205,6 +215,10 @@ def test_grad(self): class TestSqueezeDoubleGradCheck(unittest.TestCase): + def squeeze_wrapper(self, x): + axes = [0, 2] + return paddle.squeeze(x[0], axes) + @prog_scope() def func(self, place): x_shape = [1, 3, 1, 40] @@ -219,6 +233,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.squeeze_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -229,6 +245,10 @@ def test_grad(self): class TestUnsqueezeDoubleGradCheck(unittest.TestCase): + def unsqueeze_wrapper(self, x): + axes = [1, 2] + return paddle.unsqueeze(x[0], axes) + @prog_scope() def func(self, place): x_shape = [3, 40] @@ -243,6 +263,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.unsqueeze_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -253,6 +275,9 @@ def test_grad(self): class TestClipDoubleGradCheck(unittest.TestCase): + def clip_wrapper(self, x): + return paddle.clip(x[0], min=-1., max=1.)
+ @prog_scope() def func(self, place): x_shape = [2, 4, 10] @@ -264,6 +289,8 @@ def func(self, place): x_arr = np.random.uniform(-5., 5., x_shape).astype(dtype) gradient_checker.double_grad_check([x], out, x_init=x_arr, place=place) + gradient_checker.double_grad_check_for_dygraph( + self.clip_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -318,6 +345,10 @@ def test_grad(self): class TestConstantPadDoubleGradCheck(unittest.TestCase): + def pad_wrapper(self, x): + pad = [1, 1, 1, 1] + return paddle.nn.functional.pad(x[0], pad) + @prog_scope() def func(self, place): x_shape = [2, 3, 4, 5] @@ -332,6 +363,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.pad_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -357,6 +390,9 @@ def func(self, place): class TestConcatDoubleGradCheck(unittest.TestCase): + def concat_wrapper(self, x): + return paddle.concat(x, axis=0) + @prog_scope() def func(self, place): x_shape = [2, 3, 4, 5] @@ -373,6 +409,11 @@ def func(self, place): gradient_checker.double_grad_check( [x1, x2], out, x_init=[x1_arr, x2_arr], place=place) + gradient_checker.double_grad_check_for_dygraph( + self.concat_wrapper, [x1, x2], + out, + x_init=[x1_arr, x2_arr], + place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -407,6 +448,10 @@ def test_grad(self): class TestAvgPool2DDoubleGradCheckCase2(unittest.TestCase): + def pool2d_wrapper(self, x): + return paddle.nn.functional.avg_pool2d( + x[0], kernel_size=2, data_format="NHWC") + @prog_scope() def func(self, place): input_NHWC = fluid.layers.data( @@ -416,13 +461,16 @@ def func(self, place): dtype="float32") input_NHWC.persistable = True - y = layers.pool2d( - input_NHWC, pool_size=2, pool_type="avg", data_format="NHWC") + y = paddle.nn.functional.avg_pool2d( + input_NHWC, kernel_size=2, data_format="NHWC") x_arr = np.random.uniform(-1, 1, [2, 5, 5, 3]).astype(np.float32) gradient_checker.double_grad_check( [input_NHWC], y, x_init=x_arr, place=place, eps=0.05) + gradient_checker.double_grad_check_for_dygraph( + self.pool2d_wrapper, [input_NHWC], y, x_init=x_arr, place=place) + def test_grad(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): @@ -432,6 +480,10 @@ def test_grad(self): class TestAvgPool2DDoubleGradCheckCase3(unittest.TestCase): + def pool2d_wrapper(self, x): + return paddle.nn.functional.avg_pool2d( + x[0], kernel_size=2, padding=[1, 1]) + @prog_scope() def func(self, place): input_NCHW = fluid.layers.data( @@ -441,12 +493,14 @@ def func(self, place): dtype="float32") input_NCHW.persistable = True - y = layers.pool2d( - input_NCHW, pool_size=2, pool_type="avg", pool_padding=[1, 1]) + y = paddle.nn.functional.avg_pool2d( + input_NCHW, kernel_size=2, padding=[1, 1]) x_arr = np.random.uniform(-1, 1, [2, 3, 5, 5]).astype(np.float32) gradient_checker.double_grad_check( [input_NCHW], y, x_init=x_arr, place=place, eps=0.05) + gradient_checker.double_grad_check_for_dygraph( + self.pool2d_wrapper, [input_NCHW], y, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -457,6 +511,9 @@ def test_grad(self): class TestAvgPool2DDoubleGradCheckCase4(unittest.TestCase): + def pool2d_wrapper(self, x): + return paddle.nn.functional.avg_pool2d(x[0], kernel_size=[4, 4]) + @prog_scope() def func(self, place): input_NCHW = fluid.layers.data( @@ -467,10 +524,13 @@ def func(self, 
place): input_NCHW.persistable = True y = layers.pool2d(input_NCHW, pool_size=[4, 4], pool_type="avg") + y = paddle.nn.functional.avg_pool2d(input_NCHW, kernel_size=[4, 4]) x_arr = np.random.uniform(-1, 1, [2, 3, 5, 5]).astype(np.float32) gradient_checker.double_grad_check( [input_NCHW], y, x_init=x_arr, place=place, eps=0.05) + gradient_checker.double_grad_check_for_dygraph( + self.pool2d_wrapper, [input_NCHW], y, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_onnx_export.py b/python/paddle/fluid/tests/unittests/test_onnx_export.py index 5efd586d849d1..07016d4290102 100644 --- a/python/paddle/fluid/tests/unittests/test_onnx_export.py +++ b/python/paddle/fluid/tests/unittests/test_onnx_export.py @@ -21,7 +21,7 @@ import paddle from paddle.static import InputSpec -from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import in_dygraph_mode, _test_eager_guard class LinearNet(paddle.nn.Layer): @@ -45,43 +45,46 @@ def forward(self, x, y, z): class TestExportWithTensor(unittest.TestCase): - def setUp(self): + def func_with_tensor(self): self.x_spec = paddle.static.InputSpec( shape=[None, 128], dtype='float32') - - def test_with_tensor(self): - if in_dygraph_mode(): - return model = LinearNet() paddle.onnx.export(model, 'linear_net', input_spec=[self.x_spec]) + def test_with_tensor(self): + with _test_eager_guard(): + self.func_with_tensor() + self.func_with_tensor() + class TestExportWithTensor1(unittest.TestCase): - def setUp(self): + def func_with_tensor(self): self.x = paddle.to_tensor(np.random.random((1, 128))) - - def test_with_tensor(self): - if in_dygraph_mode(): - return model = LinearNet() paddle.onnx.export(model, 'linear_net', input_spec=[self.x]) + def test_with_tensor(self): + with _test_eager_guard(): + self.func_with_tensor() + self.func_with_tensor() + class TestExportPrunedGraph(unittest.TestCase): - def setUp(self): + def func_prune_graph(self): + model = Logic() self.x = paddle.to_tensor(np.array([1])) self.y = paddle.to_tensor(np.array([-1])) - - def test_prune_graph(self): - if in_dygraph_mode(): - return - model = Logic() paddle.jit.to_static(model) out = model(self.x, self.y, z=True) paddle.onnx.export( model, 'pruned', input_spec=[self.x], output_spec=[out]) + def test_prune_graph(self): + # test eager + with _test_eager_guard(): + self.func_prune_graph() + self.func_prune_graph() + if __name__ == '__main__': - if not in_dygraph_mode(): - unittest.main() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index ba1e9be815de6..a0c5ce77f1d25 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -1123,7 +1123,7 @@ def test_api_eager_dygraph(self): class TestMasterWeightSaveForFP16(unittest.TestCase): ''' - For Amp-O2, some optimizer(Momentum, Adam ...) will create master weights for parameters to to improve the accuracy. + For Amp-O2, some optimizer(Momentum, Adam ...) will create master weights for parameters to improve the accuracy. Master weights will be saved by optimizer::state_dict. 
''' diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py index b7e8e06029d93..503bd9d0f9797 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -25,7 +26,10 @@ class TestHybridParallel(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_hybrid_parallel_sharding_logic(self): self.run_mnist_2gpu('hybrid_parallel_sharding_model.py') + self.run_mnist_2gpu( + 'hybrid_parallel_sharding_model.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py index 20a5fcb7af3b1..9b48a87bff7b9 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py @@ -32,7 +32,7 @@ def test_seresnext_with_learning_rate_decay(self): self._compare_result_with_origin_model( check_func, use_device=DeviceType.CPU, - compare_seperately=False, + compare_separately=False, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py index 9d1364cc592fe..ff529ce94bd25 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py @@ -30,7 +30,7 @@ def test_seresnext_with_learning_rate_decay(self): optimizer=seresnext_net.optimizer, use_parallel_executor=False) self._compare_result_with_origin_model( - check_func, use_device=DeviceType.CUDA, compare_seperately=False) + check_func, use_device=DeviceType.CUDA, compare_separately=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py index 8f46119d551c6..04772a2da2871 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline.py @@ -63,7 +63,7 @@ def test_dist_train_one_device(self): "pipeline_mnist_one_device.py", check_error_log=True, log_name=flag_name, - need_envs=self.need_envs()) + need_envs={"PADDLE_MANUAL_PIPELINE_STAGE": "0"}) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index dacb7a5b59957..3621fd1b9d445 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -629,7 +629,6 @@ def _calc_output(self, place, mode="test", dygraph=True): else: fluid.disable_dygraph() gen = paddle.seed(self._random_seed) - gen._is_init_py = False paddle.framework.random._manual_program_seed(self._random_seed) scope = fluid.core.Scope() with fluid.scope_guard(scope): diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index 365d3f931c27c..c0157c5b9068c 100644 --- 
a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -51,7 +51,9 @@ def write_file(name, ct): def get_files(pth, prefix): return [ - f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix) + f for f in listdir(pth) + if isfile(join(pth, f)) and f.startswith(prefix) and f != + f"{prefix}.gpu.log" ] @@ -93,7 +95,7 @@ def test_collective_3(self): shutil.rmtree('./log') port = random.randrange(6000, 8000) - args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( + args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --nnodes 2".format( port) p1 = self.pdrun(args) p2 = self.pdrun(args) @@ -141,7 +143,7 @@ def test_ps_3(self): shutil.rmtree('./log') port = random.randrange(6000, 8000) - args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( + args = "--job_id ps3 --master 127.0.0.1:{} --nnodes 2 --server_num=1 --trainer_num=1".format( port) p1 = self.pdrun(args) p2 = self.pdrun(args) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py b/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py deleted file mode 100644 index b4abbd56303ff..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import unittest -import numpy as np -import paddle -from paddle.fluid.framework import _test_eager_guard - - -class TestSparseActivation(unittest.TestCase): - def test_sparse_relu(self): - with _test_eager_guard(): - x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]] - - def dense_relu(x): - dense_x = paddle.to_tensor( - x, dtype='float32', stop_gradient=False) - dense_relu = paddle.nn.ReLU() - dense_out = dense_relu(dense_x) - dense_out.backward(dense_out) - return dense_out, dense_x.grad - - dense_x = paddle.to_tensor(x, dtype='float32', stop_gradient=False) - sparse_dim = 2 - sparse_x = dense_x.to_sparse_coo(sparse_dim) - sparse_relu = paddle.sparse.ReLU() - sparse_out = sparse_relu(sparse_x) - sparse_out.backward(sparse_out) - - dense_out, dense_x_grad = dense_relu(x) - assert np.array_equal(dense_out.numpy(), - sparse_out.to_dense().numpy()) - assert np.array_equal(dense_x_grad.numpy(), - sparse_x.grad.to_dense().numpy()) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py new file mode 100644 index 0000000000000..573cc5ba8cf5d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +from typing import Union, Callable +import numpy as np +import paddle +from paddle.fluid.framework import _test_eager_guard +from paddle import _C_ops + + +class TestSparseUnary(unittest.TestCase): + def assert_raises_on_dense_tensor(self, sparse_func): + with _test_eager_guard(): + dense_x = paddle.ones((2, 3)) + with self.assertRaises(ValueError): + sparse_func(dense_x) + + def compare_with_dense( + self, + x, + to_sparse: Callable[[paddle.Tensor], paddle.Tensor], + dense_func: Callable[[paddle.Tensor], paddle.Tensor], + sparse_func: Callable[[paddle.Tensor], paddle.Tensor], + test_gradient: bool, ): + def tensor_allclose(dense_tensor: paddle.Tensor, + sparse_tensor: paddle.Tensor): + dense_numpy = dense_tensor.numpy() + mask = ~np.isnan(dense_numpy) + return np.allclose(dense_numpy[mask], + sparse_tensor.to_dense().numpy()[mask]) + + with _test_eager_guard(): + dense_x = paddle.to_tensor( + x, dtype="float32", stop_gradient=not test_gradient) + + sparse_x = to_sparse(dense_x) + sparse_out = sparse_func(sparse_x) + + dense_x = paddle.to_tensor( + x, dtype="float32", stop_gradient=not test_gradient) + dense_out = dense_func(dense_x) + + assert tensor_allclose(dense_out, sparse_out) + + if test_gradient: + dense_out.backward(dense_out) + sparse_out.backward(sparse_out) + assert tensor_allclose(dense_x.grad, sparse_x.grad) + + def test_sparse_relu(self): + x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.nn.ReLU(), + paddle.sparse.ReLU(), + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.nn.ReLU(), + paddle.sparse.ReLU(), + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.ReLU()) + + def test_sparse_sqrt(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.sqrt, + paddle.sparse.sqrt, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.sqrt, + paddle.sparse.sqrt, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.sqrt) + + def test_sparse_sin(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.sin, + paddle.sparse.sin, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.sin, + paddle.sparse.sin, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.sin) + + def test_sparse_tanh(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, -4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.tanh, + paddle.sparse.tanh, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.tanh, + paddle.sparse.tanh, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.tanh) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py 
index 29f3308988f6d..04b140cba4c0e 100644 --- a/python/paddle/fluid/tests/unittests/test_tensordot.py +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle -import unittest -import paddle.fluid.core as core -import numpy as np import itertools as it +import numpy as np +import unittest -np.set_printoptions(threshold=np.inf) +import paddle +import paddle.fluid.core as core def tensordot_np(x, y, axes): @@ -68,9 +67,16 @@ def tensordot_np(x, y, axes): class TestTensordotAPI(unittest.TestCase): def setUp(self): + self.set_place() self.set_dtype() self.set_input_shape() self.set_input_data() + self.set_test_axes() + + def set_place(self): + self.places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(core.CUDAPlace(0)) def set_dtype(self): self.dtype = np.float32 @@ -82,124 +88,83 @@ def set_input_shape(self): def set_input_data(self): self.x = np.random.random(self.x_shape).astype(self.dtype) self.y = np.random.random(self.y_shape).astype(self.dtype) - self.all_axes = [2] - def run_dygraph(self, place): - paddle.disable_static() - x = paddle.to_tensor(self.x, place=place) - y = paddle.to_tensor(self.y, place=place) - paddle_res = paddle.tensordot(x, y, self.axes) - np_res = tensordot_np(self.x, self.y, self.axes) - np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) - - def run_static(self, place): - paddle.enable_static() - with paddle.static.program_guard(paddle.static.Program(), - paddle.static.Program()): - x = paddle.static.data( - name='x', shape=self.x_shape, dtype=self.dtype) - y = paddle.static.data( - name='y', shape=self.y_shape, dtype=self.dtype) - z = paddle.tensordot(x, y, self.axes) - exe = paddle.static.Executor(place) - paddle_res = exe.run(feed={'x': self.x, - 'y': self.y}, - fetch_list=[z]) - np_res = tensordot_np(self.x, self.y, self.axes) - np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) - - def test_cases(self): - self.all_axes = [] - axial_index = range(4) - all_permutations = list(it.permutations(axial_index, 0)) + list( - it.permutations(axial_index, 1)) + list( - it.permutations(axial_index, 2)) + list( - it.permutations(axial_index, 3)) + list( - it.permutations(axial_index, 4)) - self.all_axes.extend(list(i) for i in all_permutations) - - for axes_x in all_permutations: - for axes_y in all_permutations: - if len(axes_x) < len(axes_y): - supplementary_axes_x = axes_x + axes_y[len(axes_x):] - if any( - supplementary_axes_x.count(i) > 1 - for i in supplementary_axes_x): - continue - elif len(axes_y) < len(axes_x): - supplementary_axes_y = axes_y + axes_x[len(axes_y):] - if any( - supplementary_axes_y.count(i) > 1 - for i in supplementary_axes_y): - continue - self.all_axes.append([list(axes_x), list(axes_y)]) - - self.all_axes.extend(range(5)) - - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - - for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) - self.run_static(place) - - -class TestTensordotAPIFloat64(TestTensordotAPI): - def set_dtype(self): - self.dtype = np.float64 - - -class TestTensordotAPIAxesType(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [3, 4, 4] - self.y_shape = [4, 4, 5] - - def test_cases(self): + def set_test_axes(self): self.all_axes = [ - 0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )), ( - (1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 
1]), - [[1, 2], [0, 1]] + [[3, 2], [3]], [[2, 1, 0], [2, 1]], [[1, 2, 0], [1, 3, 2]], [3, 0], + [[], [0, 3, 1]], [[2, 1, 0, 3], [2, 0, 1, 3]], + [[3, 1, 2], [1, 3, 2, 0]], [[2, 1], [0, 2]], [[2, 0, 1, 3], [2]], + [[1, 2, 0, 3], [0, 2, 1]], [[2, 1, 3, 0], [1, 2, 3]], + [[2, 0, 1, 3], [3, 1, 0, 2]], [[0, 3], [0, 3, 2, 1]], + [[1, 3, 2, 0], [2, 1, 0, 3]], [[1, 3, 2, 0], [1, 3, 2, 0]], + [[1, 0, 2], [0, 1]], [[2, 3, 0], [3, 1]], + [[1, 3, 2, 0], [3, 0, 1, 2]], [[3, 2, 1], [2, 0, 1]], [[0], []], + [[2, 3, 0], [1, 2, 0]], [[3, 0, 2, 1], [2, 1, 0, 3]], + [[3, 1, 2], [2, 3, 1]], [[1, 0, 2, 3], []], [[1, 2], [1, 2, 3]], + [[2, 0, 1, 3], [2, 0, 1]], [[3, 1, 2], [1, 3, 2]], + [[3, 1, 2, 0], [1, 2, 3, 0]], [[0, 2, 3], [0, 1, 2]], + [[3, 2, 0], [2, 0, 3, 1]], [[2, 1, 0, 3], [3, 1, 2, 0]], + [[1, 2, 3, 0], [1, 3, 0, 2]], [[3, 0], [2, 1]], + [[0, 1, 3, 2], [0, 2, 1, 3]], [[1, 0], [2, 1, 3]], + [[1, 0, 3, 2], [2, 3, 0, 1]], [[1, 2], [3]], + [[1, 2, 3, 0], [3, 2, 1, 0]], [[0, 3, 2, 1], [2, 1, 3, 0]], [0], + [[0, 2, 3], [3, 2, 0, 1]], [[1, 2, 3, 0], [3, 2, 1, 0]], + [[3, 1], [3]], [[3, 2, 0, 1], [3, 2, 0]], [[2, 3, 0, 1], [0, 3, 2]], + [[1], [1, 3]], [[1, 2], [2, 1, 0]], [[3, 1, 2], [3, 1, 0]], + [[1, 3], [3, 1, 2]], [[2, 0, 1, 3], [3, 1, 0, 2]], + [[1, 3, 0], [1, 3]], [[2, 3, 1], [1, 0, 2]], + [[1, 2, 0, 3], [0, 2, 1, 3]], [[2], [0, 1, 3]], [[1], [1, 2]], + [[1, 0, 2, 3], [3, 0, 1, 2]], [[0, 1, 3, 2], [1, 3, 0, 2]], + [[3, 0, 2, 1], [0, 2, 3]], [[1, 2, 0], [1, 2, 3]], + [[1, 0, 3], [2, 3, 0]], [[2, 3, 0], [3, 1, 0]], [[1, 3], [1, 0]], + [[2, 1, 0, 3], [2, 0, 3, 1]], [[3, 2, 0], [2, 1, 0]], + [[0, 1, 3], [0, 3, 1]], [[3, 1, 0], [3, 2, 1]], [[3, 2], [3, 1]], + [[3], [2, 1, 0]], [[1, 2, 3, 0], []], [[1, 3, 2, 0], [3, 1, 2]], + [[1], [0, 2]], [[3, 2, 0], [3, 2, 0]], [[3], []], + [[1, 0, 3], [2, 1]], [[3, 1, 0, 2], [2, 3, 1, 0]], + [[0, 1], [0, 3, 2]], [[0, 2, 3], [0, 2, 1]], [[1, 3, 0], [3, 0, 2]], + [[3, 1, 2], [1, 2, 3]], [[3, 1, 2], [3, 1, 0]], + [[0, 3, 1, 2], [3, 2, 1, 0]], [[0, 3], [3, 2, 1]], + [[2, 3], [1, 3, 0]], [[0, 3, 2], [2, 0, 3, 1]], [[2, 3], [1, 3]], + [[3, 1, 2, 0], [2, 3, 1, 0]], [[1, 0, 3, 2], [3, 0, 1, 2]], + [[3, 2, 1, 0], [0, 1, 3, 2]], [[3, 1, 2], [3]], + [[0, 1, 3, 2], [2, 3, 0, 1]], [[1, 2, 3, 0], [1, 3, 0, 2]], + [3, 1, 2], [[3, 1, 2], [0, 3, 2]], [[2, 3, 0], [1, 2, 0]], + [[2, 0, 3], [2, 0]], [[3, 1, 0, 2], [3, 1, 0, 2]], + [[0, 1, 2], [2, 0, 1]], [[1, 0, 3], [2, 3, 0]], + [[2, 0, 1], [0, 1, 3]], [[2, 1], [0, 1, 3]] ] - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - - for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) - self.run_static(place) - - # The 'axes' with type 'Tensor' in tensordot is not available in static mode - paddle.disable_static() - for place in places: - self.all_axes = [ - paddle.to_tensor([1]), (paddle.to_tensor([1])), - (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), - [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], - paddle.to_tensor([[1, 2], [0, 1]]) - ] - for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) - - def test_error(self): - self.all_axes = [[[[0], [1]]], 0.1, -1, 100, [[1, 2], [0, 0]], - [[1, 2], [0, -1]], [0, 1, 2, 3]] + def test_dygraph(self): paddle.disable_static() - x = paddle.to_tensor(self.x) - y = paddle.to_tensor(self.y) for axes in self.all_axes: - with self.assertRaises(BaseException): - paddle.tensordot(x, y, axes) + for place in self.places: + x = paddle.to_tensor(self.x, place=place) + y 
= paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, axes) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) + + def test_static(self): + paddle.enable_static() + for axes in self.all_axes: + for place in self.places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.static.data( + name='x', shape=self.x_shape, dtype=self.dtype) + y = paddle.static.data( + name='y', shape=self.y_shape, dtype=self.dtype) + z = paddle.tensordot(x, y, axes) + exe = paddle.static.Executor(place) + paddle_res = exe.run(feed={'x': self.x, + 'y': self.y}, + fetch_list=[z]) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) -class TestTensordotAPIAxesTypeFloat64(TestTensordotAPIAxesType): +class TestTensordotAPIFloat64(TestTensordotAPI): def set_dtype(self): self.dtype = np.float64 @@ -234,5 +199,51 @@ def set_input_shape(self): self.y_shape = [5, 5, 1, 5] +class TestTensordotAPIAxesType(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [3, 4, 4] + self.y_shape = [4, 4, 5] + + def set_test_axes(self): + self.all_axes = [ + 0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )), ( + (1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 1]), + [[1, 2], [0, 1]] + ] + + def test_tensor_axes(self): + # The 'axes' with type 'Tensor' in tensordot is not available in static mode + paddle.disable_static() + tensor_axes = [ + paddle.to_tensor([1]), (paddle.to_tensor([1])), + (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), + [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], + paddle.to_tensor([[1, 2], [0, 1]]) + ] + + for place in self.places: + for axes in tensor_axes: + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, axes) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) + + def test_error(self): + self.all_axes = [[[[0], [1]]], 0.1, -1, 100, [[1, 2], [0, 0]], + [[1, 2], [0, -1]], [0, 1, 2, 3]] + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = paddle.to_tensor(self.y) + for axes in self.all_axes: + with self.assertRaises(BaseException): + paddle.tensordot(x, y, axes) + + +class TestTensordotAPIAxesTypeFloat64(TestTensordotAPIAxesType): + def set_dtype(self): + self.dtype = np.float64 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tril_indices_op.py b/python/paddle/fluid/tests/unittests/test_tril_indices_op.py new file mode 100644 index 0000000000000..29b07a5fb8463 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tril_indices_op.py @@ -0,0 +1,127 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
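+# The cases below compare paddle.tril_indices(rows, cols, offset) against
+# numpy.tril_indices(rows, offset, cols) through the OpTest machinery as well
+# as the static-graph and dygraph Python APIs, and check that invalid
+# rows/cols/offset arguments raise TypeError.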
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard + + +class TestTrilIndicesOp(OpTest): + def setUp(self): + self.op_type = "tril_indices" + self.inputs = {} + self.init_config() + self.outputs = {'out': self.target} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + def init_config(self): + self.attrs = {'rows': 4, 'cols': 4, 'offset': -1} + self.target = np.tril_indices(self.attrs['rows'], self.attrs['offset'], + self.attrs['cols']) + self.target = np.array(self.target) + + +class TestTrilIndicesOpCase1(TestTrilIndicesOp): + def init_config(self): + self.attrs = {'rows': 0, 'cols': 0, 'offset': 0} + self.target = np.tril_indices(0, 0, 0) + self.target = np.array(self.target) + + +class TestTrilIndicesOpCase2(TestTrilIndicesOp): + def init_config(self): + self.attrs = {'rows': 4, 'cols': 4, 'offset': 2} + self.target = np.tril_indices(self.attrs['rows'], self.attrs['offset'], + self.attrs['cols']) + self.target = np.array(self.target) + + +class TestTrilIndicesAPICaseStatic(unittest.TestCase): + def test_static(self): + places = [ + paddle.CPUPlace(), paddle.fluid.CUDAPlace(0) + ] if fluid.core.is_compiled_with_cuda() else [paddle.CPUPlace()] + paddle.enable_static() + for place in places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + data1 = paddle.tril_indices(4, 4, -1) + exe1 = paddle.static.Executor(place) + result1 = exe1.run(feed={}, fetch_list=[data1]) + expected_result1 = np.tril_indices(4, -1, 4) + self.assertTrue(np.allclose(result1, expected_result1)) + + +class TestTrilIndicesAPICaseDygraph(unittest.TestCase): + def test_dygraph(self): + places = [ + paddle.CPUPlace(), paddle.fluid.CUDAPlace(0) + ] if fluid.core.is_compiled_with_cuda() else [paddle.CPUPlace()] + for place in places: + with fluid.dygraph.base.guard(place=place): + out1 = paddle.tril_indices(4, 4, 2) + expected_result1 = np.tril_indices(4, 2, 4) + self.assertEqual((out1.numpy() == expected_result1).all(), True) + + def test_dygraph_eager(self): + with _test_eager_guard(): + self.test_dygraph() + + +class TestTrilIndicesAPICaseError(unittest.TestCase): + def test_case_error(self): + def test_num_rows_type_check(): + out1 = paddle.tril_indices(1.0, 1, 2) + + self.assertRaises(TypeError, test_num_rows_type_check) + + def test_num_columns_type_check(): + out2 = paddle.tril_indices(4, -1, 2) + + self.assertRaises(TypeError, test_num_columns_type_check) + + def test_num_offset_type_check(): + out3 = paddle.tril_indices(4, 4, 2.0) + + self.assertRaises(TypeError, test_num_offset_type_check) + + +class TestTrilIndicesAPICaseDefault(unittest.TestCase): + def test_default_CPU(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + data = paddle.tril_indices(4, None, 2) + exe = paddle.static.Executor(paddle.CPUPlace()) + result = exe.run(feed={}, fetch_list=[data]) + expected_result = np.tril_indices(4, 2) + self.assertTrue(np.allclose(result, expected_result)) + + with fluid.dygraph.base.guard(paddle.CPUPlace()): + out = paddle.tril_indices(4, None, 2) + expected_result = np.tril_indices(4, 2) + self.assertEqual((out.numpy() == expected_result).all(), True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py 
b/python/paddle/fluid/tests/unittests/test_trunc_op.py index 5bb3e99ee302f..1a6790728b137 100644 --- a/python/paddle/fluid/tests/unittests/test_trunc_op.py +++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py @@ -30,7 +30,7 @@ class TestTruncOp(OpTest): def setUp(self): self.op_type = "trunc" self.python_api = paddle.trunc - self.dtype = np.float64 + self.init_dtype_type() np.random.seed(2021) self.inputs = {'X': np.random.random((20, 20)).astype(self.dtype)} self.outputs = {'Out': (np.trunc(self.inputs['X']))} @@ -48,11 +48,19 @@ def test_check_grad(self): class TestFloatTruncOp(TestTruncOp): def init_dtype_type(self): self.dtype = np.float32 + self.__class__.exist_fp64_check_grad = True + + def test_check_grad(self): + pass class TestIntTruncOp(TestTruncOp): def init_dtype_type(self): self.dtype = np.int32 + self.__class__.exist_fp64_check_grad = True + + def test_check_grad(self): + pass class TestTruncAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py index 2ba808a341e5e..5f4989f6c5dbd 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py @@ -178,7 +178,6 @@ class TestUniformRandomOpAPISeed(unittest.TestCase): def test_attr_tensor_API(self): _seed = 10 gen = paddle.seed(_seed) - gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 0b27c61623089..0bca3c08f3d78 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -370,7 +370,6 @@ class TestUniformRandomOp_API_seed(unittest.TestCase): def test_attr_tensor_API(self): _seed = 10 gen = paddle.seed(_seed) - gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py index f6dc3fba6a214..95ad254a6dfb0 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py @@ -116,7 +116,7 @@ def init_test_case(self): self.output_size = None -class TestUnpoolOpOuputsize(TestUnpoolOp): +class TestUnpoolOpOutputsize(TestUnpoolOp): def init_test_case(self): self.unpool2d_forward_naive = unpool2dmax_forward_naive self.unpooling_type = "max" @@ -127,7 +127,7 @@ def init_test_case(self): self.output_size = [9, 9] -class TestUnpoolOpOuput(TestUnpoolOp): +class TestUnpoolOpOutput(TestUnpoolOp): def init_test_case(self): self.unpool2d_forward_naive = unpool2dmax_forward_naive self.unpooling_type = "max" diff --git a/python/paddle/fluid/tests/unittests/test_zeros_op.py b/python/paddle/fluid/tests/unittests/test_zeros_op.py index 449f95aac297a..01d7107cfaeec 100644 --- a/python/paddle/fluid/tests/unittests/test_zeros_op.py +++ b/python/paddle/fluid/tests/unittests/test_zeros_op.py @@ -17,6 +17,7 @@ import numpy as np from op_test import OpTest import paddle +import paddle.compat as cpt import paddle.fluid.core as core from paddle.fluid.op import Operator import paddle.fluid as fluid @@ -96,9 +97,19 @@ def test_error2(): 
self.assertRaises(TypeError, test_error2) + def test_shape_errors(self): + with fluid.dygraph.guard(): + try: + shape = [-1, 5] + out = paddle.zeros(shape) + except Exception as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("expected to be no less than 0") > 0 + def test_eager(self): with _test_eager_guard(): self.test_errors() + self.test_shape_errors() if (__name__ == '__main__'): diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py index 23bbc377cae27..ea3264ba0dbb7 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py @@ -35,4 +35,5 @@ 'eigh', 'eigvalsh', 'class_center_sample', + 'einsum', ] diff --git a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py index d5f4cef5b8759..fb1cd35c45380 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py @@ -37,6 +37,7 @@ 'dot', 'elementwise_add', 'elementwise_div', + 'elementwise_heaviside', 'elementwise_max', 'elementwise_min', 'elementwise_mul', diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py index 7aaa78856811f..b0bb9a37c16bd 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py @@ -23,41 +23,52 @@ from paddle.fluid import compiler, Program, program_guard import paddle +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUAccuracyOp(OpTest): - def setUp(self): - self.op_type = "accuracy" - self.init_dtype() - n = 8192 - infer = np.random.random((n, 1)).astype(self.dtype) - indices = np.random.randint(0, 2, (n, 1)).astype('int64') - label = np.random.randint(0, 2, (n, 1)).astype('int64') - self.inputs = {'Out': infer, 'Indices': indices, "Label": label} - num_correct = 0 - for rowid in range(n): - for ele in indices[rowid]: - if ele == label[rowid]: - num_correct += 1 - break - self.outputs = { - 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), - 'Correct': np.array([num_correct]).astype("int32"), - 'Total': np.array([n]).astype("int32") - } - self.attrs = {'use_xpu': True} - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) +class XPUTestAccuracyOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'accuracy' + self.use_dynamic_create_class = False + + class TestXPUAccuracyOp(XPUOpTest): + def setUp(self): + self.op_type = "accuracy" + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': 
+ np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") + } + self.attrs = {'use_xpu': True} + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + +support_types = get_xpu_op_support_types('accuracy') +for stype in support_types: + create_test_class(globals(), XPUTestAccuracyOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py index c29150ef921c2..67fd9f871207b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py @@ -25,30 +25,43 @@ from paddle.fluid import core from paddle.fluid.op import Operator +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper -class TestSGDOp(OpTest): - def setUp(self): - self.op_type = "sgd" - self.conf() - w = np.random.random((self.h, self.w)).astype("float32") - g = np.random.random((self.h, self.w)).astype("float32") - lr = np.array([0.1]).astype("float32") - self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} - self.outputs = {'ParamOut': w - lr * g} +class XPUTestSgdOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sgd' + self.use_dynamic_create_class = False - def conf(self): - self.h = 102 - self.w = 105 + class TestSGDOp(XPUOpTest): + def setUp(self): + self.op_type = "sgd" + self.dtype = self.in_type + self.conf() + w = np.random.random((self.h, self.w)).astype(self.dtype) + g = np.random.random((self.h, self.w)).astype(self.dtype) + lr = np.array([0.1]).astype(self.dtype) - def test_check_output_with_place(self): - self.check_output_with_place(paddle.XPUPlace(0)) + self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} + self.outputs = {'ParamOut': w - lr * g} + def conf(self): + self.h = 102 + self.w = 105 -class TestSGDOpCase8X(TestSGDOp): - def conf(self): - self.h = 10 - self.w = 64 + def test_check_output_with_place(self): + self.check_output_with_place(paddle.XPUPlace(0)) + + class TestSGDOpCase8X(TestSGDOp): + def conf(self): + self.h = 10 + self.w = 64 + + +support_types = get_xpu_op_support_types('sgd') +for stype in support_types: + create_test_class(globals(), XPUTestSgdOp, stype) class TestSGDOpWithLargeInput(unittest.TestCase): diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index e899d267289d5..350b1f1567bd8 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -24,7 +24,7 @@ def set_default_dtype(d): """ - Set default dtype. The default dtype is initially float32 + Set default dtype. The default dtype is initially float32. Args: d(string|np.dtype): the dtype to make the default. 
It only
diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py
index c1891d24b88c9..8e8dd7855113b 100644
--- a/python/paddle/framework/io.py
+++ b/python/paddle/framework/io.py
@@ -1039,7 +1039,7 @@ def _legacy_load(path, **configs):
                                                       config)
     else:
         # load state dict by `io.save_params/persistables` save format
-        # TODO(chenweihang): [ Now only supports loading parameters seperately ]
+        # TODO(chenweihang): [ Now only supports loading parameters separately ]
         # If users save all parameters as one file, the [ variable.name -> variable ]
         # mapping info will lost, so users need to give variable list, but users build
         # variable list in dygraph mode is difficult, we recommend users to use
diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py
index a560072cf5a7b..b58d36b8e7d50 100644
--- a/python/paddle/framework/random.py
+++ b/python/paddle/framework/random.py
@@ -44,10 +44,8 @@ def seed(seed):
 
     if core.is_compiled_with_cuda():
         for i in range(core.get_cuda_device_count()):
-            core.default_cuda_generator(i)._is_init_py = True
             core.default_cuda_generator(i).manual_seed(seed)
 
-    core.default_cpu_generator()._is_init_py = True
     return core.default_cpu_generator().manual_seed(seed)
 
@@ -57,7 +55,7 @@ def get_cuda_rng_state():
     Get random state of cuda generators.
 
     Args:
-        None
+        None.
 
     Returns:
         GeneratorState: object.
@@ -80,13 +78,13 @@ def get_cuda_rng_state():
 
 def set_cuda_rng_state(state_list):
     """
-    Sets generator state for all cuda generators
+    Sets generator state for all cuda generators.
 
     Args:
        state_list(list|tuple): The cuda states to set back to cuda generators.
           state_list is obtained from get_cuda_rng_state().
 
     Returns:
-        None
+        None.
 
     Examples:
        .. code-block:: python
diff --git a/python/paddle/hapi/dynamic_flops.py b/python/paddle/hapi/dynamic_flops.py
index 077a70c91015c..4dd1aa03aa2ca 100644
--- a/python/paddle/hapi/dynamic_flops.py
+++ b/python/paddle/hapi/dynamic_flops.py
@@ -28,10 +28,10 @@ def flops(net, input_size, custom_ops=None, print_detail=False):
     Args:
         net (paddle.nn.Layer||paddle.static.Program): The network which could be a
            instance of paddle.nn.Layer in dygraph or paddle.static.Program in static graph.
-        input_size (list): size of input tensor. Note that the batch_size in argument 'input_size' only support 1.
+        input_size (list): size of input tensor. Note that the batch_size in argument ``input_size`` only supports 1.
        custom_ops (A dict of function, optional): A dictionary which key is the class of specific operation such as
           paddle.nn.Conv2D and the value is the function used to count the FLOPs of this operation. This
-          argument only work when argument 'net' is an instance of paddle.nn.Layer. The details could be found
+          argument only works when argument ``net`` is an instance of paddle.nn.Layer. The details could be found
           in following example code. Default is None.
        print_detail (bool, optional): Whether to print the detail information, like FLOPs per layer, about the net FLOPs.
           Default is False.
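A minimal usage sketch for the `flops` API whose docstring is touched above; the toy network, shapes, and variable names are invented for illustration and are not part of this patch:

.. code-block:: python

    import paddle
    import paddle.nn as nn

    # A small, made-up network; any paddle.nn.Layer works here.
    net = nn.Sequential(
        nn.Conv2D(1, 6, 3, stride=1, padding=1),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(6 * 28 * 28, 10))

    # The batch dimension in input_size must be 1, as the docstring notes.
    # print_detail=True additionally prints a per-layer FLOPs table.
    total_flops = paddle.flops(net, input_size=[1, 1, 28, 28], print_detail=True)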
diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index c17a56fc28d88..a7a5e59f39409 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -29,7 +29,7 @@ import paddle from paddle import fluid from paddle.fluid import core -from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import _non_static_mode, in_dygraph_mode from paddle.fluid.framework import Variable from paddle.fluid.framework import _get_paddle_place from paddle.fluid.framework import _current_expected_place as _get_device @@ -761,6 +761,15 @@ def eval_batch(self, inputs, labels=None): labels = [to_variable(l) for l in to_list(labels)] outputs = self.model.network.forward(*[to_variable(x) for x in inputs]) + + # Transfrom data to expected device + expected_device = paddle.device.get_device() + for o in to_list(outputs): + o._to(device=expected_device) + + for l in labels: + l._to(device=expected_device) + if self.model._loss: losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) @@ -915,7 +924,7 @@ class Model(object): When training on GPU, auto mixed precision (AMP O1) and pure float16 (AMP O2) training are both supported in static mode and dynamic mode. - In static graph mode, before traing with pure float16 (AMP O2), + In static graph mode, before training with pure float16 (AMP O2), `multi_precision` could be set to True when creating optimizer, which can avoid poor accuracy or slow convergence in a way, and inputs of dtype float should be cast to float16 by users. `paddle.static.amp.fp16_guard` API @@ -2075,7 +2084,7 @@ def _run_one_epoch( # [input1, input2, ..., label1, lable2, ...] # 3. custumed iterator yield concated inputs and labels: # [input1, input2, ..., label1, lable2, ...] - # 4. custumed iterator yield seperated inputs and labels: + # 4. custumed iterator yield separated inputs and labels: # ([input1, input2, ...], [label1, lable2, ...]) # To handle all of these, flatten (nested) list to list. data = flatten(data) @@ -2088,7 +2097,6 @@ def _run_one_epoch( callbacks.on_batch_begin(mode, step, logs) if mode != 'predict': - _inputs = [data[:len(self._inputs)], data[len(self._inputs):]] if mode == 'train': _inputs.append((step + 1) % self._accumulate == 0 or diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index 8d581f38e9b01..c3c043bd3fc2b 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -30,7 +30,7 @@ def summary(net, input_size=None, dtypes=None, input=None): Args: net (Layer): the network which must be a subinstance of Layer. - input_size (tuple|InputSpec|list[tuple|InputSpec]): size of input tensor. if model only + input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. if model only have one input, input_size can be tuple or InputSpec. if model have multiple input, input_size must be a list which contain every input's shape. 
Note that input_size only dim of @@ -301,14 +301,18 @@ def hook(layer, input, output): else: layer_state_dict = layer.state_dict() + summary[m_key]["trainable_params"] = 0 + trainable_flag = False for k, v in layer_state_dict.items(): params += np.prod(v.shape) try: if (getattr(getattr(layer, k), 'trainable')) and ( not getattr(getattr(layer, k), 'stop_gradient')): + summary[m_key]["trainable_params"] += np.prod(v.shape) summary[m_key]["trainable"] = True - else: + trainable_flag = True + elif not trainable_flag: summary[m_key]["trainable"] = False except: summary[m_key]["trainable"] = True @@ -427,7 +431,7 @@ def _get_str_length(summary): if "trainable" in summary[layer]: if summary[layer]["trainable"] == True: - trainable_params += summary[layer]["nb_params"] + trainable_params += summary[layer]["trainable_params"] summary_str += line_new + "\n" def _get_input_size(input_size, size): diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index ff7a167f1a670..c354baf3b43b7 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -32,6 +32,7 @@ import paddle.incubate.autotune from . import nn #noqa: F401 +from . import asp #noqa: F401 __all__ = [ 'LookAhead', diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py b/python/paddle/incubate/asp/__init__.py similarity index 51% rename from python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py rename to python/paddle/incubate/asp/__init__.py index 7a3fa0244930c..59f794ef28aa4 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py +++ b/python/paddle/incubate/asp/__init__.py @@ -13,25 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -import unittest -import paddle -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning1D(TestASPHelperPruningBase): - def test_1D_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_1d', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D) - - def test_1D_training_pruning(self): - self.run_training_pruning_test( - 'mask_1d', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D) - - -if __name__ == '__main__': - unittest.main() +from ...fluid.contrib.sparsity import calculate_density #noqa: F401 +from ...fluid.contrib.sparsity import decorate #noqa: F401 +from ...fluid.contrib.sparsity import prune_model #noqa: F401 +from ...fluid.contrib.sparsity import set_excluded_layers #noqa: F401 +from ...fluid.contrib.sparsity import reset_excluded_layers #noqa: F401 + +__all__ = [ #noqa + 'calculate_density', + 'decorate', + 'prune_model', + 'set_excluded_layers', + 'reset_excluded_layers' +] diff --git a/python/paddle/incubate/autograd/__init__.py b/python/paddle/incubate/autograd/__init__.py index 5528bb4d06c6f..a57dac02be4f5 100644 --- a/python/paddle/incubate/autograd/__init__.py +++ b/python/paddle/incubate/autograd/__init__.py @@ -12,7 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from paddle.autograd.functional import Hessian, Jacobian, jvp, vjp +from .primx import prim2orig +from .utils import enable_prim, disable_prim, prim_enabled __all__ = [ # noqa - 'vjp', 'jvp', 'Jacobian', 'Hessian' + 'vjp', + 'jvp', + 'Jacobian', + 'Hessian', + 'prim2orig', + 'enable_prim', + 'disable_prim', + 'prim_enabled' ] diff --git a/python/paddle/autograd/primops.py b/python/paddle/incubate/autograd/primops.py similarity index 77% rename from python/paddle/autograd/primops.py rename to python/paddle/incubate/autograd/primops.py index 66f641e54467c..11e0e51cb764c 100644 --- a/python/paddle/autograd/primops.py +++ b/python/paddle/incubate/autograd/primops.py @@ -13,8 +13,6 @@ # limitations under the License. import paddle -from paddle.fluid import unique_name, core -from paddle.fluid.framework import default_main_program, default_startup_program from paddle.fluid.layer_helper import LayerHelper from .primreg import REGISTER_FN @@ -136,7 +134,9 @@ def split(x, num_or_sections, axis=0, outs=None): if isinstance(num_or_sections, (list, tuple)): n = len(num_or_sections) else: - assert isinstance(num_or_sections, int) + if not isinstance(num_or_sections, int): + raise TypeError( + f'num_or_sections must be int, but got {type(num_or_sections)}.') n = num_or_sections attrs = {'num_or_sections': num_or_sections, 'axis': axis} @@ -157,7 +157,8 @@ def split(x, num_or_sections, axis=0, outs=None): @REGISTER_FN('concat_p', 'XS', 'Y') def concat(xs, axis=0, out=None): - assert isinstance(xs, (list, tuple)) and len(xs) > 0 + if isinstance(xs, paddle.fluid.framework.Variable): + xs = [xs] attrs = {'axis': axis} helper = LayerHelper('concat_p', **locals()) if out is None: @@ -172,9 +173,10 @@ def concat(xs, axis=0, out=None): @REGISTER_FN('reduce_p', 'X', 'Y') def reduce(x, axis, keepdim=False, out=None): - assert isinstance(axis, (tuple, list)) - assert isinstance(keepdim, bool) - + if not isinstance(axis, (tuple, list)): + raise TypeError(f'axis must be tuple or list, but got {type(axis)}') + if not isinstance(keepdim, bool): + raise TypeError(f'keepdim must be bool, but got {type(keepdim)}') attrs = {'axis': axis, 'keepdim': keepdim} helper = LayerHelper('reduce_p', **locals()) @@ -196,12 +198,20 @@ def matmul(x, y, out=None): @REGISTER_FN('slice_select_p', 'X', 'Y') def slice_select(x, axis, starts, ends, strides, out=None): - assert isinstance(axis, (list, tuple)), ( - f'Argument type error. `axis` is supposed to be int, list or' - f' tuple but found {type(axis)}.') - assert isinstance(starts, (list, tuple)) - assert isinstance(ends, (list, tuple)) - assert len(axis) == len(starts) == len(ends) == len(strides) + if not isinstance(axis, (list, tuple)): + raise TypeError(f'Argument type error. `axis` is supposed to be list or' + f' tuple but found {type(axis)}.') + if not isinstance(starts, (list, tuple)): + raise TypeError( + f'Argument type error. `starts` is supposed to be list or' + f' tuple but found {type(starts)}.') + if not isinstance(ends, (list, tuple)): + raise TypeError(f'Argument type error. 
`ends` is supposed to be list or' + f' tuple but found {type(ends)}.') + assert len(axis) == len(starts) == len(ends) == len(strides), ( + f'len(axis), len(starts), len(ends) and len(strides) should be equal, ' + f'but len(axis)={len(axis)}, len(starts)={len(starts)}, ' + f'len(ends)={len(ends)} and len(strides)={len(strides)}') attrs = {'axis': axis, 'starts': starts, 'ends': ends, 'strides': strides} helper = LayerHelper('slice_select_p', **locals()) @@ -217,8 +227,13 @@ def slice_select(x, axis, starts, ends, strides, out=None): @REGISTER_FN('slice_assign_p', 'X', 'Y', 'Z') def slice_assign(x, y, axis, starts, ends, strides, out=None): - assert len(starts) == len(ends) == len(strides) == len(axis) - assert len(y.shape) == len(x.shape) + assert len(starts) == len(ends) == len(strides) == len(axis), ( + f'len(starts), len(ends), len(strides) and len(axis) should be equal, ' + f'but len(starts)={len(starts)}, len(ends)={len(ends)}, ' + f'len(strides)={len(strides)} and len(axis)={len(axis)}') + assert len(y.shape) == len(x.shape), ( + f'len(y.shape) should be equal to len(x.shape), ' + f'but len(y.shape)={len(y.shape)} and len(x.shape)={len(x.shape)}.') attrs = {'axis': axis, 'starts': starts, 'ends': ends, 'strides': strides} helper = LayerHelper('slice_assign_p', **locals()) @@ -233,7 +248,7 @@ def slice_assign(x, y, axis, starts, ends, strides, out=None): return out -@REGISTER_FN('gather_p', 'X', 'Y') +@REGISTER_FN('gather_p', 'X', 'IndexTensor', 'Y') def gather(x, indextensor, axis, out=None): attrs = {'axis': axis} helper = LayerHelper('gather_p', **locals()) @@ -250,9 +265,16 @@ def gather(x, indextensor, axis, out=None): @REGISTER_FN('scatter_add_p', 'X', 'Y', 'IndexTensor', 'Z') def scatter_add(x, y, indextensor, axis, out=None): - assert len(x.shape) == len(y.shape) - assert len(indextensor.shape) == 1 - assert y.shape[axis] == indextensor.shape[0] + assert len(x.shape) == len(y.shape), ( + f'len(x.shape) should be equal to len(y.shape), ' + f'but len(x.shape)={len(x.shape)} and len(y.shape)={len(y.shape)}.') + assert len( + indextensor.shape + ) == 1, f'len(indextensor.shape) must be equal to 1, but got {len(indextensor.shape)}.' + assert y.shape[axis] == indextensor.shape[0], ( + f'y.shape[axis] should be equal to indextensor.shape[0], ' + f'but y.shape[axis]={y.shape[axis]} and ' + f'indextensor.shape[0]={indextensor.shape[0]}.') attrs = {'axis': axis} helper = LayerHelper('scatter_add_p', **locals()) if out is None: diff --git a/python/paddle/incubate/autograd/primreg.py b/python/paddle/incubate/autograd/primreg.py new file mode 100644 index 0000000000000..35a0dbcfc293f --- /dev/null +++ b/python/paddle/incubate/autograd/primreg.py @@ -0,0 +1,289 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools + + +class Registry(object): + """ A general registry object. 
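+
+    Maps a registered name to a value; the module below builds one such table per
+    kind of rule (primop functions, orig2prim/prim2orig lowerings, JVP rules and
+    transpose rules).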
""" + __slots__ = ['name', 'tab'] + + def __init__(self, name): + self.name = name + self.tab = {} + + def register(self, name, value): + assert name not in self.tab, f'name "{name}" should not be registered before.' + self.tab[name] = value + + def lookup(self, name): + return self.tab.get(name) + + +_primop_fn = Registry('primop_fn') +_orig2prim = Registry('orig2prim') +_prim2orig = Registry('prim2orig') +_primop_jvp = Registry('primop_jvp') +_primop_transpose = Registry('primop_transpose') +_primop_position_argnames = Registry('primop_position_argnames') + + +def lookup_fn(optype): + return _primop_fn.lookup(optype) + + +def lookup_orig2prim(optype): + return _orig2prim.lookup(optype) + + +def lookup_prim2orig(optype): + return _prim2orig.lookup(optype) + + +def lookup_jvp(optype): + return _primop_jvp.lookup(optype) + + +def lookup_transpose(optype): + return _primop_transpose.lookup(optype) + + +def op_position_inputs(op): + """ + Returns the position inputs of `op` as registered with REGISTER_FN. + + Args: + op(Operator): The op that needs to get the inputs + + Returns: + Tensor(s): Inputs of the op + + Examples: + .. code-block:: python + @REGISTER_FN('div_p', 'X', 'Y', 'Z') + def div(x, y, out=None): + return _simple_binop(LayerHelper('div_p', **locals())) + + The registered inputs are ['X', 'Y'] for div_p and accordingly this + function will return inputs in the order of X then Y. + + """ + args = _primop_position_argnames.lookup(op.type) + assert args is not None, 'args should not be None in op_position_inputs().' + *input_names, _ = args + + inputs = [] + for name in input_names: + vars = list(map(op.block.var, op.input(name))) + assert len( + vars + ) >= 0, f'len(vars) should be greater than or equal to 0, but len(vars)={len(vars)}.' + if len(vars) > 1: + inputs.append(vars) + else: + inputs.append(vars[0]) + + return inputs + + +def op_position_output(op): + """ + Returns the output of `op` as registered with REGISTER_FN. + + Args: + op(Operator): The op that needs to get the output + + Returns: + Tensor(s): Output of the op + + Examples: + .. code-block:: python + @REGISTER_FN('div_p', 'X', 'Y', 'Z') + def div(x, y, out=None): + return _simple_binop(LayerHelper('div_p', **locals())) + + The registered output is ['Z'] for div_p and accordingly this + function will return output Z. + + """ + args = _primop_position_argnames.lookup(op.type) + assert args is not None, 'args should not be None in op_position_output().' + *_, output_name = args + + outvars = list(map(op.block.var, op.output(output_name))) + assert len( + outvars + ) >= 0, f'len(outvars) should be greater than or equal to 0, but len(outvars)={len(outvars)}.' + if len(outvars) > 1: + output = outvars + else: + output = outvars[0] + + return output + + +def REGISTER_FN(op_type, *position_argnames): + """ + Decorator for registering the Python function for a primitive op. + + Args: + op_type(str): The op name + position_argnames(list[str]): Input and ouput names of the op + + Returns: + wrapper: Inner wrapper function + + Examples: + .. 
code-block:: python + @REGISTER_FN('tanh_p', 'X', 'Y') + def tanh(x, out=None): + return _simple_unop(LayerHelper('tanh_p', **locals())) + + """ + + if not isinstance(op_type, str): + raise TypeError(f'op_type must be str, but got {type(op_type)}.') + + _primop_position_argnames.register(op_type, position_argnames) + + def wrapper(f): + _primop_fn.register(op_type, f) + return f + + return wrapper + + +def REGISTER_ORIG2PRIM(op_type): + """ + Decorator for registering the lower function for an original op into sequence of primitive ops. + + Args: + op_type(str): The op name + + Returns: + wrapper: Inner wrapper function + + Examples: + .. code-block:: python + @REGISTER_ORIG2PRIM('tanh') + def tanh_orig2prim(op): + x, = get_input_var_list(op) + return primops.tanh(x) + + """ + if not isinstance(op_type, str): + raise TypeError(f'op_type must be str, but got {type(op_type)}.') + + def wrapper(f): + def _lower(op, *args, **kwargs): + assert op.type == op_type, f'op.type should be equal to op_type, but op.type is {op.type} and op_type is {op_type}' + return f(op, *args, **kwargs) + + _orig2prim.register(op_type, _lower) + + return wrapper + + +def REGISTER_PRIM2ORIG(op_type): + """ + Decorator for registering the lower function for an primitive op into sequence of original ops. + + Args: + op_type(str): The op name + + Returns: + wrapper: Inner wrapper function + + Examples: + .. code-block:: python + @REGISTER_PRIM2ORIG('tanh_p') + def tanh_prim2orig(op): + x, = get_input_var_list(op) + return paddle.tanh(x) + + """ + if not isinstance(op_type, str): + raise TypeError(f'op_type must be str, but got {type(op_type)}.') + + def wrapper(f): + def _lower(op, *args, **kwargs): + assert op.type == op_type, f'op.type should be equal to op_type, but op.type is {op.type} and op_type is {op_type}' + return f(op, *args, **kwargs) + + _prim2orig.register(op_type, _lower) + + return wrapper + + +def REGISTER_JVP(op_type): + """ + Decorator for registering the JVP function for a primitive op. + + Args: + op_type(str): The op name + + Returns: + wrapper: Inner wrapper function + + Examples: + .. code-block:: python + @REGISTER_JVP('add_p') + def add_jvp(op, x_dot, y_dot): + return primops.add(x_dot, y_dot) + + """ + if not isinstance(op_type, str): + raise TypeError(f'op_type must be str, but got {type(op_type)}.') + + def wrapper(f): + def _jvp(op, *args, **kwargs): + assert op.type == op_type, f'op.type should be equal to op_type, but op.type is {op.type} and op_type is {op_type}' + return f(op, *args, **kwargs) + + _primop_jvp.register(op_type, _jvp) + return f + + return wrapper + + +def REGISTER_TRANSPOSE(op_type): + """ + Decorator for registering the transpose function for a primitive op + that denotes a linear operation in the forward AD graph. + + Args: + op_type(str): The op name + + Returns: + wrapper: Inner wrapper function + + Examples: + .. 
code-block:: python + @REGISTER_TRANSPOSE('add_p') + def add_transpose(op, z_bar): + return z_bar, z_bar + + """ + if not isinstance(op_type, str): + raise TypeError(f'op_type must be str, but got {type(op_type)}.') + + def wrapper(f): + def _transpose(op, dot_checker, *args, **kwargs): + assert op.type == op_type, f'op.type should be equal to op_type, but op.type is {op.type} and op_type is {op_type}' + return f(op, dot_checker, *args, **kwargs) + + _primop_transpose.register(op_type, _transpose) + return f + + return wrapper diff --git a/python/paddle/incubate/autograd/primrules.py b/python/paddle/incubate/autograd/primrules.py new file mode 100644 index 0000000000000..075fe83e25289 --- /dev/null +++ b/python/paddle/incubate/autograd/primrules.py @@ -0,0 +1,724 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +from .primreg import REGISTER_ORIG2PRIM, REGISTER_PRIM2ORIG, REGISTER_JVP, REGISTER_TRANSPOSE +from .primreg import (lookup_fn, lookup_orig2prim, lookup_prim2orig, lookup_jvp, + lookup_transpose, op_position_inputs, op_position_output) +from .primops import (neg, add, sub, mul, div, sqrt, tanh, reshape, broadcast, + transpose, split, concat, reduce, matmul, slice_select, + slice_assign, gather, scatter_add, fill_const, set_value) +from .utils import get_input_var_list, get_output_var_list, INT_DTYPE_2_STRING + + +def _orig2prim(op, *args): + _lowerrule = lookup_orig2prim(op.type) + return _lowerrule(op, *args) + + +def _prim2orig(op, *args): + _lowerrule = lookup_prim2orig(op.type) + return _lowerrule(op, *args) + + +def _jvp(op, *args): + _jvprule = lookup_jvp(op.type) + return _jvprule(op, *args) + + +def _transpose(op, dot_checker, *args): + _transposerule = lookup_transpose(op.type) + return _transposerule(op, dot_checker, *args) + + +def linear_jvp(op, *args, **kwargs): + fn = lookup_fn(op.type) + out_dot = fn(*args, **kwargs) + return out_dot + + +## Register orig2prim lower rules +""" +These original ops are fully supported: + +elementwise_add +elementwise_sub +elementwise_mul +tanh +fill_zeros_like +sum +index_select +scale +assign +sqrt + +These original ops are partially supported: + +matmul_v2 +reshape2 +concat +slice +p_norm +""" + + +@REGISTER_ORIG2PRIM('elementwise_add') +def elementwise_add_orig2prim(op, x, y): + if x.shape != y.shape: + y = broadcast(y, shape=x.shape) + if op.attr('Scale_x') - 1.0 > 1e-5: + scale_x = fill_const( + shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x')) + x = mul(x, scale_x) + if op.attr('Scale_y') - 1.0 > 1e-5: + scale_y = fill_const( + shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y')) + y = mul(y, scale_y) + z = add(x, y) + if op.attr('Scale_out') - 1.0 > 1e-5: + scale_out = fill_const( + shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out')) + z = mul(z, scale_out) + return z + + +@REGISTER_ORIG2PRIM('elementwise_sub') +def elementwise_sub_orig2prim(op, x, y): + if x.shape != y.shape: + y = broadcast(y, shape=x.shape) + if op.attr('Scale_x') 
- 1.0 > 1e-5: + scale_x = fill_const( + shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x')) + x = mul(x, scale_x) + if op.attr('Scale_y') - 1.0 > 1e-5: + scale_y = fill_const( + shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y')) + y = mul(y, scale_y) + z = sub(x, y) + if op.attr('Scale_out') - 1.0 > 1e-5: + scale_out = fill_const( + shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out')) + z = mul(z, scale_out) + return z + + +@REGISTER_ORIG2PRIM('elementwise_mul') +def elementwise_mul_orig2prim(op, x, y): + if x.shape != y.shape: + y = broadcast(y, shape=x.shape) + if op.attr('Scale_x') - 1.0 > 1e-5: + scale_x = fill_const( + shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x')) + x = mul(x, scale_x) + if op.attr('Scale_y') - 1.0 > 1e-5: + scale_y = fill_const( + shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y')) + y = mul(y, scale_y) + z = mul(x, y) + if op.attr('Scale_out') - 1.0 > 1e-5: + scale_out = fill_const( + shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out')) + z = mul(z, scale_out) + return z + + +@REGISTER_ORIG2PRIM('tanh') +def tanh_orig2prim(op, x): + return tanh(x) + + +@REGISTER_ORIG2PRIM('fill_zeros_like') +def fill_zeros_like_orig2prim(op, x): + return fill_const(value=0.0, shape=x.shape, dtype=x.dtype) + + +@REGISTER_ORIG2PRIM('sum') +def sum_orig2prim(op, xs): + x0 = xs[0] + for x in xs[1:]: + x0 = add(x0, x) + return x0 + + +@REGISTER_ORIG2PRIM('index_select') +def index_select_orig2prim(op, index_t, x): + return gather(x, indextensor=index_t, axis=op.attr('dim')) + + +@REGISTER_ORIG2PRIM('scale') +def scale_orig2prim(op, scale_t, x): + if scale_t is None: + scale_t = fill_const( + shape=x.shape, dtype=x.dtype, value=op.attr('scale')) + bias_t = fill_const(shape=x.shape, dtype=x.dtype, value=op.attr('bias')) + if op.attr('bias_after_scale'): + return add(mul(x, scale_t), bias_t) + else: + return mul(add(x, bias_t), scale_t) + + +@REGISTER_ORIG2PRIM('assign') +def assign_orig2prim(op, x): + zero_t = fill_const(shape=x.shape, dtype=x.dtype, value=0.0) + return add(x, zero_t) + + +@REGISTER_ORIG2PRIM('sqrt') +def sqrt_orig2prim(op, x): + return sqrt(x) + + +@REGISTER_ORIG2PRIM('matmul_v2') +def matmul_v2_orig2prim(op, x, y): + def trans(shape): + ret = [i for i in range(len(shape))] + ret[-1], ret[-2] = ret[-2], ret[-1] + return ret + + assert len(x.shape) < 4 and len( + y.shape) < 4, 'Do not support multi batchsize dimensions currently.' + + if len(x.shape) == 1: + x = broadcast(x, shape=[1, x.shape[0]]) + if len(y.shape) == 1: + y = broadcast(y, shape=[y.shape[0], 1]) + if op.attr('trans_x'): + x = transpose(x, axis=trans(x.shape)) + if op.attr('trans_y'): + y = transpose(y, axis=trans(y.shape)) + return matmul(x, y) + + +## NOTE(lml): The second output of reshape2 Xshape, which is only used in reshape2_grad, is meanlingless in new autograd mechanism, thus we use a zero tensor instead. +@REGISTER_ORIG2PRIM('reshape2') +def reshape2_orig2prim(op, shape_t, shape_tl, x): + assert shape_t is None, 'Can not lower reshape2 into prim ops with shapetensor.' + assert shape_tl is None, 'Can not lower reshape2 into prim ops with shapetensorlist.' + y, xshape = get_output_var_list(op) + return reshape( + x, shape=y.shape), fill_const( + shape=xshape.shape, dtype=xshape.dtype, value=0.0) + + +@REGISTER_ORIG2PRIM('concat') +def concat_orig2prim(op, axis_t, xs): + assert axis_t is None, 'Can not lower concat into prim ops with axistensor.' 
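+    # The concat axis is read from the op attribute; lowering with a runtime
+    # AxisTensor input is rejected by the assert above.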
+ return concat(xs, axis=op.attr('axis')) + + +@REGISTER_ORIG2PRIM('slice') +def slice_orig2prim(op, ends_t, ends_tl, x, starts_t, starts_tl): + assert starts_t is None, 'Can not lower concat into prim ops with startstensor.' + assert ends_t is None, 'Can not lower concat into prim ops with endstensor.' + assert starts_tl is None, 'Can not lower concat into prim ops with startstensorlist.' + assert ends_tl is None, 'Can not lower concat into prim ops with endstensorlist.' + starts = op.attr('starts') + ends = op.attr('ends') + strides = [1 for _ in starts] + axis = op.attr('axes') + y = slice_select(x, starts=starts, ends=ends, strides=strides, axis=axis) + if op.attr('decrease_axis'): + y = reshape(y, shape=get_output_var_list(op)[0].shape) + return y + + +@REGISTER_ORIG2PRIM('p_norm') +def p_norm_orig2prim(op, x): + def num_el(shape): + n = 1 + for s in shape: + n = n * s + return n + + assert op.attr( + 'asvector'), 'Only support lower pnorm when asvector=True currently' + if len(x.shape) > 1: + x = reshape(x, shape=[num_el(x.shape)]) + + if abs(op.attr('porder') - 2.0) < 1e-5: + return sqrt(reduce(mul(x, x), axis=[0])) + elif abs(op.attr('porder') - 1.0) < 1e-5: + return reduce(sqrt(mul(x, x)), axis=[0]) + else: + raise RuntimeError('Only support lower l2/l1 norm currently') + + +## Register prim2orig lower rules + + +@REGISTER_PRIM2ORIG('add_p') +def add_prim2orig(op, x, y): + return paddle.add(x, y) + + +@REGISTER_PRIM2ORIG('sub_p') +def sub_prim2orig(op, x, y): + return paddle.subtract(x, y) + + +@REGISTER_PRIM2ORIG('mul_p') +def mul_prim2orig(op, x, y): + return paddle.multiply(x, y) + + +@REGISTER_PRIM2ORIG('div_p') +def div_prim2orig(op, x, y): + return paddle.divide(x, y) + + +@REGISTER_PRIM2ORIG('sqrt_p') +def sqrt_prim2orig(op, x): + return paddle.sqrt(x) + + +@REGISTER_PRIM2ORIG('tanh_p') +def tanh_prim2orig(op, x): + return paddle.tanh(x) + + +@REGISTER_PRIM2ORIG('reshape_p') +def reshape_prim2orig(op, x): + return paddle.reshape(x, shape=op.attr('shape')) + + +@REGISTER_PRIM2ORIG('broadcast_p') +def broadcast_prim2orig(op, x): + return paddle.broadcast_to(x, shape=op.attr('shape')) + + +@REGISTER_PRIM2ORIG('transpose_p') +def transpose_prim2orig(op, x): + return paddle.transpose(x, perm=op.attr('axis')) + + +@REGISTER_PRIM2ORIG('split_p') +def split_prim2orig(op, x): + num_or_sections = op.attr('num_or_sections') + if len(num_or_sections) == 1: + num_or_sections = num_or_sections[0] + return paddle.split( + x, num_or_sections=num_or_sections, axis=op.attr('axis')) + + +@REGISTER_PRIM2ORIG('concat_p') +def concat_prim2orig(op, xs): + return paddle.concat(xs, axis=op.attr('axis')) + + +@REGISTER_PRIM2ORIG('reduce_p') +def reduce_prim2orig(op, x): + return paddle.sum(x, axis=op.attr('axis'), keepdim=op.attr('keepdim')) + + +@REGISTER_PRIM2ORIG('matmul_p') +def matmul_prim2orig(op, x, y): + return paddle.matmul(x, y) + + +@REGISTER_PRIM2ORIG('slice_select_p') +def slice_select_prim2orig(op, x): + return paddle.strided_slice( + x, + axes=op.attr('axis'), + starts=op.attr('starts'), + ends=op.attr('ends'), + strides=op.attr('strides')) + + +@REGISTER_PRIM2ORIG('slice_assign_p') +def slice_assign_prim2orig(op, x, y): + x_copy = paddle.assign(x) + return set_value( + x_copy, + y, + axis=op.attr('axis'), + starts=op.attr('starts'), + ends=op.attr('ends'), + strides=op.attr('strides'), + out=x_copy) + + +@REGISTER_PRIM2ORIG('gather_p') +def gather_prim2orig(op, index_t, x): + return paddle.gather(x, index_t, axis=op.attr('axis')) + + +@REGISTER_PRIM2ORIG('scatter_add_p') +def 
scatter_add_prim2orig(op, index_t, x, y): + assert op.attr('axis') == 0, 'Only support axis==0 currently' + zeros = paddle.zeros_like(x=x, dtype=x.dtype) + tmp = paddle.scatter(x=zeros, index=index_t, updates=y, overwrite=False) + return paddle.add(x, tmp) + + +@REGISTER_PRIM2ORIG('fill_constant_p') +def fill_constant_prim2orig(op): + return paddle.full( + shape=op.attr('shape'), + fill_value=op.attr('value'), + dtype=INT_DTYPE_2_STRING[op.attr('dtype')]) + + +## Register linearize rules +@REGISTER_JVP('add_p') +def add_jvp(op, x_dot, y_dot): + if x_dot is None: + return y_dot + elif y_dot is None: + return x_dot + else: + return linear_jvp(op, x_dot, y_dot) + + +@REGISTER_JVP('sub_p') +def sub_jvp(op, x_dot, y_dot): + if x_dot is None: + return neg(y_dot) + elif y_dot is None: + return x_dot + else: + return linear_jvp(op, x_dot, y_dot) + + +@REGISTER_JVP('mul_p') +def mul_jvp(op, x_dot, y_dot): + if x_dot is None and y_dot is None: + return None + x, y = op_position_inputs(op) + if x_dot is None: + return mul(x, y_dot) + elif y_dot is None: + return mul(x_dot, y) + else: + t1, t2 = mul(x_dot, y), mul(x, y_dot) + z_dot = add(t1, t2) + return z_dot + + +@REGISTER_JVP('div_p') +def div_jvp(op, x_dot, y_dot): + if x_dot is None and y_dot is None: + return None + x, y = op_position_inputs(op) + if y_dot is None: + return div(x_dot, y) + elif x_dot is None: + return neg(div(mul(x, y_dot), mul(y, y))) + else: + t1 = div(x_dot, y) + t2 = div(mul(x, y_dot), mul(y, y)) + return sub(t1, t2) + + +@REGISTER_JVP('sqrt_p') +def sqrt_jvp(op, x_dot): + if x_dot is None: + return None + y = op_position_output(op) + c2 = fill_const(value=2.0, shape=y.shape, dtype=y.dtype) + y_dot = div(x_dot, mul(c2, y)) + return y_dot + + +@REGISTER_JVP('tanh_p') +def tanh_jvp(op, x_dot): + if x_dot is None: + return None + y = op_position_output(op) + c1 = fill_const(value=1.0, shape=y.shape, dtype=y.dtype) + y_dot = mul(x_dot, sub(c1, mul(y, y))) + return y_dot + + +@REGISTER_JVP('reshape_p') +def reshape_jvp(op, x_dot): + if x_dot is None: + return None + shape = op.attr('shape') + return linear_jvp(op, x_dot, shape=shape) + + +@REGISTER_JVP('broadcast_p') +def broadcast_jvp(op, x_dot): + if x_dot is None: + return None + shape = op.attr('shape') + return linear_jvp(op, x_dot, shape=shape) + + +@REGISTER_JVP('transpose_p') +def transpose_jvp(op, x_dot): + if x_dot is None: + return None + axis = op.attr('axis') + return linear_jvp(op, x_dot, axis=axis) + + +@REGISTER_JVP('split_p') +def split_jvp(op, x_dot): + if x_dot is None: + return None + num_or_sections = op.attr('num_or_sections') + axis = op.attr('axis') + return linear_jvp(op, x_dot, num_or_sections=num_or_sections, axis=axis) + + +@REGISTER_JVP('concat_p') +def concat_jvp(op, xs_dot): + if xs_dot is None: + return None + axis = op.attr('axis') + return linear_jvp(op, xs_dot, axis=axis) + + +@REGISTER_JVP('reduce_p') +def reduce_jvp(op, x_dot): + if x_dot is None: + return None + axis = op.attr('axis') + keepdim = op.attr('keepdim') + return linear_jvp(op, x_dot, axis=axis, keepdim=keepdim) + + +@REGISTER_JVP('matmul_p') +def matmul_jvp(op, x_dot, y_dot): + if x_dot is None and y_dot is None: + return None + x, y = op_position_inputs(op) + if x_dot is None: + return matmul(x, y_dot) + elif y_dot is None: + return matmul(x_dot, y) + else: + t1 = matmul(x, y_dot) + t2 = matmul(x_dot, y) + return add(t1, t2) + + +@REGISTER_JVP('slice_select_p') +def slice_select_jvp(op, x_dot): + if x_dot is None: + return x_dot + axis = op.attr('axis') + starts = 
op.attr('starts') + ends = op.attr('ends') + strides = op.attr('strides') + return linear_jvp( + op, x_dot, axis=axis, starts=starts, ends=ends, strides=strides) + + +@REGISTER_JVP('slice_assign_p') +def slice_assign_jvp(op, x_dot, y_dot): + if x_dot is None: + assert y_dot is None, 'y_dot must be None.' + return None + else: + assert y_dot is not None, 'y_dot should not be None.' + axis = op.attr('axis') + starts = op.attr('starts') + ends = op.attr('ends') + strides = op.attr('strides') + return linear_jvp( + op, x_dot, y_dot, axis=axis, starts=starts, ends=ends, strides=strides) + + +@REGISTER_JVP('gather_p') +def gather_jvp(op, x_dot, indextensor): + if x_dot is None: + return None + _, indextensor = op_position_inputs(op) + axis = op.attr('axis') + return linear_jvp(op, x_dot, indextensor, axis=axis) + + +@REGISTER_JVP('scatter_add_p') +def scatter_add_jvp(op, x_dot, y_dot): + if x_dot is None: + return None + _, _, indextensor = op_position_inputs(op) + axis = op.attr('axis') + return linear_jvp(op, x_dot, y_dot, indextensor, axis=axis) + + +## Register transpose rules + + +@REGISTER_TRANSPOSE('add_p') +def add_transpose(op, check_dot, z_bar): + x, y = op_position_inputs(op) + assert check_dot(x) or check_dot(y), ( + f'(check_dot(x) or check_dot(y)) must be True, ' + f'but check_dot(x)={check_dot(x)} and check_dot(y)={check_dot(y)}.') + x_bar = z_bar if check_dot(x) else None + y_bar = z_bar if check_dot(y) else None + return x_bar, y_bar + + +@REGISTER_TRANSPOSE('sub_p') +def sub_transpose(op, check_dot, z_bar): + x, y = op_position_inputs(op) + assert check_dot(x) or check_dot(y), ( + f'(check_dot(x) or check_dot(y)) must be True, ' + f'but check_dot(x)={check_dot(x)} and check_dot(y)={check_dot(y)}.') + x_bar = z_bar if check_dot(x) else None + y_bar = neg(z_bar) if check_dot(y) else None + return x_bar, y_bar + + +@REGISTER_TRANSPOSE('mul_p') +def mul_transpose(op, check_dot, z_bar): + x, y = op_position_inputs(op) + assert check_dot(x) ^ check_dot(y), ( + f'(check_dot(x) ^ check_dot(y)) must be True, ' + f'but check_dot(x)={check_dot(x)} and check_dot(y)={check_dot(y)}.') + if check_dot(x): + return mul(z_bar, y), None + else: + return None, mul(x, z_bar) + + +@REGISTER_TRANSPOSE('div_p') +def div_transpose(op, check_dot, z_bar): + x, y = op_position_inputs(op) + assert not check_dot(y), 'check_dot(y) must be False' + x_bar = div(z_bar, y) if check_dot(x) else None + return x_bar, None + + +@REGISTER_TRANSPOSE('reshape_p') +def reshape_transpose(op, check_dot, y_bar): + x, = op_position_inputs(op) + assert check_dot(x), 'check_dot(x) must be True' + return reshape(y_bar, shape=x.shape) + + +@REGISTER_TRANSPOSE('broadcast_p') +def broadcast_transpose(op, check_dot, y_bar): + x, = op_position_inputs(op) + assert check_dot(x), 'check_dot(x) must be True' + bat = len(y_bar.shape) - len(x.shape) + axis = list(range(bat)) + keepdim = [(bat + i) for i, s in enumerate(x.shape) if s == 1] + axis += keepdim + # TODO: Change it. 
keepdim boolean + out = reduce(y_bar, axis=axis, keepdim=False) + return reshape(out, x.shape) + + +@REGISTER_TRANSPOSE('transpose_p') +def transpose_transpose(op, check_dot, y_bar): + x, = op_position_inputs(op) + assert check_dot(x), 'check_dot(x) must be True' + axis = op.attr('axis') + reordered = sorted((k, i) for i, k in enumerate(axis)) + axis = [i for k, i in reordered] + return transpose(y_bar, axis=axis) + + +@REGISTER_TRANSPOSE('split_p') +def split_transpose(op, check_dot, ys_bar): + x, = op_position_inputs(op) + assert check_dot(x), 'check_dot(x) must be True' + return concat(ys_bar, axis=op.attr('axis')) + + +@REGISTER_TRANSPOSE('concat_p') +def concat_transpose(op, check_dot, y_bar): + xs, = op_position_inputs(op) + for x in xs: + assert check_dot(x), 'check_dot(x) must be True' + axis = op.attr('axis') + sections = [x.shape[axis] for x in xs] + return split(y_bar, num_or_sections=sections, axis=axis) + + +@REGISTER_TRANSPOSE('reduce_p') +def reduce_transpose(op, check_dot, y_bar): + x, = op_position_inputs(op) + assert check_dot(x), 'check_dot(x) must be True' + axes = op.attr('axis') + shape = tuple(1 if i in axes else size for i, size in enumerate(x.shape)) + t = reshape(y_bar, shape=shape) + return broadcast(t, shape=x.shape) + + +@REGISTER_TRANSPOSE('matmul_p') +def matmul_transpose(op, check_dot, z_bar): + x, y = op_position_inputs(op) + assert check_dot(x) ^ check_dot(y), ( + f'(check_dot(x) ^ check_dot(y)) must be True, ' + f'but check_dot(x)={check_dot(x)} and check_dot(y)={check_dot(y)}.') + # TODO: replace it. this is hacky + axis = [1, 0] if len(x.shape) == 2 else [0, 2, 1] + if check_dot(x): + return matmul(z_bar, transpose(y, axis=axis)), None + else: + return None, matmul(transpose(x, axis=axis), z_bar) + + +@REGISTER_TRANSPOSE('slice_select_p') +def slice_select_transpose(op, check_dot, y_bar): + x, = op_position_inputs(op) + assert check_dot(x), 'check_dot(x) must be True' + zeros = fill_const(value=0.0, shape=x.shape, dtype=x.dtype) + axis = op.attr('axis') + starts = op.attr('starts') + ends = op.attr('ends') + strides = op.attr('strides') + return slice_assign( + zeros, y_bar, axis=axis, starts=starts, ends=ends, strides=strides) + + +@REGISTER_TRANSPOSE('slice_assign_p') +def slice_assign_transpose(op, check_dot, z_bar): + x, y = op_position_inputs(op) + assert check_dot(x) and check_dot(y), ( + f'(check_dot(x) and check_dot(y)) must be True, ' + f'but check_dot(x)={check_dot(x)} and check_dot(y)={check_dot(y)}.') + zeros = fill_const(value=0.0, shape=y.shape, dtype=y.dtype) + axis = op.attr('axis') + starts = op.attr('starts') + ends = op.attr('ends') + strides = op.attr('strides') + x_bar = slice_assign( + z_bar, zeros, axis=axis, starts=starts, ends=ends, strides=strides) + y_bar = slice_select( + z_bar, axis=axis, starts=starts, ends=ends, strides=strides) + return x_bar, y_bar + + +@REGISTER_TRANSPOSE('gather_p') +def gather_transpose(op, check_dot, y_bar): + x, indextensor = op_position_inputs(op) + assert check_dot(x), 'check_dot(x) must be True' + axis = op.attr('axis') + zeros = fill_const(0.0, x.shape, x.dtype) + x_bar = scatter_add(zeros, y_bar, indextensor, axis=axis) + indextensor_bar = None + return x_bar, indextensor_bar + + +@REGISTER_TRANSPOSE('scatter_add_p') +def scatter_add_transpose(op, check_dot, z_bar): + x, y, indextensor = op_position_inputs(op) + assert check_dot(x) and check_dot(y), ( + f'(check_dot(x) and check_dot(y)) must be True, ' + f'but check_dot(x)={check_dot(x)} and check_dot(y)={check_dot(y)}.') + axis = 
op.attr('axis') + zeros = fill_const(value=0.0, shape=y.shape, dtype=y.dtype) + x_bar = scatter_add(z_bar, zeros, indextensor, axis=axis) + y_bar = gather(z_bar, indextensor, axis=axis) + indextensor_bar = None + return x_bar, y_bar, indextensor_bar diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py new file mode 100644 index 0000000000000..7a969748208a4 --- /dev/null +++ b/python/paddle/incubate/autograd/primx.py @@ -0,0 +1,611 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.fluid import framework as framework +from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import Operator +from paddle import compat as cpt +from .primops import fill_const, add +from .primreg import op_position_inputs, op_position_output, lookup_orig2prim, lookup_prim2orig +from .primrules import _orig2prim, _prim2orig, _jvp, _transpose +from .utils import get_input_var_list, get_output_var_list, to_tensors, flatten, flatten_and_remove_none +from collections import OrderedDict + + +def topo_path(xs, ys, block=None): + """ Returns the list of ops on the path from `xs` to `ys` in topological + order. + + TODO(Tongxin): supporting control flow and nested blocks. + Args: + xs: a list|tuple of vars as source + ys: a list|tuple of vars as sink + block: the program block containing the path, optional + Returns: + (path, unused_xs, unreached_ys): a tuple comprised of the resulting op + path, the unused variables in `xs`, and the unreached variables in `ys` + """ + + if block is None: + block = default_main_program().current_block() + + path = [] + backpath = [] + reached_vars = OrderedDict() + used_vars = OrderedDict() + + # Initialize reached vars + for x in xs: + assert x is None or x.block == block, f'x is not None and x.block != block' + reached_vars[id(x)] = x + + # Reaching test, returning whether an op is reached from the given input + reaching = lambda op: any(id(v) in reached_vars for v in flatten_and_remove_none(get_input_var_list(op))) + + # block.ops are supposedly in the order that preserves correct data + # dependence. 
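+    # The search runs in two sweeps: the forward sweep below collects every op
+    # reachable from `xs`, and the backward sweep that follows keeps only the
+    # ops whose outputs actually contribute to `ys`.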
+ # Forward pass to identify all reached variables and ops + for op in block.ops: + if reaching(op): + path.append(op) + for var in flatten_and_remove_none(get_output_var_list(op)): + reached_vars[id(var)] = var + + used_vars = OrderedDict((id(y), y) for y in ys if id(y) in reached_vars) + back_reaching = lambda op: any(id(out) in used_vars for out in flatten_and_remove_none(get_output_var_list(op))) + + # Backward pass to find all used variables + for op in reversed(path): + if back_reaching(op): + backpath.append(op) + for var in flatten_and_remove_none(get_input_var_list(op)): + used_vars[id(var)] = var + + unused_xs = [x for x in xs if id(x) not in used_vars] + unreached_ys = [y for y in ys if id(y) not in reached_vars] + + return list(reversed(backpath)), unused_xs, unreached_ys + + +def output_vars_on_path(path): + """ Returns the output variables of all the ops on the path from `xs` + to `ys`. + + Args: + path: a list of ops on which to find the output variables + + Returns: + vars: the output vars + """ + vars = OrderedDict() + for op in path: + for out in flatten_and_remove_none(get_output_var_list(op)): + vars[id(out)] = out + + return vars + + +class VarMap(object): + """ A general map data structure for linking variables to variables. + + An example is linking variables to their gradients. + """ + + __slots__ = ['name', 'varset', 'tab'] + + def __init__(self, name, varset): + self.name = name + self.varset = varset + self.tab = OrderedDict() + + def add(self, key_var, value_var): + self.tab[id(key_var)] = id(value_var) + + def add_rec(self, key_vars, value_vars): + if value_vars is None: + return + if isinstance(key_vars, paddle.fluid.framework.Variable): + if not isinstance(value_vars, paddle.fluid.framework.Variable): + raise TypeError( + f'value_vars must be Variable, but got {type(value_vars)}') + self.tab[id(key_vars)] = id(value_vars) + else: + assert len(key_vars) == len(value_vars), ( + f'len(key_vars) shoule be equal to len(value_vars), ' + f'but len(key_vars)={len(key_vars)} and len(value_vars)={len(value_vars)}.' + ) + for key_var, value_var in zip(key_vars, value_vars): + self.add_rec(key_var, value_var) + + def lookup(self, key_var): + value_id = self.tab.get(id(key_var)) + if value_id is not None: + return self.varset.get(value_id) + else: + return None + + def delete(self, key_var): + varid = id(key_var) + if varid in self.tab: + del self.tab[id(key_var)] + + def delete_keyvars(self, key_vars): + for var in key_vars: + varid = id(var) + if varid in self.tab: + del self.tab[varid] + + def delete_valuevars(self, value_vars): + ids = [id(v) for v in value_vars] + keys = [k for k, v in self.tab.items() if v in ids] + for k in keys: + del self.tab[k] + + def contain_var(self, key_var): + return self.tab.__contains__(id(key_var)) + + def contain_value(self, value_var): + return id(value_var) in self.tab.values() + + +class Transform(object): + """ An object that maintains the state of transformations applied to a + primitve program. 
""" + + def __init__(self, block): + self.block = block + self.vars = self.init_vars(block) + self.var2dot = VarMap('var2dot', self.vars) + self.dot2bar = VarMap('dot2var', self.vars) + + def init_vars(self, block): + vars = OrderedDict() + for _, var in block.vars.items(): + vars[id(var)] = var + return vars + + def add_vars(self, new_vars): + self.vars.update({id(v): v for v in new_vars if v is not None}) + + def add_vars_rec(self, new_vars): + if new_vars is None: + return + if isinstance(new_vars, paddle.fluid.framework.Variable): + self.vars.update({id(new_vars): new_vars}) + return + if not isinstance(new_vars, list): + raise TypeError(f'new_vars must be list, but got {type(new_vars)}') + for var in new_vars: + self.add_vars_rec(var) + + def erase_ops(self, ordered_indexes): + block = self.block + for op_index in reversed(ordered_indexes): + block.desc._remove_op(op_index, op_index + 1) + + # remove from block.ops + for op_index in reversed(ordered_indexes): + del block.ops[op_index] + + block._sync_with_cpp() + + def erase_dots(self, vars_to_erase): + for var in vars_to_erase: + if id(var) in self.vars: + del self.vars[id(var)] + self.dot2bar.delete_keyvars(vars_to_erase) + self.var2dot.delete_valuevars(vars_to_erase) + block = self.block + for var in vars_to_erase: + name = var.name + block.desc._remove_var(cpt.to_bytes(name)) + del block.vars[name] + block._sync_with_cpp() + + def var2dot_rec(self, vars): + """ Lookup var2dot recursively.""" + if isinstance(vars, paddle.fluid.framework.Variable): + dot = self.var2dot.lookup(vars) + return dot + + dots = [self.var2dot_rec(var) for var in vars] + return dots + + def dot2bar_rec(self, dots): + + if isinstance(dots, paddle.fluid.framework.Variable): + bar = self.dot2bar.lookup(dots) + assert bar is not None, 'bar must be not None' + return bar + + bars = [self.dot2bar_rec(dot) for dot in dots] + return bars + + def linearize(self, xs, ys, xs_dot=None): + """ Performs the linearization transform, a.k.a, forward mode AD + transform, on a primitive lowered program. + + Args: + xs: a list of input variables + ys: a list of output variables + xs_dot: optional, a list of gradient input variables. The list size + must be equal to `len(xs)`. The shape and dtype of each element + must be the same as in `xs` + + Returns: + (xs_dot, ys_dot): a tuple of two lists. `xs_dot` is the list of + gradient inputs of the resulting linearized program. `ys_dot` is + the list gradient outputs of the resulting linearized program + + """ + if xs_dot is None: + xs_dot = [fill_const(1.0, shape=x.shape, dtype=x.dtype) for x in xs] + self.add_vars(xs_dot) + else: + assert len(xs) == len(xs_dot), ( + f'len(xs) should be equal to len(xs_dot), ' + f'but len(xs)={len(xs)} and len(xs_dot)={len(xs_dot)}') + + for x, dot in zip(xs, xs_dot): + assert x.dtype == dot.dtype, ( + f'x.dtype should be equal to dot.dtype, ' + f'but x.dtype={x.dtype} and dot.dtype={dot.dtype}') + assert x.shape == dot.shape, ( + f'x.shape should be equal to dot.shape, ' + f'but x.shape={x.shape} and dot.shape={dot.shape}') + self.var2dot.add(x, dot) + + path, unused_xs, _ = topo_path(xs, ys, self.block) + + # No need to track unused inputs + for x in unused_xs: + self.var2dot.delete(x) + + for op in path: + # An input var may not be on the input-output path, which implies + # there may be None's in `ins_dot`. In this case we place + # the original input in the position of the otherwise forward + # gradient. 
+ ins = op_position_inputs(op) + jvp_ins = self.var2dot_rec(ins) + # apply op's forward ad rule + outs_dot = _jvp(op, *jvp_ins) + self.add_vars_rec(outs_dot) + outs = op_position_output(op) + self.var2dot.add_rec(outs, outs_dot) + + ys_dot = [self.var2dot.lookup(y) for y in ys] + return xs_dot, ys_dot + + def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): + """ Performs the transpose transform, a.k.a, reverse mode AD + transform, on a linearized primitive program. + + Note, `transpose` is supposed to be used in couple with `linearize`. + + Args: + ys_dot: a list of outputs of the linearized program. + xs_dot: a list of inputs of the linearized program. + ys_bar: optional, a list of inputs of the resulting transposed + program. The list size must be equal to `len(ys_dot)`. The shape + and dtype of each element must be the same as in `ys_dot` + + Returns: + (ys_bar, xs_bar): a tuple of two lists. `ys_bar` is the list of + inputs of the resulting transposed program. `xs_bar` is + the list outputs of the resulting transposed program + + """ + assert all(v is not None for v in xs_dot), f'`xs_dot` includes None.' + assert all(v is not None for v in ys_dot), f'`ys_dot` includes None.' + + if ys_bar is None: + ys_bar = [] + for y in ys_dot: + ys_bar.append(fill_const(1.0, shape=y.shape, dtype=y.dtype)) + self.add_vars(ys_bar) + else: + assert len(ys_dot) == len(ys_bar), ( + f'len(ys_dot) should be equal to len(ys_bar), ' + f'but len(ys_dot)={len(ys_dot)} and len(ys_bar)={len(ys_bar)}') + for y_dot, y_bar in zip(ys_dot, ys_bar): + assert y_dot.shape == y_bar.shape, ( + f'y_dot.shape should be equal to y_bar.shape, ' + f'but y_dot.shape={y_dot.shape} and y_bar.shape={y_bar.shape}' + ) + assert y_dot.dtype == y_bar.dtype, ( + f'y_dot.dtype should be equal to y_bar.dtype, ' + f'but y_dot.dtype={y_dot.dtype} and y_bar.dtype={y_bar.dtype}' + ) + + for dot, bar in zip(ys_dot, ys_bar): + self.dot2bar.add(dot, bar) + + # find all the relevant forward gradients + path, unused_xs_dot, _ = topo_path(xs_dot, ys_dot, self.block) + + # No need to track unused inputs + for dot in unused_xs_dot: + self.dot2bar.delete(dot) + + dotvars = output_vars_on_path(path) + dotvars.update((id(var), var) for var in xs_dot) + + is_dot = lambda v: id(v) in dotvars + + for op in reversed(path): + out = op_position_output(op) + out_bar_rec = self.dot2bar_rec(out) + ins_bar_rec = _transpose(op, is_dot, out_bar_rec) + + # TODO(Tongxin): this is hacky. Tuple implies the Transpose rule + # returns multiple entities. There should be better ways to handle + # outputs. 
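+            # Normalize the rule's return value to a list, then flatten it so
+            # each input of the op can be paired with its gradient (`bar`)
+            # in the aggregation below.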
+ if isinstance(ins_bar_rec, tuple): + ins_bar_rec = list(ins_bar_rec) + else: + ins_bar_rec = [ins_bar_rec] + self.add_vars_rec(ins_bar_rec) + + ins_bar = flatten(ins_bar_rec) + ins = flatten(op_position_inputs(op)) + assert len(ins) == len(ins_bar), ( + f'len(ins) should be equal to len(ins_bar), ' + f'but len(ins)={len(ins)} and len(ins_bar)={len(ins_bar)}') + + for dot, bar in zip(ins, ins_bar): + if bar is not None: + # aggregate gradient + grad = self.dot2bar.lookup(dot) + if grad is None: + self.dot2bar.add(dot, bar) + else: + grad = add(grad, bar) + self.add_vars([grad]) + self.dot2bar.add(dot, grad) + + xs_bar = [self.dot2bar.lookup(x) for x in xs_dot] + + if not retain_fwd and len(path) > 0: + vars_to_remove = set() + for op in path: + vars_to_remove.update( + flatten_and_remove_none(get_output_var_list(op))) + + op_indexes = [] + + block = self.block + for i, op in enumerate(block.ops): + if op in path: + op_indexes.append(i) + path.pop(0) + if len(path) == 0: + break + + self.erase_ops(op_indexes) + self.erase_dots(vars_to_remove) + + return ys_bar, xs_bar + + +def _lower(block, reverse): + # Some functions which are only used in _lower. + def bind(args, to_bind, value_table): + for i in range(len(args)): + if isinstance(args[i], list): + bind(args[i], to_bind, value_table) + elif args[i] is not None and args[i].name in to_bind: + args[i] = value_table[to_bind[args[i].name]] + + def bind_name(names, to_bind): + return_list = [] + for name in names: + if isinstance(name, list): + return_list.append(bind_name(name, to_bind)) + else: + return_list.append(to_bind[name] if name in to_bind else name) + return return_list + + def expand_nested_list(xs): + return_list = [] + for x in xs: + if isinstance(x, list): + return_list = return_list + expand_nested_list(x) + else: + return_list.append(x) + return return_list + + # Step1: Do some preparatory work for lower + lower_fn = _prim2orig if reverse else _orig2prim + lookup_fn = lookup_prim2orig if reverse else lookup_orig2prim + if block is None: + program = default_main_program() + assert program.num_blocks == 1, "The lower transform is designed to process only one block." + block = program.current_block() + + value_table = {} + to_bind = {} + to_bind_rev = {} + for var in block.desc.all_vars(): + value_table[var.name()] = block.var(var.name()) + + ops_to_remove = [] + vars_to_remove = set() + + # Step2: Process all ops in the target block + for op_idx in range(len(block.ops)): + op = block.ops[op_idx] + ops_to_remove.append(op_idx) + if lookup_fn(op.type) is not None: + input_args = get_input_var_list(op) + bind(input_args, to_bind, value_table) + + for orig_out, new_out in zip( + expand_nested_list(get_output_var_list(op)), + expand_nested_list(to_tensors(lower_fn(op, *input_args)))): + assert not (orig_out is None) ^ ( + new_out is None), "orig_out and new_out should match." 
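+                # Record the pairing between the original op's output and the
+                # lowered op's output, so later ops can be re-bound to the new
+                # variables and the final renaming pass can restore the
+                # original variable names.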
+ vars_to_remove.add(new_out.name) + value_table[new_out.name] = new_out + to_bind[orig_out.name] = new_out.name + to_bind_rev[new_out.name] = orig_out.name + else: + inputs = {} + for i in range(len(op.input_names)): + inputs[op.input_names[i]] = bind_name( + op.input(op.input_names[i]), to_bind) + + outputs = {} + for i in range(len(op.output_names)): + outputs[op.output_names[i]] = op.output(op.output_names[i]) + + attrs = {} + for name in sorted(op.attr_names): + attrs[name] = op.attr(name) + from paddle.fluid.dygraph.base import param_guard + new_op_desc = block.desc.append_op() + with param_guard(inputs), param_guard(outputs): + op = Operator( + block=block, + desc=new_op_desc, + type=op.type, + inputs=inputs, + outputs=outputs, + attrs=attrs) + block.ops.append(op) + + # Step3: Do some post-processing work + for op_idx in reversed(ops_to_remove): + block.desc._remove_op(op_idx, op_idx + 1) + del block.ops[op_idx] + block._sync_with_cpp() + + for op_idx in range(len(block.ops)): + op = block.ops[op_idx] + for in_name in op.input_arg_names: + if in_name in to_bind_rev: + op._rename_input(in_name, to_bind_rev[in_name]) + + for out_name in op.output_arg_names: + if out_name in to_bind_rev: + op._rename_output(out_name, to_bind_rev[out_name]) + + for var_name in sorted(vars_to_remove): + assert var_name in to_bind_rev, 'var_name "{}" is not in to_bind_rev.'.format( + var_name) + if var_name != to_bind_rev[var_name]: + block.desc._remove_var(cpt.to_bytes(var_name)) + del block.vars[var_name] + block._sync_with_cpp() + + +@framework.static_only +def orig2prim(block=None): + """ + .. note:: + **This API is ONLY available in the static mode.** + + All operators in the target block are processed as follows. + If it is an original operator, it will be transformed into + one or a series of automatic differential basic operators with + equivalent function. + + Args: + block(paddle.fluid.framework.Variable|None, optional): The + target block to process on. Default None, and will + process on the current block of main program. + + Returns: + None + """ + _lower(block, reverse=False) + + +@framework.static_only +def prim2orig(block=None): + """ + .. note:: + **ONLY available in the static mode.** + + All operators in the target block are processed as follows. + If it is an automatic differential basic operator, it will be + transformed into one or a series of original operators with + equivalent function to support execution. + + Args: + block(paddle.static.Variable|None, optional): The + target block to process on. Default None, and will + process on the current block of main program. + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.autograd import enable_prim, prim_enabled, prim2orig + + paddle.enable_static() + enable_prim() + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradients = False + y = x * x + dy_dx = paddle.static.gradients(y, x) + if prim_enabled(): + prim2orig() + """ + _lower(block, reverse=True) + + +def _gradients(ys, xs, ys_bar=None): + """ A drop-in replacement of paddle.gradients but instead computing + on primitive ops. 
+ + Args: + ys: the target tensor or tensors + xs: the input tensor or tensors + ys_bar: the optional gradient tensors of `ys` + + Returns: + xs_bar: a list gradients of input `xs` + """ + + ys, xs = to_tensors(ys), to_tensors(xs) + block = ys[0].block + # TODO(Tongxin) without any prior knowledge about whether the program + # is completely lowered to primitive ops, it's mandatory to run the lowering + # pass once and again. This is obviously inefficient and needs to be + # optimized. + orig2prim(block) + + ad = Transform(block) + + xs_dot, ys_dot = ad.linearize(xs, ys) + if any(var is None for var in ys_dot): + assert False, f'Gradients cannot be computed. The given output `ys` does not depend on input `xs`.' + ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar) + # remove xs_dot and their constructor ops + + op_indexes = [] + for var in xs_dot: + if var is not None: + op_index = block.ops.index(var.op) + assert op_index >= 0, f'op_index should be greater than or equal to 0, but op_index={op_index}.' + op_indexes.append(op_index) + + ad.erase_ops(sorted(op_indexes)) + ad.erase_dots(xs_dot) + + return xs_bar diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py new file mode 100644 index 0000000000000..ec4f0915ba34f --- /dev/null +++ b/python/paddle/incubate/autograd/utils.py @@ -0,0 +1,178 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.fluid import framework as framework + + +class PrimOption(object): + def __init__(self): + self.enable_prim = False + + def get_status(self): + return self.enable_prim + + def set_status(self, flag): + self.enable_prim = flag + + +prim_option = PrimOption() + + +@framework.static_only +def prim_enabled(): + """ + .. note:: + **ONLY available in the static mode.** + + Shows whether the automatic differentiation mechanism based on + automatic differential basic operators is ON. Defaults to OFF. + + Returns: + flag(bool): Whether the automatic differentiation mechanism based on automatic differential basic operators is ON. + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.autograd import enable_prim, disable_prim, prim_enabled + + paddle.enable_static() + enable_prim() + + print(prim_enabled()) # True + + disable_prim() + + print(prim_enabled()) # False + """ + return prim_option.get_status() + + +@framework.static_only +def enable_prim(): + """ + .. note:: + **ONLY available in the static mode.** + + Turns ON automatic differentiation mechanism based on automatic + differential basic operators. + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.autograd import enable_prim, prim_enabled + + paddle.enable_static() + enable_prim() + + print(prim_enabled()) # True + """ + prim_option.set_status(True) + + +@framework.static_only +def disable_prim(): + """ + .. 
note:: + **ONLY available in the static mode.** + + Turns OFF automatic differentiation mechanism based on automatic + differential basic operators. + + Examples: + + .. code-block:: python + + import paddle + from paddle.incubate.autograd import enable_prim, disable_prim, prim_enabled + + paddle.enable_static() + enable_prim() + + print(prim_enabled()) # True + + disable_prim() + + print(prim_enabled()) # False + """ + prim_option.set_status(False) + + +INT_DTYPE_2_STRING = { + int(0): 'bool', + int(1): 'int16', + int(2): 'int32', + int(3): 'int64', + int(4): 'float16', + int(5): 'float32', + int(6): 'float64', + int(20): 'uint8', + int(21): 'int8', + int(23): 'complex64', + int(24): 'complex128', +} + + +def get_var_block(block, names): + assert isinstance(names, list) + if len(names) == 0: + return None + elif len(names) == 1: + return block.var(names[0]) + else: + return [block.var(name) for name in names] + + +def get_input_var_list(op): + if op.input_names is None: + return [] + else: + return [ + get_var_block(op.block, op.input(n)) for n in sorted(op.input_names) + ] + + +def get_output_var_list(op): + if op.output_names is None: + return [] + else: + return [ + get_var_block(op.block, op.output(n)) + for n in sorted(op.output_names) + ] + + +def to_tensors(xs): + if isinstance(xs, paddle.fluid.framework.Variable): + return [xs] + else: + return xs + + +def flatten(inp): + if inp is None or isinstance(inp, paddle.fluid.framework.Variable): + return [inp] + flattened = [] + for part in inp: + flattened += flatten(part) + return flattened + + +def flatten_and_remove_none(inp): + flattened = flatten(inp) + return [var for var in flattened if var is not None] diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py index e98a23bc52d65..7ac555e2520ea 100644 --- a/python/paddle/incubate/autotune.py +++ b/python/paddle/incubate/autotune.py @@ -49,7 +49,7 @@ def set_config(config=None): dictionary, the key is the tuning type, and the value is a dictionary of the corresponding tuning parameters. If it is a string, the path of a json file will be specified and the tuning configuration will be set - by the the json file. Default: None, auto-tuning for kernel, layout and + by the json file. Default: None, auto-tuning for kernel, layout and dataloader will be enabled. 
Examples: diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index b620253b9f26f..cf56f74d1f12d 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -158,7 +158,7 @@ def _dygraph_clip(self, params_grads): normal_params_grads = [] moe_params_grads = [] - # seperate moe params from normal params + # separate moe params from normal params if self.moe_group is not None and self.moe_group.nranks > 1: for p, g in params_grads: if self.is_expert_param_func(p): diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index d76b990958c94..072c7d9fccade 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -101,9 +101,9 @@ def __init__(self, super(FusedMultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() @@ -278,10 +278,10 @@ def __init__(self, super(FusedFeedForward, self).__init__() assert d_model > 0, ( - "Expected d_model to be greater than 0, but recieved {}".format( + "Expected d_model to be greater than 0, but received {}".format( d_model)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". format(dim_feedforward)) self._dtype = self._helper.get_default_dtype() @@ -434,12 +434,12 @@ def __init__(self, super(FusedTransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self.normalize_before = normalize_before @@ -808,11 +808,11 @@ def __init__(self, super(FusedMultiTransformer, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". format(dim_feedforward)) self.normalize_before = normalize_before diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py index 23fd8dc0825f0..2065b3c1c94c0 100644 --- a/python/paddle/incubate/optimizer/functional/bfgs.py +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -49,16 +49,16 @@ def minimize_bfgs(objective_func, Jorge Nocedal, Stephen J. 
Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method). Args: - objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. + objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar. + initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. - initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. + initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. If not given, will use an identity matrix of order N, which is size of ``initial_position`` . Default value: None. line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the futrue. Default value: 'strong wolfe'. max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. - dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + dtype ('float32' | 'float64', optional): data type used in the algorithm, the data type of the input parameter must be consistent with the dtype. Default value: 'float32'. name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. Returns: diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py index f283381597733..e15ad56dc2d11 100644 --- a/python/paddle/incubate/optimizer/functional/lbfgs.py +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -50,17 +50,17 @@ def minimize_lbfgs(objective_func, Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). Args: - objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. + objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar. + initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . history_size (Scalar): the number of stored vector pairs {si,yi}. Default value: 100. max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. 
Default value: 1e-7. tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. - initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. + initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. If not given, will use an identity matrix of order N, which is size of ``initial_position`` . Default value: None. line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the futrue. Default value: 'strong wolfe'. max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. - dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + dtype ('float32' | 'float64', optional): data type used in the algorithm, the data type of the input parameter must be consistent with the dtype. Default value: 'float32'. name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. Returns: diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 118004088da16..d399cb2052498 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -771,7 +771,7 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` . - label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` . + label(Tensor): The label of dataset. Tensor with type int64. The shape is ``[sample_number, 1]`` . k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. total(Tensor, optional): The total entries count. A tensor with type int64 or int32. 
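To make the ``minimize_bfgs`` / ``minimize_lbfgs`` contract documented above concrete (a 1-D ``initial_position`` whose dtype matches ``dtype``), here is a minimal usage sketch; it is illustrative only, and the exact layout of the returned tuple should be taken from the API reference.

    import paddle
    from paddle.incubate.optimizer.functional import minimize_bfgs

    def objective(x):
        # accepts a 1-D Tensor and returns a scalar
        return paddle.sum(paddle.square(x - 1.0))

    # 1-D starting point; its dtype matches the `dtype` argument below
    x0 = paddle.to_tensor([5.0, -3.0], dtype='float32')
    results = minimize_bfgs(objective, x0, dtype='float32')
    # `results` bundles the convergence flag, the final position and related
    # quantities; see the API documentation for the exact ordering.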
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index e64efda7b33bf..6970cf4962909 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -63,8 +63,10 @@ def celu(x, alpha=1.0, name=None): if alpha == 0: raise ZeroDivisionError("alpha cannot be 0 for celu") - if in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.celu(x, 'alpha', alpha) + if in_dygraph_mode(): + return _C_ops.final_state_celu(x, alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu') helper = LayerHelper("celu", **locals()) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 84aadbbac649b..6c7f09091ff3c 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -129,10 +129,13 @@ def _conv_nd(x, if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim - tmp_bias = _C_ops.final_state_reshape( - bias, bias.shape + - [1 for i in range(len(x.shape) - channel_dim - 1)]) - return _C_ops.final_state_add(pre_bias, tmp_bias) + if len(bias.shape) < len(x.shape): + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) + else: + return _C_ops.final_state_add(pre_bias, bias) else: return pre_bias if in_dynamic_mode(): diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index ca3ac1772829d..d08821e510c2b 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -392,20 +392,24 @@ def hsigmoid_loss(input, paddle.set_device('cpu') - input = paddle.uniform([2, 3]) - # [[-0.8018668 0.8736385 -0.9064771 ] # random - # [-0.10228515 -0.87188244 -0.8783718 ]] # random + input = paddle.uniform([4, 3]) + # [[0.45424712 -0.77296764 0.82943869] # random + # [0.85062802 0.63303483 0.35312140] # random + # [0.57170701 0.16627562 0.21588242] # random + # [0.27610803 -0.99303514 -0.17114788]] # random label = paddle.to_tensor([0, 1, 4, 5]) num_classes = 5 weight=paddle.uniform([num_classes-1, 3]) - # [[-0.24148715 0.8449961 -0.7399121 ] # random - # [-0.9800559 0.43509364 0.9091208 ] # random - # [ 0.60194826 0.10430074 -0.4521166 ] # random - # [-0.4469818 -0.01536179 -0.604454 ]] # random + # [[-0.64477652 0.24821866 -0.17456549] # random + # [-0.04635394 0.07473493 -0.25081766] # random + # [ 0.05986035 -0.12185556 0.45153677] # random + # [-0.66236806 0.91271877 -0.88088769]] # random out=F.hsigmoid_loss(input, label, num_classes, weight) - # [[3.0159328] - # [2.2407534]] + # [[1.96709502] + # [2.40019274] + # [2.11009121] + # [1.92374969]] """ if _non_static_mode(): @@ -542,7 +546,7 @@ def margin_ranking_loss(input, name=None): r""" - This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows. + This op the calcluate the margin rank loss between the input, other and label, use the math function as follows. .. math:: margin\_rank\_loss = max(0, -label * (input - other) + margin) @@ -879,7 +883,7 @@ def kl_div(input, label, reduction='mean', name=None): While :attr:`reduction` is :attr:`none`, output loss is in the same shape as input, loss in each point is calculated - seperately and no reduction is applied. + separately and no reduction is applied. 
While :attr:`reduction` is :attr:`mean`, output loss is in shape of [1] and loss value is the mean value of all losses. @@ -2006,7 +2010,7 @@ def sigmoid_focal_loss(logit, Available dtype is float32, float64. normalizer (Tensor, optional): The number normalizes the focal loss. It has to be a 1-D Tensor whose shape is `[1, ]`. The data type is float32, float64. - For object detection task, it is the the number of positive samples. + For object detection task, it is the number of positive samples. If set to None, the focal loss will not be normalized. Default is None. alpha(int|float, optional): Hyper-parameter to balance the positive and negative example, it should be between 0 and 1. Default value is set to 0.25. diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 3160f04e830d2..6a573005f4514 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1160,22 +1160,21 @@ def max_pool3d(x, import paddle import paddle.nn.functional as F - import numpy as np # max pool3d - x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - output = F.max_pool2d(x, + x = paddle.uniform([1, 3, 32, 32, 32]) + output = F.max_pool3d(x, kernel_size=2, stride=2, padding=0) - output.shape [1, 3, 16, 16, 16] + # output.shape [1, 3, 16, 16, 16] # for return_mask=True - x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + x = paddle.uniform([1, 3, 32, 32, 32]) output, max_indices = paddle.nn.functional.max_pool3d(x, kernel_size = 2, stride = 2, padding=0, return_mask=True) - # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16], + # output.shape [1, 3, 16, 16, 16], max_indices.shape [1, 3, 16, 16, 16] """ kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') if stride is None: @@ -1267,10 +1266,9 @@ def adaptive_avg_pool1d(x, output_size, name=None): Returns: Tensor: The output tensor of adaptive average pooling result. The data type is same as input tensor. - Raises: - ValueError: 'output_size' should be an integer. Examples: .. code-block:: python + :name: code-example1 # average adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], @@ -1286,10 +1284,9 @@ def adaptive_avg_pool1d(x, output_size, name=None): # import paddle import paddle.nn.functional as F - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.adaptive_average_pool1d(data, output_size=16) + data = paddle.uniform([1, 3, 32]) + pool_out = F.adaptive_avg_pool1d(data, output_size=16) # pool_out shape: [1, 3, 16]) """ pool_type = 'avg' diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index cd82fe12fff6b..7fd109843bede 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -367,7 +367,7 @@ class PReLU(Layer): Parameters: num_parameters (int, optional): Number of `weight` to learn. The supported values are: 1 - a single parameter `alpha` is used for all input channels; - Number of channels - a seperate `alpha` is used for each input channel. + Number of channels - a separate `alpha` is used for each input channel. Default is 1. init (float, optional): Init value of learnable `weight`. Default is 0.25. weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. 
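As a quick numeric check of the adaptive pooling arithmetic spelled out in the ``adaptive_avg_pool1d`` docstring above (a small illustrative sketch): with L = 4 and ``output_size=2``, each output bin averages exactly two inputs.

    import paddle
    import paddle.nn.functional as F

    data = paddle.to_tensor([[[1., 2., 3., 4.]]])          # shape [N=1, C=1, L=4]
    pool_out = F.adaptive_avg_pool1d(data, output_size=2)  # bins [1, 2] and [3, 4]
    # pool_out is [[[1.5, 3.5]]], i.e. the mean of each half, shape [1, 1, 2]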
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index d4e059b6dfa49..a20e7de751d16 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -465,14 +465,18 @@ class HSigmoidLoss(Layer): import paddle paddle.set_device('cpu') - input = paddle.uniform([2, 3]) - # [[-0.2820413 0.9528898 -0.81638825] # random - # [-0.6733154 -0.33866507 0.25770962]] # random + input = paddle.uniform([4, 3]) + # [[0.56194401 -0.22450298 -0.10741806] # random + # [0.36136317 0.23556745 0.88748658] # random + # [0.18151939 0.80947340 -0.31078976] # random + # [0.68886101 -0.14239830 -0.41297770]] # random label = paddle.to_tensor([0, 1, 4, 5]) m = paddle.nn.HSigmoidLoss(3, 5) out = m(input, label) - # [[2.4543471] - # [1.9359267]] + # [[2.42524505] + # [1.74917245] + # [3.14571381] + # [2.34564662]] """ def __init__(self, diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index ae6e37a02751d..461ac03899e07 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -1155,7 +1155,7 @@ class SimpleRNN(RNNBase): None). For more information, please refer to :ref:`api_guide_Name`. Inputs: - - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, hidden_size]`. `time_steps` means the length of the input sequence. + - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`. `time_steps` means the length of the input sequence. - **initial_states** (Tensor, optional): the initial state. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whose time step index are not less than the valid length are treated as paddings. @@ -1274,7 +1274,7 @@ class LSTM(RNNBase): None). For more information, please refer to :ref:`api_guide_Name`. Inputs: - - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, hidden_size]`. `time_steps` means the length of the input sequence. + - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`. `time_steps` means the length of the input sequence. - **initial_states** (list|tuple, optional): the initial state, a list/tuple of (h, c), the shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whos time step index are not less than the valid length are treated as paddings. @@ -1384,7 +1384,7 @@ class GRU(RNNBase): None). For more information, please refer to :ref:`api_guide_Name`. Inputs: - - **inputs** (Tensor): the input sequence. 
If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, hidden_size]`. `time_steps` means the length of the input sequence. + - **inputs** (Tensor): the input sequence. If `time_major` is True, the shape is `[time_steps, batch_size, input_size]`, else, the shape is `[batch_size, time_steps, input_size]`. `time_steps` means the length of the input sequence. - **initial_states** (Tensor, optional): the initial state. The shape is `[num_layers * num_directions, batch_size, hidden_size]`. If initial_state is not given, zero initial states are used. Defaults to None. - **sequence_length** (Tensor, optional): shape `[batch_size]`, dtype: int64 or int32. The valid lengths of input sequences. Defaults to None. If `sequence_length` is not None, the inputs are treated as padded sequences. In each input sequence, elements whos time step index are not less than the valid length are treated as paddings. diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index b0b6e62a602aa..340372f9b6a4e 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -163,9 +163,9 @@ def __init__(self, super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -508,12 +508,12 @@ def __init__(self, super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout @@ -813,12 +813,12 @@ def __init__(self, super(TransformerDecoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout @@ -1220,12 +1220,12 @@ def __init__(self, super(Transformer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) if isinstance(bias_attr, (list, tuple)): if len(bias_attr) == 1: diff --git 
a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py index 75266abdf0d13..56c9e83c38b06 100644 --- a/python/paddle/nn/utils/spectral_norm_hook.py +++ b/python/paddle/nn/utils/spectral_norm_hook.py @@ -178,7 +178,7 @@ def spectral_norm(layer, .. code-block:: python from paddle.nn import Conv2D - from paddle.nn.utils import Spectralnorm + from paddle.nn.utils import spectral_norm conv = Conv2D(3, 1, 3) sn_conv = spectral_norm(conv) diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index c131d218a1cde..84644ccc48445 100755 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -213,15 +213,21 @@ def remove_weight_norm(layer, name='weight'): Examples: .. code-block:: python - import paddle - from paddle.nn import Conv2D - from paddle.nn.utils import weight_norm, remove_weight_norm - - conv = Conv2D(3, 5, 3) - wn = weight_norm(conv) - remove_weight_norm(conv) - print(conv.weight_g) - # AttributeError: 'Conv2D' object has no attribute 'weight_g' + import paddle + from paddle.nn import Conv2D + from paddle.nn.utils import weight_norm, remove_weight_norm + + conv = Conv2D(3, 5, 3) + wn = weight_norm(conv) + print(conv.weight_g) + # Parameter containing: + # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [0., 0., 0., 0., 0.]) + # Conv2D(3, 5, kernel_size=[3, 3], data_format=NCHW) + + remove_weight_norm(conv) + # print(conv.weight_g) + # AttributeError: 'Conv2D' object has no attribute 'weight_g' """ for k, hook in layer._forward_pre_hooks.items(): if isinstance(hook, WeightNorm) and hook.name == name: diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index ea4349bc0b2c5..12b8272707bd8 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -33,7 +33,8 @@ 'LambdaDecay', 'ReduceOnPlateau', 'CosineAnnealingDecay', - 'MultiplicativeDecay' + 'MultiplicativeDecay', + 'OneCycleLR' ] @@ -1591,3 +1592,212 @@ def get_lr(self): for epoch in range(1, self.last_epoch + 1): cur_lr = cur_lr * self.lr_lambda(epoch) return cur_lr + + +class OneCycleLR(LRScheduler): + r""" + Sets the learning rate according to the one cycle learning rate scheduler. + The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then + from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate. + + It has been proposed in `Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates `_. + + Please note that the default behaviour of this scheduler follows the fastai implementation of one cycle, + which claims that “unpublished work has shown even better results by using only two phases”. + If you want the behaviour of this scheduler to be consistent with the paper, please set ``three_phase=True`` . + + Also note that you should update learning rate each step. + + Args: + max_learning_rate (float): The maximum learning rate. It is a python float number. + Functionally, it defines the initial learning rate by ``divide_factor`` . + total_steps (int): Number of total training steps. + divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. + end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. 
+ phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3. + anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, + 'linear' for linear annealing. Default: 'cos'. + three_phase (bool, optional): Whether to use three phase. + If ``True``: + 1. The learning rate will first increase from initial learning rate to maximum learning rate. + 2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase. + 3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate. + If ``False``: + 1. The learning rate will increase to maximum learning rate. + 2. Then it will directly decrease to minimum learning rate. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``OneCycleLR`` instance to schedule learning rate. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + # train on default dynamic graph mode + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(5): + for batch_id in range(20): + x = paddle.uniform([10, 10]) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_gradients() + scheduler.step() # You should update learning rate each step + + # train on static graph mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(5): + for batch_id in range(20): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() # You should update learning rate each step + """ + + def __init__(self, + max_learning_rate, + total_steps, + divide_factor=25., + end_learning_rate=0.0001, + phase_pct=0.3, + anneal_strategy='cos', + three_phase=False, + last_epoch=-1, + verbose=False): + # Check type and value of max_learning_rate + if not isinstance(max_learning_rate, (float, int)): + raise TypeError( + "'max_learning_rate' must be 'float' or 'int', but received {}". + format(type(total_steps))) + if max_learning_rate < 0: + raise ValueError("'max_learning_rate' must be a positive integer.") + + # Check type and value of end_learning_rate + if not isinstance(end_learning_rate, (float, int)): + raise TypeError( + "'end_learning_rate' must be 'float' or 'int', but received {}". + format(type(total_steps))) + if end_learning_rate < 0: + raise ValueError("'end_learning_rate' must be a positive integer.") + + # Check type and value of total_steps + if not isinstance(total_steps, int): + raise TypeError("'total_step' must be 'int', but received {}". 
+ format(type(total_steps))) + if total_steps <= 0: + raise ValueError("'total_step' must be a positive integer.") + self.total_steps = total_steps + + # Check type and value of pac_start + if not isinstance(phase_pct, float): + raise TypeError("'phase_pct' must be 'float', but received {}". + format(type(phase_pct))) + if phase_pct < 0 or phase_pct > 1: + raise ValueError( + "'phase_pct' must be between 0 and 1, but received {}".format( + phase_pct)) + + # Check type and value of divide_factor + if not isinstance(divide_factor, (float, int)): + raise TypeError( + "'divide_factor' must be 'float' or 'int', but received {}". + format(type(divide_factor))) + + initial_lr = max_learning_rate / float(divide_factor) + min_lr = float(end_learning_rate) + + if three_phase: + if phase_pct >= 0.5: + raise ValueError( + "When three_phase is True, 'phase_pct' must be less than 0.5" + ) + # start step and end step of each phase. + self._step_config = [ + 0, + phase_pct * self.total_steps - 1, + 2 * phase_pct * self.total_steps - 2, + self.total_steps - 1, + self.total_steps - 1, # for the last step. + ] + # step size of each phase. + self._steps_size = [ + self._step_config[1] - self._step_config[0], + self._step_config[2] - self._step_config[1], + self._step_config[3] - self._step_config[2], + self._step_config[3] - + self._step_config[2], # for the last step. + ] + # start lr and end lr of each phase. + self._lr_config = [ + initial_lr, max_learning_rate, initial_lr, min_lr + ] + else: + self._step_config = [ + 0, phase_pct * self.total_steps - 1, self.total_steps - 1, + self.total_steps - 1 + ] + self._steps_size = [ + self._step_config[1] - self._step_config[0], + self._step_config[2] - self._step_config[1], + self._step_config[2] - self._step_config[1], + ] + self._lr_config = [initial_lr, max_learning_rate, min_lr] + + # Check anneal_strategy + if anneal_strategy == 'cos': + self.anneal_func = self._cos_annealing + elif anneal_strategy == 'linear': + self.anneal_func = self._linear_annealing + else: + raise ValueError( + "'anneal_strategy' must by one of 'cos' or 'linear', but received {}". + format(anneal_strategy)) + super(OneCycleLR, self).__init__(initial_lr, last_epoch, verbose) + + def _cos_annealing(self, start_lr, end_lr, pct): + cos_out = math.cos(math.pi * pct) + 1 + return end_lr + (start_lr - end_lr) / 2.0 * cos_out + + def _linear_annealing(self, start_lr, end_lr, pct): + return (end_lr - start_lr) * pct + start_lr + + def get_lr(self): + current_step = self.last_epoch + + if current_step > self.total_steps: + raise ValueError( + "Tried to step {} times. However the number of total steps is {}" + .format(current_step, self.total_steps)) + + for (i, (end_step, step_size) + ) in enumerate(zip(self._step_config[1:], self._steps_size)): + # i == len(self._lr_config) - 2 catch the last step, otherwise it will return None. + if current_step <= end_step or i == len(self._lr_config) - 2: + # self._step_config[i] means start step of a phase. 
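+                # percentage is the fraction of the current phase already elapsed:
+                # 0.0 at the phase's start step, 1.0 at its end step; anneal_func then
+                # interpolates between the phase's start lr and end lr.
+                # e.g. total_steps=100, phase_pct=0.3, three_phase=False gives phase
+                # boundaries [0, 29, 99]: the lr rises until step 29, then anneals
+                # down to end_learning_rate by step 99.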
+ percentage = (current_step - self._step_config[i]) / step_size + return self.anneal_func(self._lr_config[i], + self._lr_config[i + 1], percentage) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 0dfe294c00d5c..9dfec3947e95f 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -47,6 +47,45 @@ __all__ = [] +@framework.static_only +def append_backward_new(loss_list, + parameter_list=None, + no_grad_set=None, + callbacks=None, + checkpoints=None, + distop_context=None): + from paddle.incubate.autograd.primx import orig2prim, Transform + program = default_main_program() + assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block." + block = program.current_block() + + orig2prim(block) + ad = Transform(block) + if parameter_list is None: + parameter_list = program.global_block().all_parameters() + param_dot, loss_dot = ad.linearize(parameter_list, loss_list) + loss_bar, param_bar = ad.transpose(loss_dot, param_dot) + + # remove param_dot and their constructor ops + op_indexes = [] + for var in param_dot: + if var is not None: + op_index = block.ops.index(var.op) + assert op_index >= 0 + op_indexes.append(op_index) + + ad.erase_ops(sorted(op_indexes)) + ad.erase_dots(param_dot) + + if len(parameter_list) == 1: + params_and_grads = [(parameter_list, param_bar)] + else: + params_and_grads = [] + for i, param in enumerate(parameter_list): + params_and_grads.append((param, param_bar[i])) + return params_and_grads + + class Optimizer(object): r"""Optimizer Base class. @@ -880,8 +919,13 @@ def backward(self, parameter_list = parameters if parameters \ else self._parameter_list with program_guard(program, startup_program): - params_grads = append_backward(loss, parameter_list, - act_no_grad_set, callbacks) + from paddle.incubate.autograd.utils import prim_enabled + if prim_enabled(): + params_grads = append_backward_new( + [loss], parameter_list, act_no_grad_set, callbacks) + else: + params_grads = append_backward(loss, parameter_list, + act_no_grad_set, callbacks) # Note: since we can't use all_reduce_op now, # dgc_op should be the last op of one grad. self._append_dgc_ops(params_grads) diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 77adbaff34859..9df595bc3ae73 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -150,7 +150,7 @@ def getScheduleState(step: int) -> ProfilerState: def _default_state_scheduler(step: int): r""" - A default state scheduler, keep recording from the begining of the profiler until ending. + A default state scheduler, keep recording from the beginning of the profiler until ending. """ return ProfilerState.RECORD diff --git a/python/paddle/profiler/timer.py b/python/paddle/profiler/timer.py index 1fb06ddc55e39..815775ebc6aad 100644 --- a/python/paddle/profiler/timer.py +++ b/python/paddle/profiler/timer.py @@ -193,7 +193,7 @@ def begin(self, benchmark): def before_reader(self, benchmark): """ Initialize the start time of the dataloader. This function will be - called at the begining of `next` method in `_DataLoaderIterMultiProcess` or + called at the beginning of `next` method in `_DataLoaderIterMultiProcess` or `_DataLoaderIterSingleProcess`. """ @@ -220,8 +220,8 @@ def after_step(self, benchmark): Record the cost for the current step. It will contain the cost of the loading data if there is a dataloader. 
Similar to `after_reader`, it will also update the maximum, minimum and the total time from the step 11 to the current step - as well as the the maximum and minimum speed of the model. This function will - be called in in `Profiler.step()`. + as well as the maximum and minimum speed of the model. This function will + be called in `Profiler.step()`. """ @@ -401,7 +401,7 @@ def check_if_need_record(self, reader): # enter a new task but not calling beign() to record it. # we pause the timer until the end of new task, so that # the cost of new task is not added to the current event. - # eg. start evaluation in the traing task + # eg. start evaluation in the training task self.current_event.need_record = False else: # when the new task exits, continue timing for the current event. diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index fd75ab9550d52..5e95c83129f53 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -78,7 +78,7 @@ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any): def begin(self): r""" - Record the time of begining. + Record the time of beginning. Examples: diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py index 93653e09c9019..26a2f0cfadbe7 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -14,15 +14,19 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor -from .layer.activation import ReLU -from .layer.norm import BatchNorm +from .layer import ReLU +from .layer import BatchNorm -from .layer.conv import Conv3D -from .layer.conv import SubmConv3D +from .layer import Conv3D +from .layer import SubmConv3D -from .layer.pooling import MaxPool3D +from .layer import MaxPool3D + +from .functional import sqrt +from .functional import sin +from .functional import tanh __all__ = [ 'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D', - 'BatchNorm', 'MaxPool3D' + 'BatchNorm', 'MaxPool3D', 'sqrt', 'sin', 'tanh' ] diff --git a/python/paddle/sparse/functional/__init__.py b/python/paddle/sparse/functional/__init__.py index f1ca4cc6fcc48..cfefa3ff4ff76 100644 --- a/python/paddle/sparse/functional/__init__.py +++ b/python/paddle/sparse/functional/__init__.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .activation import relu # noqa: F401 +from .unary import relu # noqa: F401 +from .unary import tanh # noqa: F401 +from .unary import sqrt # noqa: F401 +from .unary import sin # noqa: F401 from .conv import conv3d # noqa: F401 from .conv import subm_conv3d # noqa: F401 from .pooling import max_pool3d # noqa: F401 -__all__ = ['relu', 'conv3d', 'subm_conv3d', 'max_pool3d'] +__all__ = ['relu', 'tanh', 'conv3d', 'subm_conv3d', 'max_pool3d', 'sqrt', 'sin'] diff --git a/python/paddle/sparse/functional/activation.py b/python/paddle/sparse/functional/activation.py deleted file mode 100644 index c0109bc4e2429..0000000000000 --- a/python/paddle/sparse/functional/activation.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = [] - -from paddle import _C_ops, in_dynamic_mode - - -def relu(x, name=None): - """ - sparse relu activation. - - .. math:: - - out = max(x, 0) - - Parameters: - x (Tensor): The input Sparse Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - - Returns: - A Sparse Tensor with the same data type and shape as ``x`` . - - Examples: - .. code-block:: python - - import paddle - import numpy as np - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - dense_x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.sparse.functional.relu(sparse_x) - """ - - assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_coo( - ), "Currently, sparse.relu only support the input of SparseCooTensor" - - return _C_ops.final_state_sparse_relu(x) diff --git a/python/paddle/sparse/functional/unary.py b/python/paddle/sparse/functional/unary.py new file mode 100644 index 0000000000000..550e6a2a39261 --- /dev/null +++ b/python/paddle/sparse/functional/unary.py @@ -0,0 +1,169 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [] + +from paddle import _C_ops, in_dynamic_mode + + +def relu(x, name=None): + """ + sparse relu activation, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = max(x, 0) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.functional.relu(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo() or x.is_sparse_csr(): + return _C_ops.final_state_sparse_relu(x) + else: + raise ValueError( + "Currently, sparse.relu only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def tanh(x, name=None): + """ + sparse tanh activation, requiring x to be a sparse coo or sparse csr tensor. + + .. 
math:: + + out = tanh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.tanh(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo() or x.is_sparse_csr(): + return _C_ops.final_state_sparse_tanh(x) + else: + raise ValueError( + "Currently, sparse.tanh only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def sqrt(x, name=None): + """ + Calculate square root of x, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = sqrt(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([4, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.sqrt(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo() or x.is_sparse_csr(): + return _C_ops.final_state_sparse_sqrt(x) + else: + raise ValueError( + "Currently, sparse.sqrt only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def sin(x, name=None): + """ + Calculate sin of x, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = sin(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.sin(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo() or x.is_sparse_csr(): + return _C_ops.final_state_sparse_sin(x) + else: + raise ValueError( + "Currently, sparse.sin only support the input of SparseCooTensor or SparseCsrTensor" + ) diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/sparse/layer/__init__.py index 3a6d99392e4e8..8a814b514276f 100644 --- a/python/paddle/sparse/layer/__init__.py +++ b/python/paddle/sparse/layer/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
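+# ReLU is re-exported from the renamed unary module (formerly layer/activation.py).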
-from .activation import ReLU +from .unary import ReLU from .norm import BatchNorm from .conv import Conv3D from .conv import SubmConv3D diff --git a/python/paddle/sparse/layer/activation.py b/python/paddle/sparse/layer/unary.py similarity index 100% rename from python/paddle/sparse/layer/activation.py rename to python/paddle/sparse/layer/unary.py diff --git a/python/paddle/static/sparsity/__init__.py b/python/paddle/static/sparsity/__init__.py index 59f794ef28aa4..b4543b8d000fc 100644 --- a/python/paddle/static/sparsity/__init__.py +++ b/python/paddle/static/sparsity/__init__.py @@ -16,8 +16,14 @@ from ...fluid.contrib.sparsity import calculate_density #noqa: F401 from ...fluid.contrib.sparsity import decorate #noqa: F401 from ...fluid.contrib.sparsity import prune_model #noqa: F401 -from ...fluid.contrib.sparsity import set_excluded_layers #noqa: F401 from ...fluid.contrib.sparsity import reset_excluded_layers #noqa: F401 +from ...fluid.contrib import sparsity #noqa: F401 + + +def set_excluded_layers(main_program, param_names): + sparsity.set_excluded_layers( + param_names=param_names, main_program=main_program) + __all__ = [ #noqa 'calculate_density', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index b9604b69a5582..cc873c9a9e4d6 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -230,6 +230,7 @@ from .math import fmin # noqa: F401 from .math import inner # noqa: F401 from .math import outer # noqa: F401 +from .math import heaviside # noqa: F401 from .math import frac # noqa: F401 from .random import multinomial # noqa: F401 @@ -497,6 +498,7 @@ 'put_along_axis', 'put_along_axis_', 'exponential_', + 'heaviside', ] #this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 757b93dd88078..ca8abdaf4b3f3 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -36,10 +36,10 @@ def rank(input): """ - The OP returns the number of dimensions for a tensor, which is a 0-D int32 Tensor. + Returns the number of dimensions for a tensor, which is a 0-D int32 Tensor. Args: - input (Tensor): The input N-D tensor with shape of :math:`[N_1, N_2, ..., N_k]`, the data type is arbitrary. + input (Tensor): The input Tensor with shape of :math:`[N_1, N_2, ..., N_k]`, the data type is arbitrary. Returns: Tensor, the output data type is int32.: The 0-D tensor with the dimensions of the input Tensor. @@ -246,15 +246,15 @@ def is_integer(x): def real(x, name=None): """ - Returns a new tensor containing real values of the input tensor. + Returns a new Tensor containing real values of the input Tensor. Args: - x (Tensor): the input tensor, its data type could be complex64 or complex128. + x (Tensor): the input Tensor, its data type could be complex64 or complex128. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: - Tensor: a tensor containing real values of the input tensor. + Tensor: a Tensor containing real values of the input Tensor. Examples: .. 
code-block:: python diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index a5a4df6571b77..d3430ba81b859 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -92,13 +92,13 @@ def linspace(start, stop, num, dtype=None, name=None): dtype = convert_np_dtype_to_dtype_(dtype) if not isinstance(start, Variable): with device_guard("cpu"): - tensor_start = fill_constant([1], dtype, start) + tensor_start = fill_constant([1], dtype, start, force_cpu=True) if not isinstance(stop, Variable): with device_guard("cpu"): - tensor_stop = fill_constant([1], dtype, stop) + tensor_stop = fill_constant([1], dtype, stop, force_cpu=True) if not isinstance(num, Variable): with device_guard("cpu"): - tensor_num = fill_constant([1], 'int32', num) + tensor_num = fill_constant([1], 'int32', num, force_cpu=True) if _non_static_mode(): return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', dtype) @@ -294,12 +294,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Returns: Tensor: A Tensor constructed from ``data`` . - Raises: - TypeError: If the data type of ``data`` is not scalar, list, tuple, np.ndarray, paddle.Tensor - ValueError: If ``data`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]] - TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. - Examples: .. code-block:: python @@ -445,7 +439,6 @@ def full_like(x, fill_value, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input') output = paddle.full_like(input, 2.0) @@ -528,7 +521,7 @@ def ones(shape, dtype=None, name=None): def ones_like(x, dtype=None, name=None): """ - This OP returns a Tensor filled with the value 1, with the same shape and + Returns a Tensor filled with the value 1, with the same shape and data type (use ``dtype`` if ``dtype`` is not None) as ``x``. Args: @@ -546,10 +539,6 @@ def ones_like(x, dtype=None, name=None): Tensor: A Tensor filled with the value 1, with the same shape and data type (use ``dtype`` if ``dtype`` is not None) as ``x``. - Raise: - TypeError: If ``dtype`` is not None and is not bool, float16, float32, - float64, int32 or int64. - Examples: .. code-block:: python @@ -565,7 +554,7 @@ def ones_like(x, dtype=None, name=None): def zeros(shape, dtype=None, name=None): """ - The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 0. + Creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 0. Args: shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of ``shape`` is int32 or int64. @@ -621,9 +610,6 @@ def zeros_like(x, dtype=None, name=None): Tensor: A Tensor filled with the value 0, with the same shape and data type (use ``dtype`` if ``dtype`` is not None) as ``x``. - Raise: - TypeError: If ``dtype`` is not None and is not bool, float16, float32, - float64, int32 or int64. Examples: .. code-block:: python @@ -765,7 +751,7 @@ def full(shape, fill_value, dtype=None, name=None): def arange(start=0, end=None, step=1, dtype=None, name=None): """ - This OP returns a 1-D Tensor with spaced values within a given interval. + Returns a 1-D Tensor with spaced values within a given interval. 
Values are generated into the half-open interval [``start``, ``end``) with the ``step``. (the interval including ``start`` but excluding ``end``). @@ -789,18 +775,13 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): dtype(str|np.dtype, optional): The data type of the output tensor. Supported data types: int32, int64, float32, float64. If ``dytpe`` is None, the data type is float32. Default is None. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A 1-D Tensor with values from the interval [``start``, ``end``) taken with common difference ``step`` beginning from ``start``. Its data type is set by ``dtype``. - Raises: - TypeError: If ``dtype`` is not int32, int64, float32, float64. - Examples: .. code-block:: python @@ -914,7 +895,7 @@ def _tril_triu_op(helper): def tril(x, diagonal=0, name=None): r""" - This op returns the lower triangular part of a matrix (2-D tensor) or batch + Returns the lower triangular part of a matrix (2-D tensor) or batch of matrices :attr:`x`, the other elements of the result tensor are set to 0. The lower triangular part of the matrix is defined as the elements on and below the diagonal. @@ -929,48 +910,42 @@ def tril(x, diagonal=0, name=None): the main diagonal. The main diagonal are the set of indices :math:`\{(i, i)\}` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where :math:`d_{1}, d_{2}` are the dimensions of the matrix. - name (str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: Results of lower triangular operation by the specified diagonal of input tensor x, it's data type is the same as x's Tensor. - Raises: - TypeError: diagonal is not a int type. - ValueError: dimension of :attr:`x` is less than 2. - Examples: .. 
code-block:: python - import numpy as np import paddle - data = np.arange(1, 13, dtype="int64").reshape(3,-1) - # array([[ 1, 2, 3, 4], - # [ 5, 6, 7, 8], - # [ 9, 10, 11, 12]]) - + data = paddle.arange(1, 13, dtype="int64").reshape([3,-1]) + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1 , 2 , 3 , 4 ], + # [5 , 6 , 7 , 8 ], + # [9 , 10, 11, 12]]) - x = paddle.to_tensor(data) - - tril1 = paddle.tensor.tril(x) - # array([[ 1, 0, 0, 0], - # [ 5, 6, 0, 0], - # [ 9, 10, 11, 0]]) + tril1 = paddle.tril(data) + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1 , 0 , 0 , 0 ], + # [5 , 6 , 0 , 0 ], + # [9 , 10, 11, 0 ]]) # example 2, positive diagonal value - tril2 = paddle.tensor.tril(x, diagonal=2) - # array([[ 1, 2, 3, 0], - # [ 5, 6, 7, 8], - # [ 9, 10, 11, 12]]) + tril2 = paddle.tril(data, diagonal=2) + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1 , 2 , 3 , 0 ], + # [5 , 6 , 7 , 8 ], + # [9 , 10, 11, 12]]) # example 3, negative diagonal value - tril3 = paddle.tensor.tril(x, diagonal=-1) - # array([[ 0, 0, 0, 0], - # [ 5, 0, 0, 0], - # [ 9, 10, 0, 0]]) - + tril3 = paddle.tril(data, diagonal=-1) + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0 , 0 , 0 , 0 ], + # [5 , 0 , 0 , 0 ], + # [9 , 10, 0 , 0 ]]) """ if in_dygraph_mode(): return _C_ops.final_state_tril_triu(x, diagonal, True) @@ -1006,10 +981,6 @@ def triu(x, diagonal=0, name=None): Tensor: Results of upper triangular operation by the specified diagonal of input tensor x, it's data type is the same as x's Tensor. - Raises: - TypeError: diagonal is not a int type. - ValueError: dimension of :attr:`x` is less than 2. - Examples: .. code-block:: python @@ -1054,13 +1025,12 @@ def triu(x, diagonal=0, name=None): def meshgrid(*args, **kwargs): """ - This op takes a list of N tensors as input *args, each of which is 1-dimensional - vector, and creates N-dimensional grids. + Takes a list of N tensors as input *args, each of which is 1-dimensional vector, and creates N-dimensional grids. Args: *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), (N2,),..., (Nk,). Support data types: ``float64``, ``float32``, ``int32``, ``int64``. - **kwargs (optional): Currently, we only accept name in **kwargs + **kwargs (optional): Currently, only accept name in **kwargs The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -1342,7 +1312,7 @@ def diag(x, offset=0, padding_value=0, name=None): def empty(shape, dtype=None, name=None): """ - This Op returns a Tensor with uninitialized data which size is same as ``shape``. + Returns a Tensor with uninitialized data which size is same as ``shape``. Args: shape(list|tuple|Tensor): Shape of the Tensor to be created. @@ -1362,29 +1332,32 @@ def empty(shape, dtype=None, name=None): Examples: .. code-block:: python - import paddle - import numpy as np - - paddle.set_device("cpu") # and use cpu device + import paddle - # example 1: argument ``shape`` is a list which doesn't contain Tensor. - data1 = paddle.empty(shape=[2,3], dtype='float32') - #[[4.3612203e+27 1.8176809e+31 1.3555911e-19] # uninitialized - # [1.1699684e-19 1.3563156e-19 3.6408321e-11]] # uninitialized - - # example 2: argument ``shape`` is a Tensor, the data type must be int64 or int32. 
- shape_data = np.array([2, 3]).astype('int32') - shape = paddle.to_tensor(shape_data) - data2 = paddle.empty(shape=shape, dtype='float32') - #[[1.7192326e-37 4.8125365e-38 1.9866003e-36] # uninitialized - # [1.3284029e-40 7.1117408e-37 2.5353012e+30]] # uninitialized - - # example 3: argument ``shape`` is a list which contains Tensor. - dim2_data = np.array([3]).astype('int32') - dim2 = paddle.to_tensor(dim2_data) - data3 = paddle.empty(shape=[2, dim2], dtype='float32') - #[[1.1024214e+24 7.0379409e+22 6.5737699e-34] # uninitialized - # [7.5563101e+31 7.7130405e+31 2.8020654e+20]] # uninitialized + paddle.set_device("cpu") # and use cpu device + + # example 1: argument ``shape`` is a list which doesn't contain Tensor. + data1 = paddle.empty(shape=[2, 3], dtype='float32') + print(data1) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[0.00000000, 0. , 0.00000000], + # [0. , 0.29652897, 0.09356152]]) # uninitialized + + # example 2: argument ``shape`` is a Tensor, the data type must be int64 or int32. + shape_data = paddle.to_tensor([2, 3]).astype('int32') + data2 = paddle.empty(shape=shape_data, dtype='float32') + print(data2) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[-0.50543123, -0.09872390, -0.92634487], + # [-0.51007903, -0.02454148, 1.29315734]]) # uninitialized + + # example 3: argument ``shape`` is a list which contains Tensor. + dim2 = paddle.to_tensor([3]).astype('int32') + data3 = paddle.empty(shape=[2, dim2], dtype='float32') + print(data3) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[ 0.00000000, 0. , -0.92634487], + # [-0.51007903, -0.02454148, 1.29315734]]) # uninitialized """ if dtype is None: @@ -1428,7 +1401,7 @@ def empty(shape, dtype=None, name=None): def empty_like(x, dtype=None, name=None): """ - This Op returns a Tensor with uninitialized data which has identical shape of ``x`` and ``dtype``. + Returns a Tensor with uninitialized data which has identical shape of ``x`` and ``dtype``. If the ``dtype`` is None, the data type of Tensor is same with ``x``. Args: @@ -1446,7 +1419,6 @@ def empty_like(x, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.set_device("cpu") # and use cpu device @@ -1538,12 +1510,14 @@ def assign(x, output=None): # isinstance(VarBase, Variable) == False. It will cause return None # after this api. if isinstance(input, (Variable, core.VarBase)): - if _non_static_mode(): + if in_dygraph_mode(): + if output is None: + output = _C_ops.final_state_assign(input) + else: + _C_ops.final_state_assign_out_(input, output) + elif _in_legacy_dygraph(): if output is None: - if _in_legacy_dygraph(): - output = core.VarBase() - else: - output = core.eager.Tensor() + output = core.VarBase() _C_ops.assign(input, output) else: check_dtype(input.dtype, 'input', [ @@ -1603,7 +1577,7 @@ def assign(x, output=None): value_name: values }) - if is_inplace and _non_static_mode(): + if is_inplace and _in_legacy_dygraph(): output._bump_inplace_version() return output @@ -1741,3 +1715,90 @@ def complex(real, imag, name=None): attrs = {} helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs) return out + + +def tril_indices(row, col, offset=0, dtype='int64'): + """ + Return the indices of the lower triangular part of the 2-D matrix + whose row and col is knowed.Indices are ordered based on row and then columns. 
+ The lower triangular part of the matrix is defined as the elements on + and below the diagonal. + + Args: + row (int): The input x which is a int number describe the number of row of the matrix. + col (int): The input x which is a int number describe the number of col of the matrix. + offset (int, optional): The offset to consider, default value is 0. + + - If offset = 0, all elements on and below the main diagonal are retained. + - If offset > 0, include just as many diagonals above the main diagonal. + - If offset < 0, excludes just as many diagonals below the main diagonal. + + dtype (int, optional): the data type of the output tensor, can be int32, int64. + + Returns: + Tensor: Results of the indices of lower triangular part of a row * col matrix, + where the first row contains row coordinates of and the second row contains column coordinates. + + Examples: + .. code-block:: python + :name: tril_indices-example + + import paddle + + # example 1, default offset value + data1 = paddle.tril_indices(4,4,0) + print(data1) + # [[0, 1, 1, 2, 2, 2, 3, 3, 3, 3], + # [0, 0, 1, 0, 1, 2, 0, 1, 2, 3]] + + # example 2, positive offset value + data2 = paddle.tril_indices(4,4,2) + print(data2) + # [[0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], + # [0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]] + + # example 3, negative offset value + data3 = paddle.tril_indices(4,4,-1) + print(data3) + # [[ 1, 2, 2, 3, 3, 3], + # [ 0, 0, 1, 0, 1, 2]] + """ + if not isinstance(row, int) or row < 0: + raise TypeError("row should be a non-negative int") + + if col is not None: + if not isinstance(col, int) or col < 0: + raise TypeError("col should be a non-negative int") + else: + col = row + + if not isinstance(offset, int): + raise TypeError("offset should be a int") + + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + + if in_dygraph_mode(): + out = _C_ops.final_state_tril_indices(row, col, offset, dtype, + _current_expected_place()) + return out + + if _in_legacy_dygraph(): + out = _C_ops.tril_indices('rows', row, 'cols', col, 'offset', offset, + "dtype", dtype) + return out + + else: + helper = LayerHelper("tril_indices", **locals()) + + out = helper.create_variable_for_type_inference(dtype=dtype) + + helper.append_op( + type='tril_indices', + inputs={}, + outputs={'out': [out]}, + attrs={'rows': row, + 'cols': col, + 'offset': offset, + 'dtype': dtype}) + return out diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 713a611f9f39a..49cc426a00fd9 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -798,11 +798,12 @@ def gen_einsum_op(equation, *operands): """ assert len(operands) <= 2, "Only support two operands in EinsumOp." 
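+    # Note: the einsum kernel now emits an InnerCache output alongside Out, so the
+    # dynamic-graph branches below take element [0] to return only the result tensor;
+    # the static-graph branch creates one cache variable per operand.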
if in_dygraph_mode(): - return _C_ops.final_state_einsum(operands, equation) + return _C_ops.final_state_einsum(operands, equation)[0] if _in_legacy_dygraph(): # dygraph - return _C_ops.einsum(operands, 'equation', equation) + return _C_ops.einsum(operands, len(operands), 'equation', equation)[0] + # static graph for inp in operands: check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') @@ -811,11 +812,16 @@ def gen_einsum_op(equation, *operands): out = helper.create_variable_for_type_inference(dtype=operands[0].dtype) attrs = dict() attrs['equation'] = equation + caches = [ + helper.create_variable_for_type_inference(dtype=operands[0].dtype) + for i in range(len(operands)) + ] helper.append_op( type='einsum', inputs={'Operands': operands}, - outputs={'Out': out}, - attrs=attrs, ) + outputs={'Out': out, + "InnerCache": caches}, + attrs=attrs) return out @@ -977,7 +983,7 @@ def einsum(equation, *operands): # [0.51476848, 0.23367381, 0.39229113]]]) """ import os - if int(os.environ.get('FLAGS_new_einsum', "0")): + if int(os.environ.get('FLAGS_new_einsum', "1")): return einsum_v2(equation, *operands) nop = len(operands) diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index ecb13613a125e..7f95dd60eda8a 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -21,7 +21,7 @@ from six.moves import cStringIO from ..static import Variable from ..fluid.proto import framework_pb2 -from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_ +from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype import paddle @@ -256,7 +256,13 @@ def generate_activation_fn(op_type): op_proto = OpProtoHolder.instance().get_op_proto(op_type) def func(x, name=None): - if paddle.in_dynamic_mode(): + final_state_op_type = "final_state_%s" % op_type + if in_dygraph_mode() and hasattr(_C_ops, final_state_op_type): + op = getattr(_C_ops, final_state_op_type) + return op(x) + # TODO(dev): Because some ops' yaml has not been migrated. + # Replace it with _in_legacy_dygraph while all yaml work is done. + if _non_static_mode(): op = getattr(_C_ops, op_type) return op(x) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 2a77dbd115733..9ba7ef532f273 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -178,55 +178,44 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): Examples: - .. code-block:: python - - import paddle - import numpy as np - - # vector * vector - x_data = np.random.random([10]).astype(np.float32) - y_data = np.random.random([10]).astype(np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) - z = paddle.matmul(x, y) - print(z.numpy().shape) - # [1] - - # matrix * vector - x_data = np.random.random([10, 5]).astype(np.float32) - y_data = np.random.random([5]).astype(np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) - z = paddle.matmul(x, y) - print(z.numpy().shape) - # [10] - - # batched matrix * broadcasted vector - x_data = np.random.random([10, 5, 2]).astype(np.float32) - y_data = np.random.random([2]).astype(np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) - z = paddle.matmul(x, y) - print(z.numpy().shape) - # [10, 5] + .. 
code-block:: python - # batched matrix * batched matrix - x_data = np.random.random([10, 5, 2]).astype(np.float32) - y_data = np.random.random([10, 2, 5]).astype(np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) - z = paddle.matmul(x, y) - print(z.numpy().shape) - # [10, 5, 5] + import paddle - # batched matrix * broadcasted matrix - x_data = np.random.random([10, 1, 5, 2]).astype(np.float32) - y_data = np.random.random([1, 3, 2, 5]).astype(np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) - z = paddle.matmul(x, y) - print(z.numpy().shape) - # [10, 3, 5, 5] + # vector * vector + x = paddle.rand([10]) + y = paddle.rand([10]) + z = paddle.matmul(x, y) + print(z.shape) + # [1] + + # matrix * vector + x = paddle.rand([10, 5]) + y = paddle.rand([5]) + z = paddle.matmul(x, y) + print(z.shape) + # [10] + + # batched matrix * broadcasted vector + x = paddle.rand([10, 5, 2]) + y = paddle.rand([2]) + z = paddle.matmul(x, y) + print(z.shape) + # [10, 5] + + # batched matrix * batched matrix + x = paddle.rand([10, 5, 2]) + y = paddle.rand([10, 2, 5]) + z = paddle.matmul(x, y) + print(z.shape) + # [10, 5, 5] + + # batched matrix * broadcasted matrix + x = paddle.rand([10, 1, 5, 2]) + y = paddle.rand([1, 3, 2, 5]) + z = paddle.matmul(x, y) + print(z.shape) + # [10, 3, 5, 5] """ if in_dygraph_mode(): @@ -1492,10 +1481,12 @@ def bmm(x, y, name=None): y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]], [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]]) out = paddle.bmm(x, y) - #output size: (2, 2, 2) - #output value: - #[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]] - out_np = out.numpy() + # Tensor(shape=[2, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[[6. , 6. ], + # [12., 12.]], + + # [[45., 45.], + # [60., 60.]]]) """ x_shape = x.shape @@ -1530,9 +1521,10 @@ def histogram(input, bins=100, min=0, max=0, name=None): Args: input (Tensor): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor should be float32, float64, int32, int64. - bins (int): number of histogram bins - min (int): lower end of the range (inclusive) - max (int): upper end of the range (inclusive) + bins (int, optional): number of histogram bins. + min (int, optional): lower end of the range (inclusive). + max (int, optional): upper end of the range (inclusive). + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: Tensor: data type is int64, shape is (nbins,). @@ -1640,14 +1632,14 @@ def mv(x, vec, name=None): # x: [M, N], vec: [N] # paddle.mv(x, vec) # out: [M] - import numpy as np import paddle - x_data = np.array([[2, 1, 3], [3, 0, 1]]).astype("float64") - x = paddle.to_tensor(x_data) - vec_data = np.array([3, 5, 1]) - vec = paddle.to_tensor(vec_data).astype("float64") + x = paddle.to_tensor([[2, 1, 3], [3, 0, 1]]).astype("float64") + vec = paddle.to_tensor([3, 5, 1]).astype("float64") out = paddle.mv(x, vec) + print(out) + # Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True, + # [14., 10.]) """ if in_dygraph_mode(): return _C_ops.final_state_mv(x, vec) diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index d99b9973b485e..31d2ec0557dfa 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -275,9 +275,10 @@ def is_empty(x, name=None): def equal_all(x, y, name=None): """ - This OP returns the truth value of :math:`x == y`. 
True if two inputs have the same elements, False otherwise. + Returns the truth value of :math:`x == y`. True if two inputs have the same elements, False otherwise. - **NOTICE**: The output of this OP has no gradient. + Note: + The output has no gradient. Args: x(Tensor): Tensor, data type is bool, float32, float64, int32, int64. @@ -332,13 +333,6 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): Returns: Tensor: ${out_comment}. - Raises: - TypeError: The data type of ``x`` must be one of float32, float64. - TypeError: The data type of ``y`` must be one of float32, float64. - TypeError: The type of ``rtol`` must be float. - TypeError: The type of ``atol`` must be float. - TypeError: The type of ``equal_nan`` must be bool. - Examples: .. code-block:: python @@ -402,7 +396,8 @@ def equal(x, y, name=None): This layer returns the truth value of :math:`x == y` elementwise. - **NOTICE**: The output of this OP has no gradient. + Note: + The output has no gradient. Args: x(Tensor): Tensor, data type is bool, float32, float64, int32, int64. @@ -459,9 +454,10 @@ def equal(x, y, name=None): @templatedoc() def greater_equal(x, y, name=None): """ - This OP returns the truth value of :math:`x >= y` elementwise, which is equivalent function to the overloaded operator `>=`. + Returns the truth value of :math:`x >= y` elementwise, which is equivalent function to the overloaded operator `>=`. - **NOTICE**: The output of this OP has no gradient. + Note: + The output has no gradient. Args: x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64. @@ -469,7 +465,7 @@ def greater_equal(x, y, name=None): name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the output data type is bool: The tensor storing the output, the output shape is same as input :attr:`x`. + Tensor: The output shape is same as input :attr:`x`. The output data type is bool. Examples: .. code-block:: python @@ -509,9 +505,10 @@ def greater_equal(x, y, name=None): @templatedoc() def greater_than(x, y, name=None): """ - This OP returns the truth value of :math:`x > y` elementwise, which is equivalent function to the overloaded operator `>`. + Returns the truth value of :math:`x > y` elementwise, which is equivalent function to the overloaded operator `>`. - **NOTICE**: The output of this OP has no gradient. + Note: + The output has no gradient. Args: x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64. @@ -519,7 +516,7 @@ def greater_than(x, y, name=None): name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the output data type is bool: The tensor storing the output, the output shape is same as input :attr:`x` . + Tensor: The output shape is same as input :attr:`x`. The output data type is bool. Examples: .. code-block:: python @@ -558,9 +555,10 @@ def greater_than(x, y, name=None): @templatedoc() def less_equal(x, y, name=None): """ - This OP returns the truth value of :math:`x <= y` elementwise, which is equivalent function to the overloaded operator `<=`. + Returns the truth value of :math:`x <= y` elementwise, which is equivalent function to the overloaded operator `<=`. - **NOTICE**: The output of this OP has no gradient. 
+ Note: + The output has no gradient. Args: x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64. @@ -569,7 +567,7 @@ def less_equal(x, y, name=None): user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the output data type is bool: The tensor storing the output, the output shape is same as input :attr:`x`. + Tensor: The output shape is same as input :attr:`x`. The output data type is bool. Examples: .. code-block:: python @@ -609,9 +607,10 @@ def less_equal(x, y, name=None): @templatedoc() def less_than(x, y, name=None): """ - This OP returns the truth value of :math:`x < y` elementwise, which is equivalent function to the overloaded operator `<`. + Returns the truth value of :math:`x < y` elementwise, which is equivalent function to the overloaded operator `<`. - **NOTICE**: The output of this OP has no gradient. + Note: + The output has no gradient. Args: x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64. @@ -620,7 +619,7 @@ def less_than(x, y, name=None): user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the output data type is bool: The tensor storing the output, the output shape is same as input :attr:`x`. + Tensor: The output shape is same as input :attr:`x`. The output data type is bool. Examples: .. code-block:: python @@ -660,9 +659,10 @@ def less_than(x, y, name=None): @templatedoc() def not_equal(x, y, name=None): """ - This OP returns the truth value of :math:`x != y` elementwise, which is equivalent function to the overloaded operator `!=`. + Returns the truth value of :math:`x != y` elementwise, which is equivalent function to the overloaded operator `!=`. - **NOTICE**: The output of this OP has no gradient. + Note: + The output has no gradient. Args: x(Tensor): First input to compare which is N-D tensor. The input data type should be bool, float32, float64, int32, int64. @@ -671,7 +671,7 @@ def not_equal(x, y, name=None): user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the output data type is bool: The tensor storing the output, the output shape is same as input :attr:`x`. + Tensor: The output shape is same as input :attr:`x`. The output data type is bool. Examples: .. code-block:: python @@ -711,13 +711,13 @@ def not_equal(x, y, name=None): def is_tensor(x): """ - This function tests whether input object is a paddle.Tensor. + Tests whether input object is a paddle.Tensor. Args: x (object): Object to test. Returns: - A boolean value. True if 'x' is a paddle.Tensor, otherwise False. + A boolean value. True if ``x`` is a paddle.Tensor, otherwise False. Examples: .. code-block:: python diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 127aa71137dff..57785c16e60bb 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -970,7 +970,7 @@ def tolist(x): def concat(x, axis=0, name=None): """ - This OP concatenates the input along the axis. + Concatenates the input along the axis. 
Args: x (list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16, @@ -1330,13 +1330,11 @@ def rot90(x, k=1, axes=[0, 1], name=None): def flatten(x, start_axis=0, stop_axis=-1, name=None): r""" - **Flatten op** - Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis. - Note that the output Tensor will share data with origin Tensor and doesn't have a - Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, please - use `Tensor.clone` like ``flatten_clone_x = x.flatten().clone()``. + Note: + The output Tensor will share data with origin Tensor and doesn't have a Tensor copy in ``dygraph`` mode. + If you want to use the Tensor copy version, please use `Tensor.clone` like ``flatten_clone_x = x.flatten().clone()``. For Example: @@ -1371,8 +1369,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): float64, int8, int32, int64, uint8. start_axis (int): the start axis to flatten stop_axis (int): the stop axis to flatten - name(str, Optional): For details, please refer to :ref:`api_guide_Name`. - Generally, no setting is required. Default: None. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A tensor with the contents of the input tensor, with input \ @@ -1427,8 +1424,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): raise ValueError("The stop_axis should be larger than stat_axis") if in_dygraph_mode(): - dy_out, _ = _C_ops.final_state_flatten(x, start_axis, stop_axis) - return dy_out + return _C_ops.final_state_flatten(x, start_axis, stop_axis) if _in_legacy_dygraph(): dy_out, _ = _C_ops.flatten_contiguous_range(x, 'start_axis', start_axis, @@ -1489,7 +1485,10 @@ def roll(x, shifts, axis=None, name=None): x (Tensor): The x tensor as input. shifts (int|list|tuple): The number of places by which the elements of the `x` tensor are shifted. - axis (int|list|tuple|None): axis(axes) along which to roll. + axis (int|list|tuple, optional): axis(axes) along which to roll. Default: None + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + Returns: Tensor: A Tensor with same data type as `x`. @@ -1512,6 +1511,11 @@ def roll(x, shifts, axis=None, name=None): #[[7. 8. 9.] # [1. 2. 3.] # [4. 5. 6.]] + out_z3 = paddle.roll(x, shifts=1, axis=1) + print(out_z3) + #[[3. 1. 2.] + # [6. 4. 5.] + # [9. 7. 8.]] """ origin_shape = x.shape if type(shifts) == int: @@ -1530,8 +1534,6 @@ def roll(x, shifts, axis=None, name=None): axis = [] if in_dygraph_mode(): - if isinstance(shifts, paddle.Tensor): - shifts = shifts.cpu() return _C_ops.final_state_roll(x, shifts, axis) if _in_legacy_dygraph(): @@ -1562,7 +1564,7 @@ def roll(x, shifts, axis=None, name=None): def stack(x, axis=0, name=None): """ - This OP stacks all the input tensors ``x`` along ``axis`` dimemsion. + Stacks all the input tensors ``x`` along ``axis`` dimemsion. All tensors must be of the same shape and same dtype. For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked @@ -1618,7 +1620,7 @@ def stack(x, axis=0, name=None): axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, where ``R`` is the number of dimensions of the first input tensor ``x[0]``. If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. 
- name (str, optional): Please refer to :ref:`api_guide_Name`, Default None. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: The stacked tensor with same data type as input. @@ -1932,7 +1934,7 @@ def squeeze(x, axis=None, name=None): input = x axes = axis if in_dygraph_mode(): - return _C_ops.final_state_squeeze(input, axes)[1] + return _C_ops.final_state_squeeze(input, axes) if _in_legacy_dygraph(): out, _ = _C_ops.squeeze2(input, 'axes', axes) return out @@ -2267,7 +2269,7 @@ def unsqueeze(x, axis, name=None): if _in_legacy_dygraph(): out, _ = _C_ops.unsqueeze2(input, 'axes', axes) return out - return _C_ops.final_state_unsqueeze(input, axes)[1] + return _C_ops.final_state_unsqueeze(input, axes) check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') check_variable_and_dtype(input, 'input', [ @@ -2430,20 +2432,20 @@ def unbind(input, axis=0): .. code-block:: python import paddle - import numpy as np - # input is a variable which shape is [3, 4, 5] - np_input = np.random.rand(3, 4, 5).astype('float32') - input = paddle.to_tensor(np_input) + + # input is a Tensor which shape is [3, 4, 5] + input = paddle.rand([3, 4, 5]) + [x0, x1, x2] = paddle.unbind(input, axis=0) # x0.shape [4, 5] # x1.shape [4, 5] # x2.shape [4, 5] + [x0, x1, x2, x3] = paddle.unbind(input, axis=1) # x0.shape [3, 5] # x1.shape [3, 5] # x2.shape [3, 5] # x3.shape [3, 5] - """ if in_dygraph_mode(): return _C_ops.final_state_unbind(input, axis) @@ -2583,7 +2585,6 @@ def scatter_(x, index, updates, overwrite=True, name=None): def scatter_nd_add(x, index, updates, name=None): r""" - **Scatter_nd_add Layer** Output is obtained by applying sparse addition to a single value or slice in a Tensor. @@ -2640,15 +2641,16 @@ def scatter_nd_add(x, index, updates, name=None): .. code-block:: python import paddle - import numpy as np x = paddle.rand(shape=[3, 5, 9, 10], dtype='float32') updates = paddle.rand(shape=[3, 9, 10], dtype='float32') - index_data = np.array([[1, 1], - [0, 1], - [1, 3]]).astype(np.int64) - index = paddle.to_tensor(index_data) + index = paddle.to_tensor([[1, 1], + [0, 1], + [1, 3]], dtype='int64') + output = paddle.scatter_nd_add(x, index, updates) + print(output.shape) + # [3, 5, 9, 10] """ if in_dygraph_mode(): op = getattr(_C_ops, 'scatter_nd_add') @@ -2930,8 +2932,7 @@ def broadcast_to(x, shape, name=None): shape (list|tuple|Tensor): The result shape after broadcasting. The data type is int32. If shape is a list or tuple, all its elements should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. The value -1 in shape means keeping the corresponding dimension unchanged. - name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: N-D Tensor: A Tensor with the given shape. The data type is the same as ``x``. @@ -3013,7 +3014,7 @@ def expand(x, shape, name=None): Args: - x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64. + x (Tensor): The input Tensor, its data type is bool, float32, float64, int32 or int64. shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. 
If shape is a list or tuple, all its elements should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. The value -1 in shape means keeping the corresponding dimension unchanged. @@ -3094,7 +3095,7 @@ def get_attr_expand_shape(list_expand_shape): def reshape(x, shape, name=None): """ - This operator changes the shape of ``x`` without changing its data. + Changes the shape of ``x`` without changing its data. Note that the output Tensor will share data with origin Tensor and doesn't have a Tensor copy in ``dygraph`` mode. @@ -3103,32 +3104,17 @@ def reshape(x, shape, name=None): Some tricks exist when specifying the target shape. - 1. -1 means the value of this dimension is inferred from the total element - number of x and remaining dimensions. Thus one and only one dimension can - be set -1. + - 1. -1 means the value of this dimension is inferred from the total element number of x and remaining dimensions. Thus one and only one dimension can be set -1. - 2. 0 means the actual dimension value is going to be copied from the - corresponding dimension of x. The index of 0s in shape can not exceed - the dimension of x. + - 2. 0 means the actual dimension value is going to be copied from the corresponding dimension of x. The index of 0s in shape can not exceed the dimension of x. Here are some examples to explain it. - 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - is [6, 8], the reshape operator will transform x into a 2-D tensor with - shape [6, 8] and leaving x's data unchanged. + - 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape is [6, 8], the reshape operator will transform x into a 2-D tensor with shape [6, 8] and leaving x's data unchanged. - 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - specified is [2, 3, -1, 2], the reshape operator will transform x into a - 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this - case, one dimension of the target shape is set to -1, the value of this - dimension is inferred from the total element number of x and remaining - dimensions. + - 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape specified is [2, 3, -1, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this case, one dimension of the target shape is set to -1, the value of this dimension is inferred from the total element number of x and remaining dimensions. - 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor - with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, - besides -1, 0 means the actual dimension value is going to be copied from - the corresponding dimension of x. + - 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, besides -1, 0 means the actual dimension value is going to be copied from the corresponding dimension of x. Args: x (Tensor): An N-D Tensor. 
The data type is ``float32``, ``float64``, ``int32``, ``int64`` or ``bool`` @@ -3183,7 +3169,7 @@ def reshape(x, shape, name=None): item.numpy().item(0) if isinstance(item, Variable) else item for item in shape ] - out, _ = _C_ops.reshape2(x, None, 'shape', shape) + out = _C_ops.final_state_reshape(x, shape) elif isinstance(shape, tmp_tensor_type): shape.stop_gradient = True out, _ = _C_ops.reshape2(x, shape) @@ -3360,8 +3346,7 @@ def gather_nd(x, index, name=None): x (Tensor): The input Tensor which it's data type should be bool, float32, float64, int32, int64. index (Tensor): The index input with rank > 1, index.shape[-1] <= input.rank. Its dtype should be int32, int64. - name(str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] @@ -3447,7 +3432,7 @@ def strided_slice(x, axes, starts, ends, strides, name=None): result = [ [2], ] Args: - x (Tensor): An N-D ``Tensor``. The data type is ``float32``, ``float64``, ``int32`` or ``int64``. + x (Tensor): An N-D ``Tensor``. The data type is ``bool``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to. It's optional. If it is not provides, it will be treated as :math:`[0,1,...,len(starts)-1]`. starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``starts`` is an Tensor, it should be an 1-D Tensor. It represents starting indices of corresponding axis in ``axes``. diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index d513b83d3e39e..78fb04e3d3510 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -90,7 +90,7 @@ def log(x, name=None): r""" - Calculates the natural log of the given input tensor, element-wise. + Calculates the natural log of the given input Tensor, element-wise. .. math:: @@ -154,7 +154,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: Output tensor of scale operator, with shape and data type same as input. + Tensor: Output Tensor of scale operator, with shape and data type same as input. Examples: .. code-block:: python @@ -292,6 +292,7 @@ def multiplex(inputs, index, name=None): :name: code-example1 import paddle + img1 = paddle.to_tensor([[1, 2], [3, 4]], dtype=paddle.float32) img2 = paddle.to_tensor([[5, 6], [7, 8]], dtype=paddle.float32) inputs = [img1, img2] @@ -339,7 +340,7 @@ def scale_(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): def pow(x, y, name=None): """ - Compute the power of tensor elements. The equation is: + Compute the power of Tensor elements. The equation is: .. math:: out = x^{y} @@ -498,6 +499,7 @@ def add(x, y, name=None): .. code-block:: python import paddle + x = paddle.to_tensor([2, 3, 4], 'float64') y = paddle.to_tensor([1, 5, 2], 'float64') z = paddle.add(x, y) @@ -539,8 +541,8 @@ def subtract(x, y, name=None): .. math:: out = x - y - **Note**: - ``paddle.subtract`` supports broadcasting. 
If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . + Note: + ``paddle.subtract`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . Args: x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. @@ -554,35 +556,37 @@ def subtract(x, y, name=None): .. code-block:: python - import numpy as np import paddle x = paddle.to_tensor([[1, 2], [7, 8]]) y = paddle.to_tensor([[5, 6], [3, 4]]) res = paddle.subtract(x, y) print(res) - # [[-4, -4], - # [4, 4]] + # Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[-4, -4], + # [ 4, 4]]) x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) y = paddle.to_tensor([1, 0, 4]) res = paddle.subtract(x, y) print(res) - # [[[ 0, 2, -1], - # [ 0, 2, -1]]] + # Tensor(shape=[1, 2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[[ 0, 2, -1], + # [ 0, 2, -1]]]) - x = paddle.to_tensor([2, np.nan, 5], dtype='float32') - y = paddle.to_tensor([1, 4, np.nan], dtype='float32') + x = paddle.to_tensor([2, float('nan'), 5], dtype='float32') + y = paddle.to_tensor([1, 4, float('nan')], dtype='float32') res = paddle.subtract(x, y) print(res) - # [ 1., nan, nan] + # Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [1. , nan, nan]) - x = paddle.to_tensor([5, np.inf, -np.inf], dtype='float64') + x = paddle.to_tensor([5, float('inf'), -float('inf')], dtype='float64') y = paddle.to_tensor([1, 4, 5], dtype='float64') res = paddle.subtract(x, y) print(res) - # [ 4., inf., -inf.] - + # Tensor(shape=[3], dtype=float64, place=Place(cpu), stop_gradient=True, + # [ 4. , inf., -inf.]) """ op_type = 'elementwise_sub' axis = -1 @@ -855,7 +859,7 @@ def maximum(x, y, name=None): def minimum(x, y, name=None): """ - Compare two tensors and returns a new tensor containing the element-wise minima. The equation is: + Compare two tensors and return a new tensor containing the element-wise minima. The equation is: .. math:: out = min(x, y) @@ -869,7 +873,7 @@ def minimum(x, y, name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - N-D Tensor. A location into which the result is stored. If x, y have different shapes and are "broadcastable", the resulting tensor shape is the shape of x and y after broadcasting. If x, y have the same shape, its shape is the same as x and y. + Tensor. If x, y have different shapes and are "broadcastable", the resulting tensor shape is the shape of x and y after broadcasting. If x, y have the same shape, its shape is the same as x and y. Examples: @@ -1083,9 +1087,6 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): if `x.dtype='bool'`, `x.dtype='int32'`, it's data type is `'int64'`, otherwise it's data type is the same as `x`. - Raises: - TypeError: The type of :attr:`axis` must be int, list or tuple. - Examples: .. 
code-block:: python @@ -1388,10 +1389,10 @@ def add_n(inputs, name=None): if len(inputs) > 0: for input in inputs: check_variable_and_dtype(input, "inputs", \ - ['float32', 'float64', 'int32', 'int64'], 'add_n') + ['float16', 'float32', 'float64', 'int32', 'int64'], 'add_n') else: check_variable_and_dtype(inputs, "inputs", \ - ['float32', 'float64', 'int32', 'int64'], 'add_n') + ['float16', 'float32', 'float64', 'int32', 'int64'], 'add_n') out = helper.create_variable_for_type_inference( @@ -1571,7 +1572,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): """ **addmm** - This operator is used to perform matrix multiplication for input $x$ and $y$. + Perform matrix multiplication for input $x$ and $y$. $input$ is added to the final result. The equation is: @@ -1584,12 +1585,12 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): input (Tensor): The input Tensor to be added to the final result. x (Tensor): The first input Tensor for matrix multiplication. y (Tensor): The second input Tensor for matrix multiplication. - beta (float): Coefficient of $input$. - alpha (float): Coefficient of $x*y$. + beta (float, optional): Coefficient of $input$, default is 1. + alpha (float, optional): Coefficient of $x*y$, default is 1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The output Tensor of addmm op. + Tensor: The output Tensor of addmm. Examples: .. code-block:: python @@ -1830,7 +1831,7 @@ def __check_input(x, y): def logsumexp(x, axis=None, keepdim=False, name=None): r""" - This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` . + Calculates the log of the sum of exponentials of ``x`` along ``axis`` . .. math:: logsumexp(x) = \log\sum exp(x) @@ -2543,9 +2544,9 @@ def clip(x, min=None, max=None, name=None): Args: x (Tensor): An N-D Tensor with data type float32, float64, int32 or int64. - min (float|int|Tensor): The lower bound with type ``float`` , ``int`` or a ``Tensor`` + min (float|int|Tensor, optional): The lower bound with type ``float`` , ``int`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. - max (float|int|Tensor): The upper bound with type ``float``, ``int`` or a ``Tensor`` + max (float|int|Tensor, optional): The upper bound with type ``float``, ``int`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -2659,9 +2660,8 @@ def clip_(x, min=None, max=None, name=None): def trace(x, offset=0, axis1=0, axis2=1, name=None): """ - **trace** - This OP computes the sum along diagonals of the input tensor x. + Computes the sum along diagonals of the input tensor x. If ``x`` is 2D, returns the sum of diagonal. @@ -2862,18 +2862,15 @@ def __check_input(input, offset, dim1, dim2): def kron(x, y, name=None): """ -${comment} + ${comment} Args: - x (Tensor): the fist operand of kron op, data type: float16, float32, - float64, int32 or int64. - y (Tensor): the second operand of kron op, data type: float16, - float32, float64, int32 or int64. Its data type should be the same - with x. + x (Tensor): the fist operand of kron op, data type: float16, float32, float64, int32 or int64. + y (Tensor): the second operand of kron op, data type: float16, float32, float64, int32 or int64. Its data type should be the same with x. 
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The output of kron op, data type: float16, float32, float64, int32 or int64. Its data is the same with x. + Tensor: The output of kron, data type: float16, float32, float64, int32 or int64. Its data is the same with x. Examples: .. code-block:: python @@ -3123,7 +3120,7 @@ def isfinite(x, name=None): import paddle x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - out = paddle.tensor.isfinite(x) + out = paddle.isfinite(x) print(out) # [False True True False True False False] """ if in_dygraph_mode(): @@ -3152,8 +3149,9 @@ def isinf(x, name=None): .. code-block:: python import paddle + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - out = paddle.tensor.isinf(x) + out = paddle.isinf(x) print(out) # [ True False False True False False False] """ if in_dygraph_mode(): @@ -3182,8 +3180,9 @@ def isnan(x, name=None): .. code-block:: python import paddle + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - out = paddle.tensor.isnan(x) + out = paddle.isnan(x) print(out) # [False False False False False True True] """ if in_dygraph_mode(): @@ -3208,20 +3207,16 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): multiply all elements of `x` and return a Tensor with a single element, otherwise must be in the range :math:`[-x.ndim, x.ndim)`. If :math:`axis[i]<0`, the axis to reduce is :math:`x.ndim + axis[i]`. Default is None. + keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result + tensor will have one fewer dimension than the input unless `keepdim` is true. Default is False. dtype (str|np.dtype, optional): The desired date type of returned tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before operator performed. This is very useful for avoiding data type overflows. The default value is None, the dtype of output is the same as input Tensor `x`. - keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result - tensor will have one fewer dimension than the input unless `keepdim` is true. Default is False. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor, result of product on the specified dim of input tensor. - - Raises: - ValueError: The :attr:`dtype` must be float32, float64, int32 or int64. - TypeError: The type of :attr:`axis` must be int, list or tuple. Examples: .. code-block:: python @@ -3297,7 +3292,7 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): def sign(x, name=None): """ - This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero. + Returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero. Args: x (Tensor): The input tensor. The data type can be float16, float32 or float64. @@ -3419,7 +3414,7 @@ def increment(x, value=1.0, name=None): def all(x, axis=None, keepdim=False, name=None): """ - Computes the the ``logical and`` of tensor elements over the given dimension. + Computes the ``logical and`` of tensor elements over the given dimension. Args: x (Tensor): An N-D Tensor, the input data type should be `bool`. 
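A minimal, NumPy-free sketch of the ``paddle.all`` / ``paddle.any`` pattern used in the rewritten examples below (a sketch assuming Paddle 2.3+ in dygraph mode; the expected outputs follow the docstrings):

    import paddle

    # Build the bool input directly from a Python list instead of numpy + assign.
    x = paddle.to_tensor([[1, 0], [1, 1]], dtype='int32')
    x = paddle.cast(x, 'bool')                   # [[True, False], [True, True]]

    print(paddle.all(x))                         # [False]
    print(paddle.all(x, axis=0))                 # [True, False]
    print(paddle.any(x, axis=-1))                # [True, True]
    print(paddle.any(x, axis=1, keepdim=True))   # [[True], [True]]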
@@ -3437,38 +3432,32 @@ def all(x, axis=None, keepdim=False, name=None): Returns: Tensor: Results the ``logical and`` on the specified axis of input Tensor `x`, it's data type is bool. - Raises: - ValueError: If the data type of `x` is not bool. - TypeError: The type of :attr:`axis` must be int, list or tuple. - Examples: .. code-block:: python import paddle - import numpy as np - + # x is a bool Tensor with following elements: # [[True, False] # [True, True]] - x = paddle.assign(np.array([[1, 0], [1, 1]], dtype='int32')) + x = paddle.to_tensor([[1, 0], [1, 1]], dtype='int32') print(x) x = paddle.cast(x, 'bool') - + # out1 should be [False] out1 = paddle.all(x) # [False] print(out1) - + # out2 should be [True, False] out2 = paddle.all(x, axis=0) # [True, False] print(out2) - - # keep_dim=False, out3 should be [False, True], out.shape should be (2,) + + # keepdim=False, out3 should be [False, True], out.shape should be (2,) out3 = paddle.all(x, axis=-1) # [False, True] print(out3) - - # keep_dim=True, out4 should be [[False], [True]], out.shape should be (2,1) - out4 = paddle.all(x, axis=1, keepdim=True) - out4 = paddle.cast(out4, 'int32') # [[False], [True]] + + # keepdim=True, out4 should be [[False], [True]], out.shape should be (2,1) + out4 = paddle.all(x, axis=1, keepdim=True) # [[False], [True]] print(out4) """ @@ -3515,7 +3504,7 @@ def all(x, axis=None, keepdim=False, name=None): def any(x, axis=None, keepdim=False, name=None): """ - Computes the the ``logical or`` of tensor elements over the given dimension. + Computes the ``logical or`` of tensor elements over the given dimension, and return the result. Args: x (Tensor): An N-D Tensor, the input data type should be `bool`. @@ -3533,39 +3522,34 @@ def any(x, axis=None, keepdim=False, name=None): Returns: Tensor: Results the ``logical or`` on the specified axis of input Tensor `x`, it's data type is bool. - Raises: - ValueError: If the data type of `x` is not bool. - TypeError: The type of :attr:`axis` must be int, list or tuple. - Examples: .. code-block:: python import paddle - import numpy as np - - # x is a bool Tensor with following elements: - # [[True, False] - # [False, False]] - x = paddle.assign(np.array([[1, 0], [1, 1]], dtype='int32')) + + x = paddle.to_tensor([[1, 0], [1, 1]], dtype='int32') + x = paddle.assign(x) print(x) x = paddle.cast(x, 'bool') - + # x is a bool Tensor with following elements: + # [[True, False] + # [True, True]] + # out1 should be [True] out1 = paddle.any(x) # [True] print(out1) - + # out2 should be [True, True] out2 = paddle.any(x, axis=0) # [True, True] print(out2) - - # keep_dim=False, out3 should be [True, True], out.shape should be (2,) + + # keepdim=False, out3 should be [True, True], out.shape should be (2,) out3 = paddle.any(x, axis=-1) # [True, True] print(out3) - - # keep_dim=True, result should be [[True], [True]], out.shape should be (2,1) - out4 = paddle.any(x, axis=1, keepdim=True) - out4 = paddle.cast(out4, 'int32') # [[True], [True]] - print(out4) + + # keepdim=True, result should be [[True], [True]], out.shape should be (2,1) + out4 = paddle.any(x, axis=1, keepdim=True) # [[True], [True]] + print(out4) """ if axis is not None and not isinstance(axis, (list, tuple)): @@ -3641,18 +3625,18 @@ def conj(x, name=None): This function computes the conjugate of the Tensor elementwisely. Args: - x (Tensor): The input tensor which hold the complex numbers. + x (Tensor): The input Tensor which hold the complex numbers. 
Optional data types are: complex64, complex128, float32, float64, int32 or int64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): The conjugate of input. The shape and data type is the same with input. - If the elements of tensor is real type such as float32, float64, int32 or int64, the out is the same with input. + out (Tensor): The conjugate of input. The shape and data type is the same with input. If the elements of tensor is real type such as float32, float64, int32 or int64, the out is the same with input. Examples: .. code-block:: python import paddle + data=paddle.to_tensor([[1+1j, 2+2j, 3+3j], [4+4j, 5+5j, 6+6j]]) #Tensor(shape=[2, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, # [[(1+1j), (2+2j), (3+3j)], @@ -3883,7 +3867,7 @@ def lerp(x, y, weight, name=None): x = paddle.arange(1., 5., dtype='float32') y = paddle.empty([4], dtype='float32') y.fill_(10.) - out = paddle.lerp(start, end, 0.5) + out = paddle.lerp(x, y, 0.5) # out: [5.5., 6., 6.5, 7.] """ @@ -3929,7 +3913,7 @@ def lerp_(x, y, weight, name=None): def erfinv(x, name=None): r""" - The inverse error function of x, . + The inverse error function of x. Equation: .. math:: @@ -4454,6 +4438,54 @@ def angle(x, name=None): helper.append_op(type=op_type, inputs=inputs, outputs=outputs) return out +def heaviside(x, y, name=None): + """ + Computes the Heaviside step function determined by corresponding element in y for each element in x. The equation is + + .. math:: + heaviside(x, y)= + \left\{ + \\begin{array}{lcl} + 0,& &\\text{if} \ x < 0, \\\\ + y,& &\\text{if} \ x = 0, \\\\ + 1,& &\\text{if} \ x > 0. + \end{array} + \\right. + + Notes: + ``paddle.heaviside`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + + Args: + x (Tensor): The input tensor of Heaviside step function, it's data type should be float32, float64, int32 or int64. + y (Tensor): The tensor that determines a Heaviside step function, it's data type should be float32, float64, int32 or int64. + name (str, optional): Name for the operation (optional, default is None). Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + N-D Tensor. A location into which the result is stored. If x and y have different shapes and are broadcastable, the resulting tensor shape is the shape of x and y after broadcasting. If x, y have the same shape, its shape is the same as x and y. + + Examples: + .. code-block:: python + :name: heaviside-example + + import paddle + x = paddle.to_tensor([-0.5, 0, 0.5]) + y = paddle.to_tensor([0.1]) + paddle.heaviside(x, y) + # [0. , 0.10000000, 1. ] + x = paddle.to_tensor([[-0.5, 0, 0.5], [-0.5, 0.5, 0]]) + y = paddle.to_tensor([0.1, 0.2, 0.3]) + paddle.heaviside(x, y) + # [[0. , 0.20000000, 1. ], + # [0. , 1. , 0.30000001]] + """ + op_type = 'elementwise_heaviside' + axis = -1 + act = None + if _non_static_mode(): + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name=op_type) + return _elementwise_op(LayerHelper(op_type, **locals())) + def frac(x, name=None): """ This API is used to return the fractional portion of each element in input. 
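A short usage sketch of the newly documented ``paddle.heaviside`` and the corrected ``paddle.lerp`` call (values mirror the docstring examples above; assumes a build that includes this patch):

    import paddle

    x = paddle.to_tensor([-0.5, 0., 0.5])
    y = paddle.to_tensor([0.1])                  # broadcast against x
    print(paddle.heaviside(x, y))                # [0., 0.1, 1.]: 0 where x < 0, y where x == 0, 1 where x > 0

    x = paddle.arange(1., 5., dtype='float32')   # [1., 2., 3., 4.]
    y = paddle.full([4], 10., dtype='float32')
    print(paddle.lerp(x, y, 0.5))                # x + 0.5 * (y - x) -> [5.5, 6., 6.5, 7.]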
diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 9ee59c6cfd843..7626552a85dbd 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -506,25 +506,27 @@ def erf(x, name=None): erf.__doc__ = r""" :strong:`Erf Operator` -For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function). +For more details, see `Error function `_. Equation: .. math:: - out = \\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x}e^{- \\eta^{2}}d\\eta + out = \frac{2}{\sqrt{\pi}} \int_{0}^{x}e^{- \eta^{2}}d\eta Args: x (Tensor): The input tensor, it's data type should be float32, float64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input. + Tensor: The output of Erf, dtype: float32 or float64, the same as the input, shape: the same as the input. Examples: .. code-block:: python import paddle + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.erf(x) print(out) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index b82f58ea3d087..1194d81a360db 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -202,7 +202,7 @@ def multinomial(x, num_samples=1, replacement=False, name=None): def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a Gaussian + Returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. Args: @@ -219,9 +219,7 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). - name (str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A Tensor filled with random values sampled from a Gaussian @@ -335,7 +333,7 @@ def standard_normal(shape, dtype=None, name=None): def randn(shape, dtype=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a standard + Returns a Tensor filled with random values sampled from a standard normal distribution with mean 0 and standard deviation 1, with ``shape`` and ``dtype``. @@ -907,7 +905,7 @@ def randint_like(x, low=0, high=None, dtype=None, name=None): def randperm(n, dtype="int64", name=None): """ - This OP returns a 1-D Tensor filled with random permutation values from 0 + Returns a 1-D Tensor filled with random permutation values from 0 to n-1, with ``dtype``. Args: diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index d86a6a3f627b3..02a71a80b9e86 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -41,7 +41,7 @@ def argsort(x, axis=-1, descending=False, name=None): int32, int64, uint8. axis(int, optional): Axis to compute indices along. The effective range is [-R, R), where R is Rank(x). when axis<0, it works the same way - as axis+R. Default is 0. + as axis+R. Default is -1. descending(bool, optional) : Descending is a flag, if set to true, algorithm will sort by descending order, else sort by ascending order. Default is false. 
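A quick check of the corrected default axis for ``paddle.argsort`` (``axis=-1`` sorts along the last dimension; a sketch assuming Paddle 2.3+ dygraph):

    import paddle

    x = paddle.to_tensor([[3., 1., 2.],
                          [9., 7., 8.]])
    print(paddle.argsort(x))                     # default axis=-1: [[1, 2, 0], [1, 2, 0]]
    print(paddle.argsort(x, axis=0))             # [[0, 0, 0], [1, 1, 1]]
    print(paddle.argsort(x, descending=True))    # [[0, 2, 1], [0, 2, 1]]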
@@ -66,9 +66,10 @@ def argsort(x, axis=-1, descending=False, name=None): [4,7,7,9], [1,7,0,6]]], dtype='float32') - out1 = paddle.argsort(x=x, axis=-1) - out2 = paddle.argsort(x=x, axis=0) - out3 = paddle.argsort(x=x, axis=1) + out1 = paddle.argsort(x, axis=-1) + out2 = paddle.argsort(x, axis=0) + out3 = paddle.argsort(x, axis=1) + print(out1) #[[[0 3 1 2] # [0 1 2 3] @@ -76,6 +77,7 @@ def argsort(x, axis=-1, descending=False, name=None): # [[1 3 2 0] # [0 1 2 3] # [2 0 3 1]]] + print(out2) #[[[0 1 1 1] # [0 0 0 0] @@ -83,6 +85,7 @@ def argsort(x, axis=-1, descending=False, name=None): # [[1 0 0 0] # [1 1 1 1] # [0 0 0 1]]] + print(out3) #[[[1 1 1 2] # [0 0 2 0] @@ -119,7 +122,7 @@ def argsort(x, axis=-1, descending=False, name=None): def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): """ - This OP computes the indices of the max elements of the input tensor's + Computes the indices of the max elements of the input tensor's element along the provided axis. Args: @@ -130,23 +133,21 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. keepdim(bool, optional): Whether to keep the given axis in output. If it is True, the dimensions will be same as input x and with size one in the axis. Otherwise the output dimentions is one fewer than x since the axis is squeezed. Default is False. dtype(str|np.dtype, optional): Data type of the output tensor which can - be int32, int64. The default value is 'int64', and it will + be int32, int64. The default value is ``int64`` , and it will return the int64 indices. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64` + Tensor, return the tensor of int32 if set :attr:`dtype` is int32, otherwise return the tensor of int64. Examples: .. code-block:: python import paddle - x = paddle.to_tensor([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) + x = paddle.to_tensor([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) out1 = paddle.argmax(x) print(out1) # 2 out2 = paddle.argmax(x, axis=0) @@ -773,7 +774,7 @@ def index_sample(x, index): def masked_select(x, mask, name=None): """ - This OP Returns a new 1-D tensor which indexes the input tensor according to the ``mask`` + Returns a new 1-D tensor which indexes the input tensor according to the ``mask`` which is a tensor with data type of bool. Args: diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 991b86fd47d16..52ccc60100996 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -118,30 +118,18 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None): Args: x (Tensor): The input Tensor with data type float32, float64. - axis (int|list|tuple, optional): The axis along which to perform - variance calculations. ``axis`` should be int, list(int) or - tuple(int). If ``axis`` is a list/tuple of dimension(s), variance - is calculated along all element(s) of ``axis`` . ``axis`` or - element(s) of ``axis`` should be in range [-D, D), where D is the - dimensions of ``x`` . If ``axis`` or element(s) of ``axis`` is less - than 0, it works the same way as :math:`axis + D` . 
If ``axis`` is - None, variance is calculated over all elements of ``x``. Default - is None. - unbiased (bool, optional): Whether to use the unbiased estimation. If - ``unbiased`` is True, the divisor used in the computation is - :math:`N - 1`, where :math:`N` represents the number of elements - along ``axis`` , otherwise the divisor is :math:`N`. Default is True. - keepdim (bool, optional): Whether to reserve the reduced dimension(s) - in the output Tensor. If ``keepdim`` is True, the dimensions of - the output Tensor is the same as ``x`` except in the reduced - dimensions(it is of size 1 in this case). Otherwise, the shape of - the output Tensor is squeezed in ``axis`` . Default is False. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + axis (int|list|tuple, optional): The axis along which to perform variance calculations. ``axis`` should be int, list(int) or tuple(int). + + - If ``axis`` is a list/tuple of dimension(s), variance is calculated along all element(s) of ``axis`` . ``axis`` or element(s) of ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + - If ``axis`` or element(s) of ``axis`` is less than 0, it works the same way as :math:`axis + D` . + - If ``axis`` is None, variance is calculated over all elements of ``x``. Default is None. + + unbiased (bool, optional): Whether to use the unbiased estimation. If ``unbiased`` is True, the divisor used in the computation is :math:`N - 1`, where :math:`N` represents the number of elements along ``axis`` , otherwise the divisor is :math:`N`. Default is True. + keep_dim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the input unless keep_dim is true. Default is False. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, results of variance along ``axis`` of ``x``, with the same data - type as ``x``. + Tensor, results of variance along ``axis`` of ``x``, with the same data type as ``x``. Examples: .. code-block:: python @@ -223,7 +211,7 @@ def std(x, axis=None, unbiased=True, keepdim=False, name=None): def numel(x, name=None): """ Returns the number of elements for a tensor, which is a int64 Tensor with shape [1] in static mode - or a scalar value in imperative mode + or a scalar value in imperative mode. Args: x (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64. @@ -280,21 +268,26 @@ def median(x, axis=None, keepdim=False, name=None): import paddle x = paddle.arange(12).reshape([3, 4]) - # x is [[0 , 1 , 2 , 3 ], - # [4 , 5 , 6 , 7 ], - # [8 , 9 , 10, 11]] + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0 , 1 , 2 , 3 ], + # [4 , 5 , 6 , 7 ], + # [8 , 9 , 10, 11]]) y1 = paddle.median(x) - # y1 is [5.5] + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [5.50000000]) y2 = paddle.median(x, axis=0) - # y2 is [4., 5., 6., 7.] 
+ # Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [4., 5., 6., 7.]) y3 = paddle.median(x, axis=1) - # y3 is [1.5, 5.5, 9.5] + # Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [1.50000000, 5.50000000, 9.50000000]) y4 = paddle.median(x, axis=0, keepdim=True) - # y4 is [[4., 5., 6., 7.]] + # Tensor(shape=[1, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[4., 5., 6., 7.]]) """ if not isinstance(x, Variable): diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 71c97d4cac986..7935b4f275580 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -37,12 +37,11 @@ def set_printoptions(precision=None, sci_mode=None, linewidth=None): """Set the printing options for Tensor. - NOTE: The function is similar with numpy.set_printoptions() Args: precision (int, optional): Number of digits of the floating number, default 8. threshold (int, optional): Total number of elements printed, default 1000. - edgeitems (int, optional): Number of elements in summary at the begining and ending of each dimension, default 3. + edgeitems (int, optional): Number of elements in summary at the beginning and ending of each dimension, default 3. sci_mode (bool, optional): Format the floating number with scientific notation or not, default False. linewidth (int, optional): Number of characters each line, default 80. diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py index eab34a6dafbc3..de0518e229b0a 100644 --- a/python/paddle/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py @@ -58,7 +58,7 @@ def compute_accuracy(pred, gt): @unittest.skipIf(not fluid.is_compiled_with_cuda(), 'CPU testing is not supported') class TestDistTraning(unittest.TestCase): - def test_static_multiple_gpus(self): + def test_dynamic_multiple_gpus(self): device = set_device('gpu') im_shape = (-1, 1, 28, 28) diff --git a/python/paddle/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py index 16788e4656192..006800d3caeee 100644 --- a/python/paddle/tests/test_dist_hapi_model.py +++ b/python/paddle/tests/test_dist_hapi_model.py @@ -52,6 +52,7 @@ def get_gpus(selected_gpus): def start_local_trainers(cluster, pod, training_script, + eager_mode, training_script_args, log_dir=None): current_env = copy.copy(os.environ.copy()) @@ -72,6 +73,9 @@ def start_local_trainers(cluster, "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } + if not eager_mode: + proc_env["FLAGS_enable_eager_mode"] = "%d" % 0 + current_env.update(proc_env) print("trainer proc env:{}".format(current_env)) @@ -99,7 +103,7 @@ def start_local_trainers(cluster, class TestMultipleGpus(unittest.TestCase): - def run_mnist_2gpu(self, target_file_name): + def run_mnist_2gpu(self, target_file_name, eager_mode=True): if fluid.core.get_cuda_device_count() == 0: return @@ -112,6 +116,7 @@ def run_mnist_2gpu(self, target_file_name): procs = start_local_trainers( cluster, pod, + eager_mode=eager_mode, training_script=target_file_name, training_script_args=[]) @@ -125,13 +130,17 @@ def run_mnist_2gpu(self, target_file_name): def test_hapi_multiple_gpus_static(self): self.run_mnist_2gpu('dist_hapi_mnist_static.py') + self.run_mnist_2gpu('dist_hapi_mnist_static.py', eager_mode=False) def test_hapi_multiple_gpus_dynamic(self): self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py') + self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py', 
eager_mode=False) def test_hapi_amp_static(self): self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py') + self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index fd3cb83d24e8a..41de8ae189f85 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -90,7 +90,26 @@ def forward(self, x): return y, 3 -class LeNetListInput(LeNetDygraph): +class LeNetListInput(paddle.nn.Layer): + def __init__(self, num_classes=10): + super(LeNetListInput, self).__init__() + self.num_classes = num_classes + self.cov = Conv2D(1, 6, 3, stride=1, padding=1) + for param in self.cov.parameters(): + param.trainable = False + self.features = Sequential( + self.cov, + ReLU(), + paddle.fluid.dygraph.Pool2D(2, 'max', 2), + Conv2D( + 6, 16, 5, stride=1, padding=0), + ReLU(), + paddle.fluid.dygraph.Pool2D(2, 'max', 2)) + + if num_classes > 0: + self.fc = Sequential( + Linear(400, 120), Linear(120, 84), Linear(84, 10)) + def forward(self, inputs): x = inputs[0] x = self.features(x) diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 242680bc7c738..e07ac47a0f818 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -123,6 +123,44 @@ def test_color_jitter(self): ]) self.do_transform(trans) + def test_affine(self): + trans = transforms.Compose([ + transforms.RandomAffine(90), + transforms.RandomAffine( + [-10, 10], translate=[0.1, 0.3]), + transforms.RandomAffine( + 45, translate=[0.2, 0.2], scale=[0.2, 0.5]), + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[-10, 10]), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40]), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + interpolation='bilinear'), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + interpolation='bilinear', + fill=114), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + interpolation='bilinear', + fill=114, + center=(60, 80)), + ]) + self.do_transform(trans) + def test_rotate(self): trans = transforms.Compose([ transforms.RandomRotation(90), @@ -134,6 +172,14 @@ def test_rotate(self): ]) self.do_transform(trans) + def test_perspective(self): + trans = transforms.Compose([ + transforms.RandomPerspective(prob=1.0), + transforms.RandomPerspective( + prob=1.0, distortion_scale=0.9), + ]) + self.do_transform(trans) + def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) @@ -278,6 +324,35 @@ def test_exception(self): tensor_img = paddle.rand((3, 100, 100)) F.pad(tensor_img, [1.0, 2.0, 3.0]) + with self.assertRaises(ValueError): + transforms.RandomAffine(-10) + + with self.assertRaises(ValueError): + transforms.RandomAffine([-30, 60], translate=[2, 2]) + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 0, 20, 40]) + + with self.assertRaises(ValueError): + 
transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + fill=114, + center=(1, 2, 3)) + with self.assertRaises(ValueError): transforms.RandomRotation(-2) @@ -383,6 +458,20 @@ def test_color_jitter(self): trans = transforms.Compose([transforms.ColorJitter(1.1, 2.2, 0.8, 0.1)]) self.do_transform(trans) + color_jitter_trans = transforms.ColorJitter(1.2, 0.2, 0.5, 0.2) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = color_jitter_trans(batch_input) + + def test_perspective(self): + trans = transforms.RandomPerspective(prob=1.0, distortion_scale=0.7) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = trans(batch_input) + + def test_affine(self): + trans = transforms.RandomAffine(15, translate=[0.1, 0.1]) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = trans(batch_input) + def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) @@ -433,6 +522,10 @@ def test_erase(self): ]) self.do_transform(trans) + erase_trans = transforms.RandomErasing(value=(0.5, 0.2, 0.01)) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = erase_trans(batch_input) + def test_exception(self): trans = transforms.Compose([transforms.Resize(-1)]) @@ -479,6 +572,29 @@ def test_exception(self): tensor_img = paddle.rand((3, 100, 100)) F.pad(tensor_img, [1.0, 2.0, 3.0]) + with self.assertRaises(ValueError): + transforms.RandomAffine(-10) + + with self.assertRaises(ValueError): + transforms.RandomAffine([-30, 60], translate=[2, 2]) + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[-2, -1]), + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 0, 20, 40]) + with self.assertRaises(ValueError): transforms.RandomRotation(-2) @@ -547,6 +663,36 @@ def test_errors(self): with self.assertRaises(TypeError): F.adjust_saturation(1, 0.1) + with self.assertRaises(TypeError): + F.affine('45') + + with self.assertRaises(TypeError): + F.affine(45, translate=0.3) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2, 0.3]) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=-0.5) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=0.5, shear=10) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 0, 10]) + + with self.assertRaises(TypeError): + F.affine( + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10], + interpolation=2) + + with self.assertRaises(TypeError): + F.affine( + 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10], center=0) + with self.assertRaises(TypeError): F.rotate(1, 0.1) @@ -785,6 +931,31 @@ def test_image_load(self): os.remove(path) + def test_affine(self): + np_img = (np.random.rand(32, 26, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img).convert('RGB') + tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255 + + np.testing.assert_almost_equal( + np_img, tensor_img.transpose((1, 2, 0)), decimal=4) + + np_affined_img = F.affine( + np_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + pil_affined_img = F.affine( + 
pil_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + tensor_affined_img = F.affine( + tensor_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + + np.testing.assert_equal(np_affined_img.shape, + np.array(pil_affined_img).shape) + np.testing.assert_equal(np_affined_img.shape, + tensor_affined_img.transpose((1, 2, 0)).shape) + + np.testing.assert_almost_equal( + np.array(pil_affined_img), + tensor_affined_img.numpy().transpose((1, 2, 0)), + decimal=4) + def test_rotate(self): np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8') pil_img = Image.fromarray(np_img).convert('RGB') @@ -819,6 +990,144 @@ def test_rotate1(self): np.testing.assert_equal(rotated_np_img.shape, np.array(rotated_pil_img).shape) + def test_perspective(self): + np_img = (np.random.rand(32, 26, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img).convert('RGB') + tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255 + + np.testing.assert_almost_equal( + np_img, tensor_img.transpose((1, 2, 0)), decimal=4) + + startpoints = [[0, 0], [13, 0], [13, 15], [0, 15]] + endpoints = [[3, 2], [12, 3], [10, 14], [2, 15]] + + np_perspectived_img = F.perspective(np_img, startpoints, endpoints) + pil_perspectived_img = F.perspective(pil_img, startpoints, endpoints) + tensor_perspectived_img = F.perspective(tensor_img, startpoints, + endpoints) + + np.testing.assert_equal(np_perspectived_img.shape, + np.array(pil_perspectived_img).shape) + np.testing.assert_equal(np_perspectived_img.shape, + tensor_perspectived_img.transpose( + (1, 2, 0)).shape) + + result_pil = np.array(pil_perspectived_img) + result_tensor = tensor_perspectived_img.numpy().transpose( + (1, 2, 0)).astype('uint8') + num_diff_pixels = (result_pil != result_tensor).sum() / 3.0 + ratio_diff_pixels = num_diff_pixels / result_tensor.shape[ + 0] / result_tensor.shape[1] + # Tolerance : less than 6% of different pixels + assert ratio_diff_pixels < 0.06 + + def test_batch_input(self): + paddle.seed(777) + batch_tensor = paddle.rand((2, 3, 8, 8), dtype=paddle.float32) + + def test_erase(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.erase(input1, 1, 1, 2, 2, 0.5), + F.erase(input2, 1, 1, 2, 2, 0.5) + ]) + + batch_result = F.erase(batch_tensor, 1, 1, 2, 2, 0.5) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_erase(batch_tensor)) + + def test_affine(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.affine( + input1, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]), F.affine( + input2, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]) + ]) + batch_result = F.affine( + batch_tensor, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_affine(batch_tensor)) + + def test_perspective(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + startpoints = [[0, 0], [3, 0], [4, 5], [6, 7]] + endpoints = [[0, 1], [3, 1], [4, 4], [5, 7]] + target_result = paddle.stack([ + F.perspective(input1, startpoints, endpoints), + F.perspective(input2, startpoints, endpoints) + ]) + + batch_result = F.perspective(batch_tensor, startpoints, endpoints) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_perspective(batch_tensor)) + + def test_adjust_brightness(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + 
F.adjust_brightness(input1, 2.1), + F.adjust_brightness(input2, 2.1) + ]) + + batch_result = F.adjust_brightness(batch_tensor, 2.1) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_brightness(batch_tensor)) + + def test_adjust_contrast(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.adjust_contrast(input1, 0.3), F.adjust_contrast(input2, 0.3) + ]) + + batch_result = F.adjust_contrast(batch_tensor, 0.3) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_contrast(batch_tensor)) + + def test_adjust_saturation(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.adjust_saturation(input1, 1.1), + F.adjust_saturation(input2, 1.1) + ]) + + batch_result = F.adjust_saturation(batch_tensor, 1.1) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_saturation(batch_tensor)) + + def test_adjust_hue(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack( + [F.adjust_hue(input1, -0.2), F.adjust_hue(input2, -0.2)]) + + batch_result = F.adjust_hue(batch_tensor, -0.2) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_hue(batch_tensor)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 103f8e942967e..d9e9c0ac0d0ad 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -189,6 +189,18 @@ func : assign backward : assign_grad +- api : assign_out_ + args : (Tensor x, Tensor output) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : assign + param : [x] + inplace : (output -> out) + backward : assign_out__grad + # atan - api : atan args : (Tensor x) @@ -319,6 +331,16 @@ func : ceil backward : ceil_grad +- api : celu + args : (Tensor x, float alpha) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : celu + backward : celu_grad + # cholesky - api : cholesky args : (Tensor x, bool upper) @@ -563,7 +585,7 @@ - api : einsum args : (Tensor[] x, str equation) - output : Tensor + output : Tensor, Tensor[]{x.size()} infer_meta : func : EinsumInferMeta param : [x, equation] @@ -714,6 +736,7 @@ backend : x inplace : (x -> out) view : (x -> out) + intermediate : xshape backward : flatten_grad # flip @@ -811,7 +834,7 @@ skip_transform : x - api : gather - args : (Tensor x, Tensor index, Scalar axis=0) + args : (Tensor x, Tensor index, Scalar(int) axis=0) output : Tensor(out) infer_meta : func : GatherInferMeta @@ -1542,7 +1565,7 @@ func : PadInferMeta kernel : func : pad - # backward : pad_grad + backward : pad_grad - api : pad3d args : (Tensor x, IntArray paddings, str mode, float pad_value, str data_format) @@ -1992,12 +2015,13 @@ - api : squeeze args : (Tensor x, int[] axes) - output : Tensor(xshape), Tensor(out) + output : Tensor(out), Tensor(xshape) infer_meta : func : SqueezeInferMeta kernel : func : squeeze view: (x -> out) + intermediate : xshape backward : squeeze_grad - api : stack @@ -2028,7 +2052,7 @@ backward : subtract_grad - api : sum - args : (Tensor x, int64_t[] dims={}, DataType out_dtype=paddle::experimental::DataType::UNDEFINED, bool keep_dim=false) + args : (Tensor x, int64_t[] dims={}, DataType out_dtype=DataType::UNDEFINED, bool keep_dim=false) output : Tensor(out) 
infer_meta : func : SumInferMeta @@ -2147,6 +2171,18 @@ func : triangular_solve backward : triangular_solve_grad +- api : tril_indices + args : (int rows, int cols, int offset, DataType dtype, Place place={}) + output : Tensor(out) + infer_meta : + func : TrilIndicesInferMeta + param : [rows, cols, offset, dtype] + kernel : + func : tril_indices + param : [rows, cols, offset, dtype] + data_type : dtype + backend : place + - api : tril_triu args : (Tensor x, int diagonal, bool lower) output : Tensor(out) @@ -2221,12 +2257,13 @@ - api : unsqueeze args : (Tensor x, IntArray axis) - output : Tensor(xshape), Tensor(out) + output : Tensor(out), Tensor(xshape) infer_meta : func : UnsqueezeInferMeta kernel : func : unsqueeze view: (x -> out) + intermediate : xshape backward : unsqueeze_grad # viterbi_decode diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 717870ee01d0a..146925ccef6d5 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -32,11 +32,7 @@ def __init__(self, api_item_yaml): # names : [], list of output names # types : [], list of output types # out_size_expr : [], expression for getting size of vector - # return_type : Tensor, vector, ..., the return type of api - # args_str: - # args_declare : "str" // str of function params with default value. Example: (..., bool flag=false) - # args_define : "str" // str of function params without default value. Example: (..., bool flag) - self.inputs, self.attrs, self.outputs, self.args_str, self.optional_vars = self.parse_args( + self.inputs, self.attrs, self.outputs, self.optional_vars = self.parse_args( self.api, api_item_yaml) self.is_base_api = True @@ -49,7 +45,8 @@ def __init__(self, api_item_yaml): 'infer_meta']) self.kernel = self.parse_kernel(api_item_yaml['kernel']) self.support_selected_rows_kernel = False if len(self.kernel[ - 'func']) == 1 else True + 'func']) == 1 or not self.kernel['func'][1].endswith( + '_sr') else True self.data_transform = self.parse_data_transform(api_item_yaml) self.inplace_map, self.view_map = self.parse_inplace_and_view( api_item_yaml) @@ -60,22 +57,54 @@ def get_api_name(self, api_item_yaml): def get_api_func_name(self): return self.api + def get_input_tensor_args(self, inplace_flag=False): + input_args = [] + inplace_type_map = { + "const Tensor&": "Tensor&", + "const std::vector&": "std::vector&" + } + for name in self.inputs['names']: + name = name.split('@')[0] + if inplace_flag and name in self.inplace_map.values(): + input_args.append(inplace_type_map[self.inputs['input_info'][ + name]] + ' ' + name) + else: + input_args.append(self.inputs['input_info'][name] + ' ' + name) + return input_args + + def get_declare_args(self, inplace_flag=False): + declare_args = self.get_input_tensor_args(inplace_flag) + for name in self.attrs['names']: + default_value = '' + if self.attrs['attr_info'][name][1] is not None: + default_value = ' = ' + self.attrs['attr_info'][name][1] + declare_args.append(self.attrs['attr_info'][name][0] + ' ' + name + + default_value) + + return ", ".join(declare_args) + + def get_define_args(self, inplace_flag=False): + define_args = self.get_input_tensor_args(inplace_flag) + for name in self.attrs['names']: + define_args.append(self.attrs['attr_info'][name][0] + ' ' + name) + + return ", ".join(define_args) + def parse_args(self, api_name, api_item_yaml): optional_vars = [] if 'optional' in api_item_yaml: optional_vars = [ item.strip() for item in api_item_yaml['optional'].split(',') 
] - inputs, attrs, args_str = self.parse_input_and_attr( + inputs, attrs = self.parse_input_and_attr( api_name, api_item_yaml['args'], optional_vars) - output_type_list, output_names, out_size_expr, return_type = self.parse_output( + output_type_list, output_names, out_size_expr = self.parse_output( api_name, api_item_yaml['output']) return inputs, attrs, { 'names': output_names, 'types': output_type_list, - 'out_size_expr': out_size_expr, - 'return_type': return_type - }, args_str, optional_vars + 'out_size_expr': out_size_expr + }, optional_vars def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): inputs = {'names': [], 'input_info': {}} @@ -125,9 +154,6 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'DataType': 'paddle::optional' } - args_declare_str = "" - args_define_str = "" - for item in args_list: item = item.strip() type_and_name = item.split(' ') @@ -146,8 +172,6 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): inputs['names'].append(input_name) inputs['input_info'][input_name] = in_type - args_declare_str = args_declare_str + in_type + ' ' + input_name + ', ' - args_define_str = args_define_str + in_type + ' ' + input_name + ', ' has_input = True break if has_input: @@ -169,16 +193,11 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): attr_type = optional_types_trans[attr_type_symbol] default_value_str = "" if default_value is None else '=' + default_value - args_declare_str = args_declare_str + attr_type + ' ' + attr_name + default_value_str + ', ' - args_define_str = args_define_str + attr_type + ' ' + attr_name + ', ' attrs['names'].append(attr_name) attrs['attr_info'][attr_name] = (attr_type, default_value) break - return inputs, attrs, { - 'args_declare': args_declare_str[:-2], - 'args_define': args_define_str[:-2] - } + return inputs, attrs def parse_output(self, api_name, output_config): def parse_output_item(output_item): @@ -205,18 +224,18 @@ def parse_output_item(output_item): if len(temp_list) == 1: out_type, out_name, size_expr = parse_output_item(temp_list[0]) - return [out_type], [out_name], size_expr, self.get_return_type( - [out_type]) + return [out_type], [out_name], [size_expr] else: out_type_list = [] out_name_list = [] + out_size_expr_list = [] for output_item in temp_list: out_type, out_name, size_expr = parse_output_item(output_item) out_type_list.append(out_type) out_name_list.append(out_name) + out_size_expr_list.append(size_expr) - return out_type_list, out_name_list, size_expr, self.get_return_type( - out_type_list) + return out_type_list, out_name_list, out_size_expr_list def parse_infer_meta(self, infer_meta_config): infer_meta = infer_meta_config @@ -232,13 +251,15 @@ def parse_kernel(self, kernel_config): # backend : str, the names of param to choose the kernel backend, default is None # layout : str, the names of param to choose the kernel layout, default is None # data_type : str, the names of param to choose the kernel data_type, default is None + # dispatch : {}, the key is kernel_func, the value is type of inputs and outputs for kernel (example: {kernel_name : (['dense','sparse_coo']#input,['sparse_coo']#output)}) kernel = { 'func': [], 'param': None, 'backend': None, 'layout': None, 'data_type': None, - 'use_gpudnn': 'false' + 'use_gpudnn': 'false', + 'dispatch': {} } if 'backend' in kernel_config and len(kernel_config['backend']) > 0: kernel['backend'] = kernel_config['backend'] @@ -252,17 +273,21 @@ def parse_kernel(self, kernel_config): 
kernel['use_gpudnn'] = kernel_config['use_gpudnn'] if isinstance(kernel['use_gpudnn'], bool): kernel['use_gpudnn'] = str(kernel['use_gpudnn']).lower() - kernel['func'] = [ - kernel_fn.strip() for kernel_fn in kernel_config['func'].split(',') - ] - - if len(kernel['func']) == 2: - assert kernel['func'][0] == self.api, \ - f"{self.api} : Kernel func error: If kernel has two func config, the name of first func should be same with api name({self.api}), \ - but now is {kernel['func'][0]}." - assert kernel['func'][1].endswith('_sr'), \ - f"{self.api} : Kernel func error: If kernel has two func config, the name of second func should be a selected_rows kernel (the func name endwith '_sr'), \ - but now is {kernel['func'][1]}." + kernel_funcs = re.compile(r'([a-zA-Z0-9_]+)\s*({[^}]+})?').findall( + kernel_config['func']) + + def parse_kernel_in_out_type(in_out_str): + if len(in_out_str) == 0: + return None + tmp_in_out_list = in_out_str[1:-1].split('->') + inputs = [item.strip() for item in tmp_in_out_list[0].split(',')] + outputs = [item.strip() for item in tmp_in_out_list[1].split(',')] + return (inputs, outputs) + + for func_item in kernel_funcs: + kernel['func'].append(func_item[0]) + kernel['dispatch'][func_item[0]] = parse_kernel_in_out_type( + func_item[1]) return kernel @@ -279,7 +304,7 @@ def parse_data_transform(self, api_item_yaml): return data_transform def parse_inplace_and_view(self, api_item_yaml): - inplace_map, view_map = None, None + inplace_map, view_map = {}, {} for mode in ['inplace', 'view']: if mode in api_item_yaml: if mode == 'inplace': @@ -288,7 +313,7 @@ def parse_inplace_and_view(self, api_item_yaml): view_map = {} in_out_mapping_list = api_item_yaml[mode].split(',') for item in in_out_mapping_list: - result = re.search(r"(?P\w+)\s*->\s(?P\w+)", item) + result = re.search(r"(?P\w+)\s*->\s*(?P\w+)", item) in_val = result.group('in') out_val = result.group('out') assert in_val in self.inputs['names'], \ @@ -304,17 +329,22 @@ def parse_inplace_and_view(self, api_item_yaml): return inplace_map, view_map # Override by child class - def get_return_type(self, out_type_list): + def get_return_type(self, inplace_flag=False): return None def gene_api_declaration(self): - api_declaration = f""" -PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name()}({self.args_str['args_declare']}); + api_declaration = "" + api_func_name = self.get_api_func_name() + if api_func_name[-1] != '_': + api_declaration = f""" +PADDLE_API {self.get_return_type()} {api_func_name}({self.get_declare_args()}); """ - if self.is_base_api and self.inplace_map is not None: + if self.is_base_api and len(self.inplace_map) > 0: + if api_func_name[-1] != '_': + api_func_name += '_' api_declaration = api_declaration + f""" -PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self.args_str['args_declare']}); +PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_declare_args(inplace_flag=True)}); """ return api_declaration @@ -413,7 +443,7 @@ def gene_kernel_select(self) -> str: vars_list = kernel['data_type'].split(',') assert len( vars_list - ) == 1, f"{api} api: The number of params to set data_type only allows 2, but received {len(vars_list)}." + ) == 1, f"{api} api: The number of params to set data_type only allows 1, but received {len(vars_list)}." 
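# Illustrative sketch, not part of the patch: the new kernel-func syntax parsed
# above allows an optional "{inputs -> outputs}" dispatch annotation after each
# kernel name. A standalone version of that parsing for reference; the sample
# func string is made up for illustration.
import re

KERNEL_FUNC_PATTERN = re.compile(r'([a-zA-Z0-9_]+)\s*({[^}]+})?')

def parse_kernel_funcs(func_str):
    funcs, dispatch = [], {}
    for name, in_out in KERNEL_FUNC_PATTERN.findall(func_str):
        funcs.append(name)
        if in_out:
            ins, outs = in_out[1:-1].split('->')
            dispatch[name] = ([s.strip() for s in ins.split(',')],
                              [s.strip() for s in outs.split(',')])
        else:
            dispatch[name] = None
    return funcs, dispatch

# parse_kernel_funcs("scale {dense -> dense}, scale_sr {selected_rows -> selected_rows}")
# == (['scale', 'scale_sr'],
#     {'scale': (['dense'], ['dense']),
#      'scale_sr': (['selected_rows'], ['selected_rows'])})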
kernel_select_code = kernel_select_code + f""" kernel_data_type = ParseDataType({vars_list[0].strip()}); """ @@ -513,7 +543,7 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: {code_indent} auto {out_name}_{PREFIX_META_TENSOR_NAME}vec = MakeMetaTensor({out_name}); {code_indent} std::vector {out_name}_metas({out_name}_{PREFIX_META_TENSOR_NAME}vec.size()); {code_indent} for (size_t i = 0; i < {out_name}_{PREFIX_META_TENSOR_NAME}vec.size(); ++i) {{ -{code_indent} {out_name}_metas[i] = &{out_name}_{PREFIX_META_TENSOR_NAME}vec[i]; +{code_indent} {out_name}_metas[i] = {out_name}[i] ? &{out_name}_{PREFIX_META_TENSOR_NAME}vec[i] : nullptr; {code_indent} }}""" param_code = param_code + out_name + '_metas, ' @@ -521,8 +551,10 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + out_name.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + out_name + ");\n" - param_code = param_code + "&" + out_name.replace( - 'kernel_', PREFIX_META_TENSOR_NAME) + ", " + if len(kernel_output_names) == 1: + param_code = param_code + f"&{out_name.replace('kernel_', PREFIX_META_TENSOR_NAME)}, " + else: + param_code = param_code + f"{out_name} ? &{out_name.replace('kernel_', PREFIX_META_TENSOR_NAME)} : nullptr, " param_code = param_code[:-2] return f"""{meta_tensor_code} @@ -706,13 +738,9 @@ def get_selected_rows_kernel_args(self, code_indent): return input_tensor_code, kernel_args[:-2], kernel_signature - # Override by child class - def gene_return_type_code(self): - return self.outputs['return_type'] - # Override by child class def gene_return_code(self): - return "api_output" + return "return api_output;" # Override by child class def gene_output(self, @@ -748,7 +776,7 @@ def gen_dense_tensor_kernel_code(self, code_indent, inplace_flag=False): {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): input_tensors, kernel_args, kernel_signature = self.get_selected_rows_kernel_args( @@ -775,12 +803,14 @@ def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gene_base_api_code(self, inplace_flag=False): - api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') + api_func_name = self.get_api_func_name() + if inplace_flag and api_func_name[-1] != '_': + api_func_name += '_' api_code = f""" -PADDLE_API {self.gene_return_type_code()} {api_func_name}({self.args_str["args_define"]}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ {self.gene_kernel_select()} """ @@ -802,18 +832,26 @@ def gene_base_api_code(self, inplace_flag=False): } """ + def gene_invoke_code(self, invoke_code, params_code): + return f""" +PADDLE_API {self.get_return_type()} {self.api}({params_code}) {{ + return {invoke_code}; +}}""" + def gene_api_code(self): if self.is_base_api: api_code = self.gene_base_api_code() - if self.inplace_map is not None: + if len(self.inplace_map) > 0: + if self.api[-1] == '_': + api_code = "" api_code = api_code + self.gene_base_api_code(inplace_flag=True) return api_code else: - inveke_func_name = self.invoke.split('(')[0].strip() - if inveke_func_name in 
self.attrs['names']: + invoke_func_name = self.invoke.split('(')[0].strip() + if invoke_func_name in self.attrs['names']: # Adjust the param whose name is same with api invoked. - pattern = r'\W' + inveke_func_name + '[^A-Za-z0-9_(]' + pattern = r'\W' + invoke_func_name + '[^A-Za-z0-9_(]' def adjust_name(matched): matched_str = matched.group() @@ -821,12 +859,8 @@ def adjust_name(matched): invoke_code = re.sub(pattern, adjust_name, self.invoke) params_code = re.sub(pattern, adjust_name, - self.args_str["args_define"]) + self.get_define_args()) else: invoke_code = self.invoke - params_code = self.args_str["args_define"] - return f""" -{self.outputs['return_type']} {self.api}({params_code}) {{ - return {invoke_code}; -}} -""" + params_code = self.get_define_args() + return self.gene_invoke_code(invoke_code, params_code) diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 538958c2361bc..c0923adf39c46 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -19,6 +19,11 @@ from api_base import BaseAPI, PREFIX_TENSOR_NAME +inplace_out_type_map = { + "Tensor": "Tensor&", + "std::vector": "std::vector&" +} + class ForwardAPI(BaseAPI): def __init__(self, api_item_yaml): @@ -42,38 +47,49 @@ def parse_intermediate(self, api_item_yaml): else: return False, [] - def get_return_type(self, out_type_list): - return out_type_list[0] if len( - out_type_list) == 1 else "std::tuple<" + ",".join( - out_type_list) + ">" + def get_return_type_with_intermediate(self, inplace_flag=False): + out_type_list = [] + for i, out_type in enumerate(self.outputs['types']): + out_name = self.outputs['names'][i].split('@')[0] + if inplace_flag and out_name in self.inplace_map: + out_type_list.append(inplace_out_type_map[out_type]) + else: + out_type_list.append(out_type) - def gene_return_type_code(self): - if self.is_dygraph_api or len(self.intermediate_outs) == 0: - return self.outputs['return_type'] + if len(out_type_list) == 1: + return out_type_list[0] else: - return_out_list = [] - for i, name in enumerate(self.outputs['names']): - if name not in self.intermediate_outs: - return_out_list.append(self.outputs['types'][i]) - return return_out_list[0] if len( - return_out_list) == 1 else "std::tuple<" + ",".join( - return_out_list) + ">" + return "std::tuple<" + ", ".join(out_type_list) + ">" + + def get_return_type(self, inplace_flag=False): + out_type_list = [] + for i, out_type in enumerate(self.outputs['types']): + out_name = self.outputs['names'][i].split('@')[0] + if inplace_flag and out_name in self.inplace_map: + out_type_list.append(inplace_out_type_map[out_type]) + elif self.is_dygraph_api or out_name not in self.intermediate_outs: + out_type_list.append(out_type) + + if len(out_type_list) == 1: + return out_type_list[0] + else: + return "std::tuple<" + ", ".join(out_type_list) + ">" def gene_return_code(self): if self.is_dygraph_api or len(self.intermediate_outs) == 0: - return "api_output" + return "return api_output;" else: return_out_list = [] for i, name in enumerate(self.outputs['names']): - if name not in self.intermediate_outs: + if name.split('@')[0] not in self.intermediate_outs: return_out_list.append(i) if len(return_out_list) == 1: - return f"std::get<{return_out_list[0]}>(api_output)" + return f"return std::get<{return_out_list[0]}>(api_output);" else: selected_code = [ f"std::get<{i}>(api_output)" for i in return_out_list ] - return '{' + ", ".join(selected_code) + '}' + return 'return {' + ", 
".join(selected_code) + '};' def gene_output(self, output_type_list, @@ -83,21 +99,22 @@ def gene_output(self, kernel_output = "" output_names = [] output_create = "" + return_type = self.get_return_type_with_intermediate(inplace_flag) if len(output_type_list) == 1: kernel_output = 'kernel_out' output_names.append('kernel_out') inplace_assign = " = " + self.inplace_map[self.outputs['names'][ - 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][0] in self.inplace_map else "" + 0]] if inplace_flag and self.outputs['names'][ + 0] in self.inplace_map else "" output_create = f""" -{code_indent} {self.outputs['return_type']} api_output{inplace_assign};""" +{code_indent} {return_type} api_output{inplace_assign};""" - if self.outputs['return_type'] == 'std::vector': - assert self.outputs['out_size_expr'] is not None, \ + if return_type == 'std::vector': + assert self.outputs['out_size_expr'][0] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr']}, kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr'][0]}, kernel_backend, &api_output);""" else: output_create = output_create + f""" @@ -112,15 +129,23 @@ def gene_output(self, elif len(output_type_list) > 1: output_create = f""" -{code_indent} {self.outputs['return_type']} api_output;""" +{code_indent} {return_type} api_output;""" + + if inplace_flag: + output_create = f""" +{code_indent} {return_type} api_output{{""" + + for out_name in self.outputs['names']: + if out_name in self.inplace_map: + output_create = output_create + self.inplace_map[ + out_name] + ', ' + else: + output_create += 'Tensor(), ' + output_create = output_create[:-2] + '};' for i in range(len(output_type_list)): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" -{code_indent} std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};""" if output_type_list[i] == 'std::vector': assert self.outputs['out_size_expr'][i] is not None, \ @@ -170,7 +195,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/multiary.h" @@ -197,9 +221,14 @@ def api_namespace(): def generate_api(api_yaml_path, header_file_path, source_file_path): + apis = [] + + for each_api_yaml in api_yaml_path: + with open(each_api_yaml, 'r') as f: + api_list = yaml.load(f, Loader=yaml.FullLoader) + if api_list: + apis.extend(api_list) - with open(api_yaml_path, 'r') as f: - apis = yaml.load(f, Loader=yaml.FullLoader) header_file = open(header_file_path, 'w') source_file = open(source_file_path, 'w') @@ -234,6 +263,7 @@ def main(): parser.add_argument( '--api_yaml_path', help='path to api yaml file', + nargs='+', default='python/paddle/utils/code_gen/api.yaml') parser.add_argument( diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index e62f309ff8f82..603bb1055aeb6 100644 --- 
a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,3 +1,15 @@ +- backward_api : abs_double_grad + forward : abs_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_x_grad) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : abs_double_grad + data_transform: + skip_transform : grad_x_grad + - backward_api : abs_grad forward : abs (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -9,6 +21,7 @@ func : abs_grad data_transform: skip_transform : out_grad + backward : abs_double_grad - backward_api : acos_grad forward : acos (Tensor x) -> Tensor(out) @@ -53,12 +66,13 @@ func : add_grad no_need_buffer : x, y backward : add_double_grad + inplace : (out_grad -> x_grad) - backward_api : add_n_grad forward : add_n (Tensor[] x) -> Tensor(out) args : (Tensor[] x, Tensor out_grad) output : Tensor[](x_grad){x.size()} - invoke : add_n_grad_impl(x, out_grad) + invoke : add_n_grad_impl(x, out_grad, x_grad) no_need_buffer : x - backward_api : add_triple_grad @@ -118,7 +132,15 @@ output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta - param : [out_grad] + kernel : + func : assign + +- backward_api : assign_out__grad + forward : assign_out_ (Tensor x, Tensor output) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta kernel : func : assign @@ -207,6 +229,7 @@ kernel : func : cast_grad data_type : out_grad + no_need_buffer : x - backward_api : ceil_grad forward : ceil(Tensor x) -> Tensor(out) @@ -218,6 +241,27 @@ kernel : func : ceil_grad +- backward_api : celu_double_grad + forward : celu_grad(Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : celu_double_grad + +- backward_api : celu_grad + forward : celu(Tensor x, float alpha) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float alpha) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : celu_grad + backward : celu_double_grad + - backward_api : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) args : (Tensor out, Tensor out_grad, bool upper) @@ -238,6 +282,16 @@ kernel : func : cholesky_solve_grad +- backward_api : clip_double_grad + forward : clip_grad (Tensor x, Tensor grad_out, Scalar min = 0., Scalar max = 0.) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_x_grad, Scalar min = 0., Scalar max = 0.) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : clip_grad + - backward_api : clip_grad forward : clip (Tensor x, Scalar min, Scalar max) -> Tensor(out) args : (Tensor x, Tensor out_grad, Scalar min = 0., Scalar max = 0.) 
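# Illustrative sketch, not part of the patch: the double-grad entries added
# above are plain YAML mappings that the generator scripts later in this diff
# load with PyYAML. Reusing the clip_double_grad entry as sample input:
import yaml

sample = """
- backward_api : clip_double_grad
  forward : clip_grad (Tensor x, Tensor grad_out, Scalar min = 0., Scalar max = 0.) -> Tensor(grad_x)
  args : (Tensor x, Tensor grad_x_grad, Scalar min = 0., Scalar max = 0.)
  output : Tensor(grad_out_grad)
  infer_meta :
    func : UnchangedInferMeta
    param : [x]
  kernel :
    func : clip_grad
"""
entry = yaml.safe_load(sample)[0]
assert entry['backward_api'] == 'clip_double_grad'
assert entry['kernel']['func'] == 'clip_grad'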
@@ -247,6 +301,17 @@ param : [x] kernel : func : clip_grad + backward : clip_double_grad + +- backward_api : concat_double_grad + forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) + args : (Tensor[] grad_x_grad, Scalar axis = 0) + output : Tensor(grad_out_grad) + infer_meta : + func : ConcatInferMeta + param : [grad_x_grad, axis] + kernel : + func : concat - backward_api : concat_grad forward : concat (Tensor[] x, Scalar axis) -> Tensor(out) @@ -258,6 +323,7 @@ kernel : func : concat_grad no_need_buffer : x + backward : concat_double_grad - backward_api : conj_grad forward : conj (Tensor x) -> Tensor(out) @@ -273,7 +339,7 @@ forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad) - invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) backward : conv2d_grad_grad - backward_api : conv2d_grad_grad @@ -288,6 +354,16 @@ use_gpudnn : true optional : grad_input_grad, grad_filter_grad +- backward_api : conv2d_transpose_double_grad + forward : conv2d_transpose_grad(Tensor x, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(grad_x), Tensor(grad_filter) + args : (Tensor x, Tensor filter, Tensor grad_out, Tensor grad_x_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) + output : Tensor(x_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : Conv2dTransposeDoubleGradInferMeta + kernel : + func : conv2d_transpose_grad_grad + use_gpudnn : true + - backward_api : conv2d_transpose_grad forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -297,6 +373,7 @@ kernel : func : conv2d_transpose_grad use_gpudnn : true + backward : conv2d_transpose_double_grad - backward_api : conv3d_transpose_grad forward : conv3d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) @@ -337,6 +414,7 @@ kernel : func : cross_entropy_with_softmax_grad data_type : softmax + inplace : (softmax -> input_grad) - backward_api : cross_grad forward : cross (Tensor x, Tensor y, int axis = 9) -> Tensor(out) @@ -460,7 +538,6 @@ param : [out_grad] kernel : func : dropout_grad - optional : seed_tensor - backward_api : 
eigh_grad forward : eigh (Tensor x, str uplo) -> Tensor(out_w), Tensor(out_v) @@ -476,8 +553,8 @@ skip_transform : out_w, out_w_grad - backward_api : einsum_grad - forward : einsum (Tensor[] x, str equation) -> Tensor(out) - args : (Tensor[] x, Tensor out_grad, str equation) + forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache) + args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation) output : Tensor[](x_grad){x.size()} infer_meta : func : UnchangedMultiInferMeta @@ -558,6 +635,15 @@ func : expand_as_grad no_need_buffer : x +- backward_api : expand_double_grad + forward : expand_grad (Tensor x, Tensor grad_out, IntArray shape) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray shape) + output : Tensor(grad_out_grad) + infer_meta : + func : ExpandInferMeta + kernel : + func : expand + - backward_api : expand_grad forward : expand (Tensor x, IntArray shape) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray shape) @@ -568,6 +654,7 @@ kernel : func : expand_grad no_need_buffer : x + backward : expand_double_grad - backward_api : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) @@ -591,7 +678,7 @@ data_type: out_grad backend: out_grad layout: out_grad - no_need_buffer : x + inplace : (out_grad -> x_grad) - backward_api : flip_grad forward : flip (Tensor x, int[] axis) -> Tensor(out) @@ -742,7 +829,7 @@ forward : imag (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) - invoke : imag_grad_impl(out_grad) + invoke : imag_grad_impl(out_grad, x_grad) - backward_api : index_sample_grad forward : index_sample (Tensor x, Tensor index) -> Tensor(out) @@ -809,7 +896,6 @@ param : [out_grad] kernel : func : label_smooth_grad - optional : prior_dist - backward_api : layer_norm_grad forward : layer_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis, bool is_test) -> Tensor(out), Tensor(mean), Tensor(variance) @@ -1263,6 +1349,15 @@ kernel : func : p_norm_grad +- backward_api : pad3d_double_grad + forward : pad3d_grad(Tensor x, Tensor grad_out, IntArray paddings, str mode, float pad_value, str data_format) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray paddings, str mode, float pad_value, str data_format) + output : Tensor(grad_out_grad) + infer_meta : + func : Pad3dInferMeta + kernel : + func : pad3d + - backward_api : pad3d_grad forward : pad3d(Tensor x, IntArray paddings, str mode, float pad_value, str data_format) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray paddings, str mode, float pad_value, str data_format) @@ -1273,6 +1368,29 @@ kernel : func : pad3d_grad no_need_buffer : x + backward : pad3d_double_grad + +- backward_api : pad_double_grad + forward : pad_grad(Tensor x, Tensor grad_out, int[] paddings, float pad_value) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] paddings, float pad_value) + output : Tensor(grad_out_grad) + infer_meta : + func : PadInferMeta + kernel : + func : pad + +- backward_api : pad_grad + forward : pad(Tensor x, int[] paddings, float pad_value) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int[] paddings, float pad_value) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : pad_grad + param: [out_grad, paddings, pad_value] + no_need_buffer : x + backward : pad_double_grad - backward_api : pixel_shuffle_grad forward : pixel_shuffle (Tensor x, int upscale_factor, str data_format) -> Tensor(out) @@ -1293,6 +1411,16 @@ kernel : func : poisson_grad +- backward_api : 
pool2d_double_grad + forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) + output : Tensor(grad_out_grad) + infer_meta : + func : PoolInferMeta + kernel : + func : pool2d_double_grad + use_gpudnn : true + - backward_api : pool2d_grad forward : pool2d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) @@ -1302,6 +1430,7 @@ kernel : func : pool2d_grad use_gpudnn : true + backward : pool2d_double_grad - backward_api : pool2d_grad_gpudnn_unused forward : pool2d_gpudnn_unused(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) @@ -1370,7 +1499,7 @@ forward : real (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) - invoke : real_grad_impl(out_grad) + invoke : real_grad_impl(out_grad, x_grad) - backward_api : reciprocal_grad forward : reciprocal (Tensor x) -> Tensor(out) @@ -1425,7 +1554,7 @@ no_need_buffer : grad_out - backward_api : reshape_grad - forward : reshape_with_xshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) + forward : reshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad) output : Tensor(x_grad) infer_meta : @@ -1438,6 +1567,7 @@ backend: out_grad layout: out_grad backward : reshape_double_grad + inplace : (out_grad -> x_grad) - backward_api : roi_align_grad forward : roi_align (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned) -> Tensor(out) @@ -1486,6 +1616,16 @@ kernel : func : round_grad +- backward_api : rsqrt_double_grad + forward : rsqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : rsqrt_double_grad + - backward_api : rsqrt_grad forward : rsqrt (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1495,6 +1635,7 @@ param : [out] kernel : func : rsqrt_grad + backward : rsqrt_double_grad - backward_api : scale_double_grad forward : scale_grad (Tensor grad_out, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_x) @@ -1509,6 +1650,7 @@ output : Tensor(x_grad) invoke : scale(out_grad, scale, 0.0, bias_after_scale) backward : scale_double_grad + inplace : (out_grad -> x_grad) - backward_api : scale_triple_grad forward : scale_double_grad (Tensor grad_grad_x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_grad_out) @@ -1601,6 +1743,7 @@ param : [out, fwd_grad_out, grad_grad_x] kernel : func : sigmoid_triple_grad + optional : grad_grad_out_grad - 
backward_api : silu_grad forward : silu (Tensor x) -> Tensor(out) @@ -1671,6 +1814,16 @@ invoke : concat( out_grad, axis) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. +- backward_api : sqrt_double_grad + forward : sqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : sqrt_double_grad + - backward_api : sqrt_grad forward : sqrt (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1680,6 +1833,17 @@ param : [out] kernel : func : sqrt_grad + backward : sqrt_double_grad + +- backward_api : square_double_grad + forward : square_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : square_double_grad - backward_api : square_grad forward : square (Tensor x) -> Tensor(out) @@ -1690,9 +1854,16 @@ param : [x] kernel : func : square_grad + backward : square_double_grad + +- backward_api : squeeze_double_grad + forward : squeeze_grad(Tensor xshape, Tensor grad_out, int[] axes) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] axes) + output : Tensor(grad_out_grad) + invoke: squeeze(grad_x_grad, axes) - backward_api : squeeze_grad - forward : squeeze(Tensor x, int[] axes) -> Tensor(xshape), Tensor(out) + forward : squeeze(Tensor x, int[] axes) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad, int[] axes) output : Tensor(x_grad) infer_meta : @@ -1700,6 +1871,8 @@ param: [xshape] kernel : func : squeeze_grad + inplace : (out_grad -> x_grad) + backward: squeeze_double_grad - backward_api : stack_grad forward : stack (Tensor[] x, int axis) -> Tensor(out) @@ -1747,6 +1920,7 @@ func : subtract_grad no_need_buffer : x, y backward : subtract_double_grad + inplace : (out_grad -> x_grad) - backward_api : sum_double_grad forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) @@ -1756,7 +1930,7 @@ backward : sum_triple_grad - backward_api : sum_grad - forward : sum (Tensor x, int64_t[] dims={}, DataType out_dtype=paddle::experimental::DataType::UNDEFINED, bool keep_dim=false) -> Tensor(out) + forward : sum (Tensor x, int64_t[] dims={}, DataType out_dtype=DataType::UNDEFINED, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out_grad, int64_t[] dims, bool keep_dim, bool reduce_all=false) output : Tensor(x_grad) infer_meta : @@ -1771,8 +1945,7 @@ forward : sum_double_grad (Tensor grad_grad_x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(grad_grad_out) args : (Tensor grad_grad_x, Tensor grad_grad_out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) output : Tensor(grad_grad_x_grad) - invoke : sum_grad(grad_grad_x, grad_grad_out_grad, dims, keep_dim, reduce_all) - no_need_buffer : x + invoke : sum_grad(grad_grad_x, grad_grad_out_grad, dims, keep_dim, reduce_all, grad_grad_x_grad) - backward_api : swish_grad forward : swish (Tensor x, float beta=1.0) -> Tensor(out) @@ -1856,6 +2029,15 @@ kernel : func : thresholded_relu_grad +- backward_api : tile_double_grad + forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray repeat_times) + output : Tensor(grad_out_grad) + infer_meta 
: + func : TileInferMeta + kernel : + func : tile + - backward_api : tile_grad forward : tile (Tensor x, IntArray repeat_times) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray repeat_times) @@ -1866,6 +2048,7 @@ kernel : func : tile_grad no_need_buffer : x + backward : tile_double_grad - backward_api : top_k_grad forward : top_k (Tensor x, Scalar k, int axis = -1, bool largest = true, bool sorted = true) -> Tensor(out), Tensor(indices) @@ -1952,15 +2135,24 @@ func : unfold_grad no_need_buffer : x +- backward_api : unsqueeze_double_grad + forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray axes) + output : Tensor(grad_out_grad) + invoke : unsqueeze(grad_x_grad, axes) + - backward_api : unsqueeze_grad - forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(xshape), Tensor(out) - args : (Tensor xshape, Tensor out_grad) + forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) + args : (Tensor xshape, Tensor out_grad, IntArray axes) output : Tensor(x_grad) infer_meta : func : KernelWithXShapeInferMeta param: [xshape] kernel : func : unsqueeze_grad + param: [xshape, out_grad] + inplace : (out_grad -> x_grad) + backward : unsqueeze_double_grad - backward_api : where_grad forward : where (Tensor condition, Tensor x, Tensor y) -> Tensor(out) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index a88339c607c55..886748eeb290e 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -35,10 +35,10 @@ def parse_forward_config(self, forward_config): r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", forward_config) api = result.group('api') - _, outputs, _, _ = self.parse_output(self.api, result.group('outputs')) + _, outputs, _, = self.parse_output(self.api, result.group('outputs')) outputs = [item.split('@')[0] for item in outputs] - fw_inputs, fw_attrs, _, = self.parse_input_and_attr( - api, result.group('args')) + fw_inputs, fw_attrs = self.parse_input_and_attr(api, + result.group('args')) return api, fw_inputs, fw_attrs, outputs @@ -77,6 +77,25 @@ def check_args(self, forward_config): f"{self.api} : Output error: The number of outputs should be less then the number of inputs of forward api. \ Please check the output of {self.api} in yaml." 
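# Illustrative sketch, not part of the patch: parse_forward_config above splits
# a forward declaration with a named-group regex; the group names are the ones
# read via result.group('api'/'args'/'outputs'), and the sample string mirrors
# the reordered unsqueeze entry from backward.yaml.
import re

FORWARD_PATTERN = re.compile(
    r"(?P<api>[a-z][a-z0-9_]+)\s*(?P<args>\([^\)]+\))\s*->\s*(?P<outputs>.+)")

m = FORWARD_PATTERN.search(
    "unsqueeze (Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape)")
assert m.group('api') == 'unsqueeze'
assert m.group('args') == '(Tensor x, IntArray axes)'
assert m.group('outputs') == 'Tensor(out), Tensor(xshape)'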
+ def get_declare_args(self, inplace_flag=False): + return self.get_define_args() + + def get_define_args(self, inplace_flag=False): + out_type_map = { + 'Tensor': 'Tensor*', + 'std::vector': 'std::vector' + } + intputs_and_attrs = super(BackwardAPI, self).get_define_args() + outs = [] + for i, name in enumerate(self.outputs['names']): + outs.append(out_type_map[self.outputs['types'][i]] + ' ' + + name.split('@')[0]) + result = intputs_and_attrs + ', ' + ", ".join(outs) + return result + + def gene_return_code(self): + return "" + def gene_kernel_backend_select(self): all_no_need_buffer = True for in_name in self.inputs['names']: @@ -90,9 +109,8 @@ def gene_kernel_backend_select(self): else: return super().gene_kernel_backend_select() - def get_return_type(self, out_type_list): - return out_type_list[0] if len( - out_type_list) == 1 else "std::vector>" + def get_return_type(self, inplace_flag=False): + return 'void' def gene_output(self, output_type_list, @@ -109,23 +127,19 @@ def gene_output(self, inplace_assign = " = " + self.inplace_map[self.outputs['names'][ 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" - output_create = f""" -{code_indent} {self.outputs['return_type']} api_output{inplace_assign};""" - + output_create = "" if output_type_list[0] == 'std::vector': assert self.outputs['out_size_expr'] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr']}, kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}(&{self.outputs['names'][0]});""" else: output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}(kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}(kernel_backend, {self.outputs['names'][0]});""" elif len(output_type_list) > 1: - output_create = f""" -{code_indent} {self.outputs['return_type']} api_output({len(output_type_list)});""" - + output_create = "" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') @@ -133,26 +147,21 @@ def gene_output(self, if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" - - else: - output_create = output_create + f""" -{code_indent} api_output[{i}].emplace_back();""" +{code_indent} *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" -{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, &api_output[{i}][0]);""" +{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, {self.outputs['names'][i]});""" else: - get_out_code = f'&api_output[{i}]' if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" +{code_indent} *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" assert self.outputs['out_size_expr'][i] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
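# Illustrative sketch, not part of the patch: with get_define_args and
# get_return_type above, generated backward APIs return void and take their
# outputs as trailing pointer parameters. 'relu_grad' and its argument list are
# assumed here only to show the shape of the result; the vector entry in the
# type map reconstructs the std::vector<Tensor> template argument.
out_type_map = {
    'Tensor': 'Tensor*',
    'std::vector<Tensor>': 'std::vector<Tensor>*',
}
inputs_and_attrs = 'const Tensor& out, const Tensor& out_grad'
outs = [out_type_map['Tensor'] + ' x_grad']
print(f"PADDLE_API void relu_grad({inputs_and_attrs}, {', '.join(outs)});")
# PADDLE_API void relu_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad);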
output_create = output_create + f""" -{code_indent} auto kernel_out_{i} = {set_out_func}({self.outputs['out_size_expr'][i]}, kernel_backend, &api_output[{i}]);""" +{code_indent} auto kernel_out_{i} = {set_out_func}(&{self.outputs['names'][i]});""" kernel_output = kernel_output[:-2] else: @@ -162,6 +171,21 @@ def gene_output(self, return kernel_output, output_names, output_create + def gene_invoke_code(self, invoke_code, params_code): + invoke_func_name = invoke_code.split('(')[0].strip() + if invoke_func_name.endswith('_grad') or invoke_func_name.endswith( + '_grad_impl'): + return f""" +PADDLE_API {self.get_return_type()} {self.api}({params_code}) {{ + {invoke_code}; +}}""" + + else: + return f""" +PADDLE_API {self.get_return_type()} {self.api}({params_code}) {{ + *{self.outputs['names'][0].split('@')[0]} = {invoke_code}; +}}""" + def header_include(): return """ @@ -185,7 +209,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/infermeta/backward.h" @@ -213,8 +236,13 @@ def backward_api_namespace(): def generate_backward_api(backward_yaml_path, header_file_path, source_file_path): - with open(backward_yaml_path, 'r') as f: - bw_apis = yaml.load(f, Loader=yaml.FullLoader) + bw_apis = [] + for each_api_yaml in backward_yaml_path: + with open(each_api_yaml, 'r') as f: + api_list = yaml.load(f, Loader=yaml.FullLoader) + if api_list: + bw_apis.extend(api_list) + header_file = open(header_file_path, 'w') source_file = open(source_file_path, 'w') @@ -246,6 +274,7 @@ def main(): parser.add_argument( '--backward_yaml_path', help='path to backward yaml file', + nargs='+', default='python/paddle/utils/code_gen/backward.yaml') parser.add_argument( '--backward_header_path', diff --git a/python/paddle/utils/code_gen/cross_validate.py b/python/paddle/utils/code_gen/cross_validate.py new file mode 100644 index 0000000000000..30fbf2e0a7d42 --- /dev/null +++ b/python/paddle/utils/code_gen/cross_validate.py @@ -0,0 +1,52 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
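# Illustrative sketch, not part of the patch: api_gen.py, backward_api_gen.py
# and intermediate_api_gen.py now all take several yaml files (nargs='+') and
# merge them with the same pattern; 'load_api_yaml' is a name made up here.
import yaml

def load_api_yaml(yaml_paths):
    apis = []
    for path in yaml_paths:
        with open(path, 'r') as f:
            api_list = yaml.safe_load(f)
        if api_list:  # the empty new_api.yaml / new_backward.yaml load as None
            apis.extend(api_list)
    return apis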
+ +import argparse +from itertools import chain +from pathlib import Path + +import yaml +from parse_utils import cross_validate, to_named_dict + + +def main(forward_api_yaml_paths, backward_api_yaml_paths): + apis = {} + for api_yaml_path in chain(forward_api_yaml_paths, backward_api_yaml_paths): + with open(api_yaml_path, "rt", encoding="utf-8") as f: + api_list = yaml.safe_load(f) + if api_list is not None: + apis.update(to_named_dict((api_list))) + + cross_validate(apis) + + +if __name__ == "__main__": + current_dir = Path(__file__).parent / "temp" + parser = argparse.ArgumentParser( + description="Parse api yaml into canonical format.") + parser.add_argument( + '--forward_yaml_paths', + type=str, + nargs='+', + default=str(current_dir / "api.parsed.yaml"), + help="forward api yaml file.") + parser.add_argument( + '--backward_yaml_paths', + type=str, + nargs='+', + default=str(current_dir / "backward.yaml.yaml"), + help="backward api yaml file.") + + args = parser.parse_args() + main(args.forward_yaml_paths, args.backward_yaml_paths) diff --git a/python/paddle/utils/code_gen/filters.py b/python/paddle/utils/code_gen/filters.py new file mode 100644 index 0000000000000..d37403adcba36 --- /dev/null +++ b/python/paddle/utils/code_gen/filters.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
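# Illustrative sketch, not part of the patch: cross_validate.py above keys every
# parsed api by name before checking forward/backward consistency; the
# to_named_dict helper it imports (defined in parse_utils.py later in this diff)
# behaves essentially like this simplified version.
def to_named_dict_sketch(items):
    return {item["name"]: item for item in items}

assert to_named_dict_sketch([{"name": "matmul"}, {"name": "matmul_grad"}]) == \
    {"matmul": {"name": "matmul"}, "matmul_grad": {"name": "matmul_grad"}}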
+ +from typing import List, Dict +import re + +from jinja2.filters import do_xmlattr +from type_mapping import (input_types_map, optional_input_types_map, + attr_types_map, opmaker_attr_types_map, + output_type_map) +from type_mapping import (dense_input_types_map, dense_optional_input_types_map, + dense_output_types_map, sr_input_types_map, + sr_optional_input_types_map, sr_output_types_map, + phi_attr_types_map) + + +# ------------------------------ attr ------------------------------------- +def to_phi_attr_type(s): + return phi_attr_types_map[s] + + +def to_op_attr_type(s): + return opmaker_attr_types_map[s] + + +def to_paddle_attr_type(s): + "Convert type tag for attributes in yaml to c++ types" + return attr_types_map[s] + + +# ------------------------------ input ---------------------------------- +def to_paddle_input_type(s, optional=False): + "Convert type tag for inputs in yaml to c++ types" + if optional: + return optional_input_types_map[s] + else: + return input_types_map[s] + + +def to_dense_input_type(s, optional=False): + "Convert types in yaml to dense tensor type in phi" + if optional: + return dense_input_types_map[s] + else: + return dense_optional_input_types_map[s] + + +# ------------------------------ output ---------------------------------- +def to_paddle_output_type(s): + return output_type_map[s] + + +def to_dense_output_type(s): + "Convert types in yaml to dense tensor type in phi" + return dense_output_types_map[s] + + +def to_sr_output_type(s): + "Convert types in yaml to selected rows type in phi" + return sr_output_types_map[s] + + +# -------------- transform argument names from yaml to opmaker ------------ +def to_opmaker_name(s): + if s.endswith("_grad"): + return 'GradVarName("{}")'.format( + to_pascal_case(s.removesuffix("_grad"))) + else: + return '"{}"'.format(to_pascal_case(s)) + + +def to_opmaker_name_cstr(s): + if s.endswith("_grad"): + return '"{}@GRAD"'.format(to_pascal_case(s.removesuffix("_grad"))) + else: + return '"{}"'.format(to_pascal_case(s)) + + +def to_pascal_case(s): + words = s.split("_") + return "".join([word.capitalize() for word in words]) + + +def to_input_name(s): + """find input variable name in api yaml for higher order backward api. + x -> dx + x -> d2x + x -> d3x + + NOTE: for first order backward api + x -> x_grad + is more common. + """ + match = re.match(r"(d\d*)(\w+)", s) + assert (match.group(1) != ""), "it should be a grad style name." + return match.group(2) diff --git a/python/paddle/utils/code_gen/generate_op.py b/python/paddle/utils/code_gen/generate_op.py new file mode 100644 index 0000000000000..0b314e4a11cb3 --- /dev/null +++ b/python/paddle/utils/code_gen/generate_op.py @@ -0,0 +1,113 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
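# Illustrative sketch, not part of the patch: the naming filters in filters.py
# above turn yaml-style snake_case names into the identifiers the generated
# operators use; the sample inputs are made up for illustration.
import re

def to_pascal_case(s):
    return "".join(word.capitalize() for word in s.split("_"))

assert to_pascal_case("cross_entropy_with_softmax") == "CrossEntropyWithSoftmax"

def to_input_name(s):
    # strip the d / d2 / d3 prefix used by higher-order backward argument names
    match = re.match(r"(d\d*)(\w+)", s)
    assert match.group(1) != "", "it should be a grad style name."
    return match.group(2)

assert to_input_name("d2x") == "x"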
+ +import argparse +import os +from itertools import chain +from pathlib import Path + +import yaml +from jinja2 import Environment, FileSystemLoader, StrictUndefined + +from filters import to_op_attr_type, to_opmaker_name, to_opmaker_name_cstr, to_pascal_case +from tests import is_base_api, is_vec, is_scalar, is_initializer_list, supports_inplace, supports_no_need_buffer +from filters import to_input_name +from parse_utils import to_named_dict + +file_loader = FileSystemLoader(Path(__file__).parent / "templates") +env = Environment( + loader=file_loader, + keep_trailing_newline=True, + trim_blocks=True, + lstrip_blocks=True, + undefined=StrictUndefined, + extensions=['jinja2.ext.do']) +env.filters["to_op_attr_type"] = to_op_attr_type +env.filters["to_opmaker_name"] = to_opmaker_name +env.filters["to_pascal_case"] = to_pascal_case +env.filters["to_input_name"] = to_input_name +env.filters["to_opmaker_name_cstr"] = to_opmaker_name_cstr +env.tests["base_api"] = is_base_api +env.tests["vec"] = is_vec +env.tests["scalar"] = is_scalar +env.tests["initializer_list"] = is_initializer_list +env.tests["supports_inplace"] = supports_inplace +env.tests["supports_no_need_buffer"] = supports_no_need_buffer + + +def main(api_yaml_path, backward_yaml_path, output_op_path, + output_arg_map_path): + with open(api_yaml_path, "rt") as f: + apis = yaml.safe_load(f) + forward_api_dict = to_named_dict(apis) + + with open(backward_yaml_path, "rt") as f: + backward_apis = yaml.safe_load(f) + backward_api_dict = to_named_dict(backward_apis) + + # fill backward field for an api if another api claims it as forward + for name, backward_api in backward_api_dict.items(): + forward_name = backward_api["forward"]["name"] + if forward_name in backward_api_dict: + forward_api = backward_api_dict[forward_name] + if forward_api["backward"] is None: + forward_api["backward"] = name + + if forward_name in backward_api_dict: + forward_api = backward_api_dict[forward_name] + if forward_api["backward"] is None: + forward_api["backward"] = name + + api_dict = {} + api_dict.update(forward_api_dict) + api_dict.update(backward_api_dict) + + if len(apis) == 0 and len(backward_apis) == 0: + if os.path.isfile(output_op_path): + os.remove(output_op_path) + if os.path.isfile(output_arg_map_path): + os.remove(output_arg_map_path) + return + + op_template = env.get_template('op.c.j2') + with open(output_op_path, "wt") as f: + msg = op_template.render( + apis=apis, backward_apis=backward_apis, api_dict=api_dict) + f.write(msg) + + ks_template = env.get_template('ks.c.j2') + with open(output_arg_map_path, 'wt') as f: + msg = ks_template.render(apis=apis, backward_apis=backward_apis) + f.write(msg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate operator file from api yaml.") + parser.add_argument( + '--api_yaml_path', type=str, help="parsed api yaml file.") + parser.add_argument( + '--backward_api_yaml_path', + type=str, + help="parsed backward api yaml file.") + parser.add_argument( + "--output_op_path", type=str, help="path to save generated operators.") + parser.add_argument( + "--output_arg_map_path", + type=str, + help="path to save generated argument mapping functions.") + + args = parser.parse_args() + main(args.api_yaml_path, args.backward_api_yaml_path, args.output_op_path, + args.output_arg_map_path) diff --git a/python/paddle/utils/code_gen/intermediate_api_gen.py b/python/paddle/utils/code_gen/intermediate_api_gen.py index 6e1df7b4ec336..4e4875b596192 100644 --- 
a/python/paddle/utils/code_gen/intermediate_api_gen.py +++ b/python/paddle/utils/code_gen/intermediate_api_gen.py @@ -44,7 +44,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/sparse_api_custom_impl.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/multiary.h" @@ -94,8 +93,12 @@ def generate_intermediate_api(api_yaml_path, sparse_api_yaml_path, dygraph_source_file.write(source_include(dygraph_include_header_file)) dygraph_source_file.write(namespace[0]) - with open(api_yaml_path, 'r') as f: - apis = yaml.load(f, Loader=yaml.FullLoader) + apis = [] + for each_api_yaml in api_yaml_path: + with open(each_api_yaml, 'r') as f: + api_list = yaml.load(f, Loader=yaml.FullLoader) + if api_list: + apis.extend(api_list) for api in apis: foward_api = ForwardAPI(api) @@ -131,6 +134,7 @@ def main(): description='Generate PaddlePaddle C++ Sparse API files') parser.add_argument( '--api_yaml_path', + nargs='+', help='path to api yaml file', default='python/paddle/utils/code_gen/api.yaml') diff --git a/python/paddle/utils/code_gen/new_api.yaml b/python/paddle/utils/code_gen/new_api.yaml new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/python/paddle/utils/code_gen/new_backward.yaml b/python/paddle/utils/code_gen/new_backward.yaml new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/python/paddle/utils/code_gen/parse_api.py b/python/paddle/utils/code_gen/parse_api.py new file mode 100644 index 0000000000000..63dc314d2e31e --- /dev/null +++ b/python/paddle/utils/code_gen/parse_api.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
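# Illustrative sketch, not part of the patch: parse_api.py feeds each raw yaml
# entry to parse_api_entry, which relies on parse_arg (both defined in
# parse_utils.py below) to split an argument such as "Scalar axis = 0" into its
# typename / name / default_value parts; a simplified standalone version:
def parse_arg_sketch(s):
    typename, rest = [p.strip() for p in s.split(" ", 1)]
    if "=" in rest:
        name, default_value = [p.strip() for p in rest.split("=", 1)]
        return {"typename": typename, "name": name,
                "default_value": default_value}
    return {"typename": typename, "name": rest.strip()}

assert parse_arg_sketch("Scalar axis = 0") == \
    {"typename": "Scalar", "name": "axis", "default_value": "0"}
assert parse_arg_sketch("Tensor x") == {"typename": "Tensor", "name": "x"}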
+ +import argparse +from pathlib import Path + +import yaml + +from parse_utils import parse_api_entry + + +def main(api_yaml_path, output_path, backward): + with open(api_yaml_path, "rt") as f: + apis = yaml.safe_load(f) + if apis is None: + apis = [] + else: + apis = [ + parse_api_entry(api, "backward_api" if backward else "api") + for api in apis + ] + + with open(output_path, "wt") as f: + yaml.safe_dump(apis, f, default_flow_style=None, sort_keys=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Parse api yaml into canonical format.") + parser.add_argument('--api_yaml_path', type=str, help="api yaml file.") + parser.add_argument( + "--output_path", type=str, help="path to save parsed yaml file.") + parser.add_argument("--backward", action="store_true", default=False) + + args = parser.parse_args() + main(args.api_yaml_path, args.output_path, args.backward) diff --git a/python/paddle/utils/code_gen/parse_utils.py b/python/paddle/utils/code_gen/parse_utils.py new file mode 100644 index 0000000000000..8168328012ec5 --- /dev/null +++ b/python/paddle/utils/code_gen/parse_utils.py @@ -0,0 +1,423 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import yaml +from copy import copy +from typing import Dict, Any, List, Tuple +from tests import is_attr, is_input, is_output, is_vec + + +def to_named_dict(items: List[Dict]) -> Dict[str, Dict]: + named_dict = {} + for item in items: + if "name" not in item: + raise KeyError(f"name not in {item}") + name = item["name"] + named_dict[name] = item + return named_dict + + +def parse_arg(api_name: str, s: str) -> Dict[str, str]: + """parse an argument in following formats: + 1. typename name + 2. typename name = default_value + """ + typename, rest = [item.strip() for item in s.split(" ", 1)] + assert len( + typename + ) > 0, f"The arg typename should not be empty. Please check the args of {api_name} in yaml." + + assert rest.count( + "=") <= 1, f"There is more than 1 = in an arg in {api_name}" + if rest.count("=") == 1: + name, default_value = [item.strip() for item in rest.split("=", 1)] + assert len( + name + ) > 0, f"The arg name should not be empty. Please check the args of {api_name} in yaml." + assert len( + default_value + ) > 0, f"The default value should not be empty. Please check the args of {api_name} in yaml." + return { + "typename": typename, + "name": name, + "default_value": default_value + } + else: + name = rest.strip() + assert len( + name + ) > 0, f"The arg name should not be empty. Please check the args of {api_name} in yaml." 
+ return {"typename": typename, "name": name} + + +def parse_input_and_attr(api_name: str, + arguments: str) -> Tuple[List, List, Dict, Dict]: + args_str = arguments.strip() + assert args_str.startswith('(') and args_str.endswith(')'), \ + (f"Args declaration should start with '(' and end with ')', " + f"please check the args of {api_name} in yaml.") + args_str = args_str[1:-1] + args = parse_plain_list(args_str) + + inputs = [] + attrs = [] + + met_attr_with_default_value = False + + for arg in args: + item = parse_arg(api_name, arg) + typename = item["typename"] + name = item["name"] + if is_input(typename): + assert len(attrs) == 0, \ + (f"The input Tensor should appear before attributes. " + f"please check the position of {api_name}:input({name}) " + f"in yaml.") + inputs.append(item) + elif is_attr(typename): + if met_attr_with_default_value: + assert "default_value" in item, f"{api_name}: Arguments with default value should not precede those without default value" + elif "default_value" in item: + met_attr_with_default_value = True + attrs.append(item) + else: + raise KeyError(f"{api_name}: Invalid argument type {typename}.") + return inputs, attrs + + +def parse_output(api_name: str, s: str) -> Dict[str, str]: + """parse an output, typename or typename(name).""" + match = re.search( + r"(?P[a-zA-Z0-9_[\]]+)\s*(?P\([a-zA-Z0-9_@]+\))?\s*(?P\{[^\}]+\})?", + s) + typename = match.group("out_type") + name = match.group("name") + size_expr = match.group("expr") + + name = name[1:-1] if name is not None else 'out' + size_expr = size_expr[1:-1] if size_expr is not None else None + + assert is_output(typename), \ + (f"Invalid output type: {typename} in api: {api_name}." + f"Supported types are Tensor and Tensor[]") + if size_expr is not None: + assert is_vec(typename), \ + (f"Invalid output size: output {name} in api: {api_name} is " + f"not a vector but has size expr") + return {"typename": typename, "name": name, "size": size_expr} + else: + return {"typename": typename, "name": name} + + +def parse_outputs(api_name: str, outputs: str) -> List[Dict]: + outputs = parse_plain_list(outputs, sep=",") + output_items = [] + for output in outputs: + output_items.append(parse_output(api_name, output)) + return output_items + + +def parse_infer_meta(infer_meta: Dict[str, Any]) -> Dict[str, Any]: + infer_meta = copy(infer_meta) # to prevent mutating the input + if "param" not in infer_meta: + infer_meta["param"] = None + return infer_meta + + +def parse_candidates(s: str) -> Dict[str, Any]: + "parse candidates joined by either '>'(ordered) or ','(unordered)" + delimiter = ">" if ">" in s else "," + ordered = delimiter == ">" + candidates = parse_plain_list(s, delimiter) + return {"ordered": ordered, "candidates": candidates} + + +def parse_plain_list(s: str, sep=",") -> List[str]: + items = [item.strip() for item in s.strip().split(sep)] + return items + + +def parse_kernel(api_name: str, + kernel_config: Dict[str, Any]) -> Dict[str, Any]: + # kernel : + # func : [], Kernel functions (example: scale, scale_sr) + # param : [], Input params of kernel + # backend : str, the names of param to choose the kernel backend, default is None + # layout : str, the names of param to choose the kernel layout, default is None + # data_type : str, the names of param to choose the kernel data_type, default is None + kernel = { + 'func': None, # up to 2 function names + 'param': None, + 'backend': None, + 'layout': None, + 'data_type': None + } + kernel['func'] = parse_plain_list(kernel_config['func']) + if 'param' 
in kernel_config: + kernel['param'] = kernel_config['param'] + + if 'backend' in kernel_config: + kernel['backend'] = parse_candidates(kernel_config["backend"]) + + if 'layout' in kernel_config: + kernel['layout'] = parse_candidates(kernel_config["layout"]) + + if 'data_type' in kernel_config: + kernel['data_type'] = parse_candidates(kernel_config["data_type"]) + return kernel + + +def parse_inplace(api_name: str, inplace_cfg: str) -> Dict[str, str]: + inplace_map = {} + inplace_cfg = inplace_cfg.lstrip("(").rstrip(")") + pairs = parse_plain_list(inplace_cfg) + for pair in pairs: + in_name, out_name = parse_plain_list(pair, sep="->") + inplace_map[out_name] = in_name + return inplace_map + + +def parse_invoke(api_name: str, invoke_config: str) -> Dict[str, Any]: + invoke_config = invoke_config.strip() + func, rest = invoke_config.split("(", 1) + func = func.strip() + args = rest.rstrip(")").strip() + invocation = {"func": func, "args": args} + return invocation + + +def extract_type_and_name(records: List[Dict]) -> List[Dict]: + """extract type and name from forward call, it is simpler than forward api.""" + extracted = [{ + "name": item["name"], + "typename": item["typename"] + } for item in records] + return extracted + + +def parse_forward(api_name: str, forward_config: str) -> Dict[str, Any]: + # api_name (const Tensor& input, ... , int attr, ...) -> Tensor(out) + result = re.search( + r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", + forward_config) + api = result.group("api") + outputs = parse_outputs(api_name, result.group("outputs")) + outputs = extract_type_and_name(outputs) + + inputs, attrs = parse_input_and_attr(api_name, result.group("args")) + inputs = extract_type_and_name(inputs) + attrs = extract_type_and_name(attrs) + forward_cfg = { + "name": api, + "inputs": inputs, + "attrs": attrs, + "outputs": outputs + } + return forward_cfg + + +def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): + api_name = api_entry[name_field] + inputs, attrs = parse_input_and_attr(api_name, api_entry["args"]) + outputs = parse_outputs(api_name, api_entry["output"]) + + # validate default value of DataType and DataLayout + for attr in attrs: + if "default_value" in attr: + typename = attr["typename"] + default_value = attr["default_value"] + if typename == "DataType": + assert "DataType" in default_value, f"invalid DataType default value in {api_name}" + # remove namespace + default_value = default_value[default_value.find("DataType"):] + attr["default_value"] = default_value + elif typename == "DataLayout": + assert "DataLayout" in default_value, f"invalid DataLayout default value in {api_name}" + default_value = default_value[default_value.find("DataLayout"):] + attr["default_value"] = default_value + + input_names = [item["name"] for item in inputs] + attr_names = [item["name"] for item in attrs] + output_names = [item["name"] for item in outputs] + + # add optional tag for every input + for input in inputs: + input["optional"] = False + if "optional" in api_entry: + optional_args = parse_plain_list(api_entry["optional"]) + for name in optional_args: + assert name in input_names, f"{api_name} has an optional input: '{name}' which is not an input." 
+ for input in inputs: + if input["name"] in optional_args: + input["optional"] = True + + # add intermediate tag for every output + for output in outputs: + output["intermediate"] = False + if "intermediate" in api_entry: + intermediate_outs = parse_plain_list(api_entry["intermediate"]) + for name in intermediate_outs: + assert name in output_names, f"{api_name} has an intermediate output: '{name}' which is not an output." + for output in outputs: + if output["name"] in intermediate_outs: + output["intermediate"] = True + + # add no_need_buffer for every input + for input in inputs: + input["no_need_buffer"] = False + if "no_need_buffer" in api_entry: + no_buffer_args = parse_plain_list(api_entry["no_need_buffer"]) + for name in no_buffer_args: + assert name in input_names, f"{api_name} has an no buffer input: '{name}' which is not an input." + for input in inputs: + if input["name"] in no_buffer_args: + input["no_need_buffer"] = True + else: + no_buffer_args = None + + # TODO(chenfeiyu): data_transform + + api = { + "name": api_name, + "inputs": inputs, + "attrs": attrs, + "outputs": outputs, + "no_need_buffer": no_buffer_args + } + + # invokes another api? + is_base_api = "invoke" not in api_entry + + if is_base_api: + # kernel + kernel = parse_kernel(api_name, api_entry["kernel"]) + if kernel["param"] is None: + kernel["param"] = input_names + attr_names + + # infer meta + infer_meta = parse_infer_meta(api_entry["infer_meta"]) + if infer_meta["param"] is None: + infer_meta["param"] = copy(kernel["param"]) + + # inplace + if "inplace" in api_entry: + inplace_pairs = parse_inplace(api_name, api_entry["inplace"]) + else: + inplace_pairs = None + api.update({ + "infer_meta": infer_meta, + "kernel": kernel, + "inplace": inplace_pairs + }) + else: + # invoke + invoke = parse_invoke(api_name, api_entry["invoke"]) + api["invoke"] = invoke + + # backward + if "backward" in api_entry: + backward = api_entry["backward"] + else: + backward = None + api["backward"] = backward + + # forward for backward_apis + is_backward_api = name_field == "backward_api" + if is_backward_api: + if "forward" in api_entry: + forward = parse_forward(api_name, api_entry["forward"]) + # validate_fb + validate_backward_inputs(api_name, forward["inputs"], + forward["outputs"], inputs) + validate_backward_attrs(api_name, forward["attrs"], attrs) + validate_backward_outputs(api_name, forward["inputs"], outputs) + else: + forward = None + api["forward"] = forward + return api + + +def validate_backward_attrs(api, forward_attrs, backward_attrs): + if len(forward_attrs) >= len(backward_attrs): + return + num_exceptional_attrs = len(backward_attrs) - len(forward_attrs) + # this is a not-that-clean trick to allow backward api to has more attrs + # than the forward api, as long as they all have default value + for i in range(-num_exceptional_attrs, 0): + assert "default_value" in backward_attrs[ + i], f"{api} has exceptional attr without default value" + + +def validate_backward_inputs(api, forward_inputs, forward_outputs, + backward_inputs): + foward_input_names = [item["name"] for item in forward_inputs] + forward_output_names = [item["name"] for item in forward_outputs] + backward_input_names = [item["name"] for item in backward_inputs] + + assert len(backward_input_names) <= len(foward_input_names) + 2 * len( + forward_output_names), f"{api} has too many inputs." 
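As an aside (not part of the patch), a minimal sketch of what parse_api_entry produces for a typical entry; it assumes the scripts are run from python/paddle/utils/code_gen/ so that parse_utils and its imports resolve, and the matmul entry below is only a plausible sample rather than a copy from api.yaml:

import yaml
from parse_utils import parse_api_entry

entry = yaml.safe_load("""
- api : matmul
  args : (Tensor x, Tensor y, bool transpose_x = false, bool transpose_y = false)
  output : Tensor(out)
  infer_meta :
    func : MatmulInferMeta
  kernel :
    func : matmul
  backward : matmul_grad
""")[0]

parsed = parse_api_entry(entry, name_field="api")
print([item["name"] for item in parsed["inputs"]])  # ['x', 'y']
print([item["name"] for item in parsed["attrs"]])   # ['transpose_x', 'transpose_y']
# when kernel:param is omitted, it is filled with input names + attr names
print(parsed["kernel"]["param"])  # ['x', 'y', 'transpose_x', 'transpose_y']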
+ + +def validate_backward_outputs(api, forward_inputs, backward_outputs): + assert len(backward_outputs) <= len( + forward_inputs), f"{api} has too many outputs" + + +def cross_validate(apis): + for name, api in apis.items(): + if "forward" in api: + fw_call = api["forward"] + fw_name = fw_call["name"] + if fw_name not in apis: + print( + f"Something Wrong here, this backward api({name})'s forward api({fw_name}) does not exist." + ) + else: + fw_api = apis[fw_name] + if "backward" not in fw_api or fw_api["backward"] is None: + print( + f"Something Wrong here, {name}'s forward api({fw_name}) does not claim {name} as its backward." + ) + else: + assert fw_api[ + "backward"] == name, f"{name}: backward and forward name mismatch" + + assert len(fw_call["inputs"]) <= len( + fw_api["inputs"] + ), f"{name}: forward call has more inputs than the api" + for (input, input_) in zip(fw_call["inputs"], fw_api["inputs"]): + assert input["typename"] == input_[ + "typename"], f"type mismatch in {name} and {fw_name}" + + assert len(fw_call["attrs"]) <= len( + fw_api["attrs"] + ), f"{name}: forward call has more attrs than the api" + for (attr, attr_) in zip(fw_call["attrs"], fw_api["attrs"]): + if attr["typename"] == "Scalar": + # special case for Scalar, fw_call can omit the type + assert re.match( + r"Scalar(\(\w+\))*", attr_["typename"] + ), f"type mismatch in {name} and {fw_name}" + else: + assert attr["typename"] == attr_[ + "typename"], f"type mismatch in {name} and {fw_name}" + + assert len(fw_call["outputs"]) == len( + fw_api["outputs"] + ), f"{name}: forward call has more outputs than the api" + for (output, output_) in zip(fw_call["outputs"], + fw_api["outputs"]): + assert output["typename"] == output_[ + "typename"], f"type mismatch in {name} and {fw_name}" diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index ca4330f2af362..5d1dc55f0638d 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -1,76 +1,98 @@ - api : conv3d args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) - output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + output : Tensor(out), Tensor(rulebook) kernel : - func : sparse_conv3d + func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense} layout : x intermediate : rulebook backward : conv3d_grad - api : coo_to_dense args : (Tensor x) - output : Tensor(out@DenseTensor) + output : Tensor(out) invoke : to_dense_impl(x) backward : coo_to_dense_grad -- api : coo_values - args : (Tensor x) - output : Tensor(out@DenseTensor) - kernel : - func : coo_values - layout : x - backward : coo_values_grad - - api : create_sparse_coo_tensor args : (Tensor values, Tensor indices, IntArray dense_shape) - output : Tensor(out@SparseCooTensor) + output : Tensor(out) kernel : - func : sparse_coo_tensor + func : sparse_coo_tensor{dense, dense -> sparse_coo} layout : values data_type : values backward : create_sparse_coo_tensor_grad -- api : csr_values - args : (Tensor x) - output : Tensor(out@DenseTensor) - kernel : - func : csr_values - layout : x - - api : dense_to_coo args : (Tensor x, int64_t sparse_dim) - output : Tensor(out@SparseCooTensor) + output : Tensor(out) invoke : to_sparse_coo_impl(x, sparse_dim) backward : dense_to_coo_grad - api : relu + args : (Tensor x) + output : Tensor(out) + kernel : + func : sparse_coo_relu{sparse_coo -> sparse_coo}, + sparse_csr_relu{sparse_csr -> sparse_csr} + layout : x + 
backward : relu_grad + +- api : sin args : (Tensor x) output : Tensor(out@SparseCooTensor) kernel : - func : sparse_relu + func : sparse_coo_sin {sparse_coo -> sparse_coo}, + sparse_csr_sin {sparse_csr -> sparse_csr} layout : x - backward : sparse_relu_grad + backward : sin_grad + +- api : sqrt + args : (Tensor x) + output : Tensor(out) + kernel : + func : sparse_coo_sqrt{sparse_coo -> sparse_coo}, + sparse_csr_sqrt{sparse_csr -> sparse_csr} + layout : x + backward : sqrt_grad + +- api : tanh + args : (Tensor x) + output : Tensor(out) + kernel : + func : sparse_coo_tanh{sparse_coo -> sparse_coo}, + sparse_csr_tanh{sparse_csr -> sparse_csr} + layout : x + backward : tanh_grad - api : to_dense args : (Tensor x) - output : Tensor(out@DenseTensor) + output : Tensor(out) invoke : to_dense_impl(x) - api : to_sparse_coo args : (Tensor x, int64_t sparse_dim) - output : Tensor(out@SparseCooTensor) + output : Tensor(out) invoke : to_sparse_coo_impl(x, sparse_dim) - api : to_sparse_csr args : (Tensor x) - output : Tensor(out@SparseCsrTensor) + output : Tensor(out) invoke : to_sparse_csr_impl(x) +- api : values + args : (Tensor x) + output : Tensor(out) + kernel : + func : coo_values{sparse_coo -> dense}, + csr_values{sparse_csr -> dense} + layout : x + backward : values_grad + - api: maxpool args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) - output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + output : Tensor(out), Tensor(rulebook) kernel : - func : sparse_maxpool + func : sparse_maxpool{sparse_coo -> sparse_coo, dense} layout : x intermediate : rulebook backward : sparse_maxpool_grad diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index c0316fc164294..bd73032e179db 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -27,7 +27,7 @@ def __init__(self, api_item_yaml): def gene_api_declaration(self): return f""" // {", ".join(self.outputs['names'])} -PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']}); +{super(SparseAPI, self).gene_api_declaration()} """ def get_kernel_tensor_out_type(self, output_name): @@ -46,6 +46,12 @@ def gene_output(self, kernel_output = "" output_names = [] output_create = "" + return_type = self.get_return_type_with_intermediate(inplace_flag) + output_type_map = { + 'dense': 'TensorType::DENSE_TENSOR', + 'sparse_coo': 'TensorType::SPARSE_COO', + 'sparse_csr': 'TensorType::SPARSE_CSR' + } if len(output_type_list) == 1: kernel_output = 'kernel_out' @@ -54,23 +60,30 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - {self.outputs['return_type']} api_output{inplace_assign}; - auto* kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + {return_type} api_output{inplace_assign}; + auto* kernel_out = {set_out_func}(&api_output, {output_type_map[output_type_list[0]]});""" elif len(output_type_list) > 1: output_create = f""" - {self.outputs['return_type']} api_output;""" + {return_type} api_output;""" + + if inplace_flag: + output_create = f""" + {return_type} api_output{{""" + + for out_name in self.outputs['names']: + if out_name in self.inplace_map: + output_create = output_create + self.inplace_map[ + out_name] + ', ' + else: + output_create += 'Tensor(), ' + output_create = output_create[:-2] + '};' 
for i in range(len(output_type_list)): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" - std::get<{i}>(api_output) = {self.inplace_map[self.outputs['names'][i]]};""" - output_create = output_create + f""" - auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(api_output), {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + auto* kernel_out_{i} = {set_out_func}(&std::get<{i}>(api_output), {output_type_map[output_type_list[i]]});""" kernel_output = kernel_output[:-2] else: @@ -108,7 +121,7 @@ def gen_sparse_kernel_context(self, kernel_output_names): ) else: kernel_context_code = kernel_context_code + f""" - kernel_context.EmplaceBackInput({param}.impl().get());""" + kernel_context.EmplaceBackInput({param}.impl().get());""" continue if param in attr_names: @@ -122,41 +135,78 @@ def gen_sparse_kernel_context(self, kernel_output_names): else: param + str(param) + ", " kernel_context_code = kernel_context_code + f""" - kernel_context.EmplaceBackAttr({param});""" + kernel_context.EmplaceBackAttr({param});""" for out_name in kernel_output_names: kernel_context_code = kernel_context_code + f""" - kernel_context.EmplaceBackOutput({out_name});""" + kernel_context.EmplaceBackOutput({out_name});""" return kernel_context_code - def gen_sparse_kernel_code(self, inplace_flag=False): + def gen_sparse_kernel_code(self, kernel_name, inplace_flag=False): _, kernel_output_names, output_create = self.gene_output( - self.outputs['types'], 'SetSparseKernelOutput', '', inplace_flag) + self.kernel['dispatch'][kernel_name][1], 'SetSparseKernelOutput', + '', inplace_flag) kernel_context_code = self.gen_sparse_kernel_context( kernel_output_names) - + return_code = "" if len(self.gene_return_code( + )) == 0 else " " + self.gene_return_code() return f""" - auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); - VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; - VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel; + VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; + auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}}); + VLOG(6) << "{self.api} api sparse kernel: " << phi_kernel; - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - auto kernel_context = phi::KernelContext(dev_ctx); + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + auto kernel_context = phi::KernelContext(dev_ctx); {output_create} {kernel_context_code} - phi_kernel(&kernel_context); + phi_kernel(&kernel_context); + {return_code}""" + + def get_condition_code(self, kernel_name): + assert self.kernel['dispatch'][kernel_name], \ + f"{self.api} api: the tensor type of inputs and outputs for kernel isn't set, see also 'kernel:func' of 'conv3d' in sparse_api.yaml." 
+ input_types = self.kernel['dispatch'][kernel_name][0] + sparse_type_map = { + 'sparse_coo': 'DataLayout::SPARSE_COO', + 'sparse_csr': 'DataLayout::SPARSE_CSR' + } + condition_list = [] + for i, in_type in enumerate(input_types): + if in_type == "dense": + condition_list.append( + f"phi::DenseTensor::classof({self.inputs['names'][i]}.impl().get())" + ) + else: + condition_list.append( + f"{self.inputs['names'][i]}.layout() == {sparse_type_map[in_type]}" + ) + return " && ".join(condition_list) - return api_output;""" + def gene_dispatch_code(self, kernel_name, inplace_flag=False): + dispatch_code = "" + return f""" + if ({self.get_condition_code(kernel_name)}) {{ +{self.gen_sparse_kernel_code(kernel_name, inplace_flag)} + }} +""" def gene_base_api_code(self, inplace_flag=False): api_func_name = self.get_api_func_name() + if inplace_flag and api_func_name[-1] != '_': + api_func_name += '_' + kernel_dispatch_code = f"{self.gene_kernel_select()}\n" + for kernel_name in self.kernel['func']: + kernel_dispatch_code += self.gene_dispatch_code(kernel_name, + inplace_flag) + return f""" -PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{ -{self.gene_kernel_select()} -{self.gen_sparse_kernel_code(inplace_flag)} +PADDLE_API {self.get_return_type()} {api_func_name}({self.get_define_args()}) {{ +{kernel_dispatch_code} + PADDLE_THROW(phi::errors::Unimplemented( + "The kernel of ({self.api}) for input tensors is unimplemented, please check the type of input tensors.")); }} """ diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 74299ed3e39a0..eb7114cbdd2c9 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -1,47 +1,68 @@ - backward_api : conv3d_grad forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) - output : Tensor(x_grad@SparseCooTensor), Tensor(kernel_grad@DenseTensor) + output : Tensor(x_grad), Tensor(kernel_grad) kernel : - func : sparse_conv3d_grad + func : sparse_conv3d_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} - backward_api : coo_to_dense_grad - forward : coo_to_dense(Tensor x) -> Tensor(out@DenseTensor) + forward : coo_to_dense(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad@SparseCooTensor) + output : Tensor(x_grad) kernel : - func : sparse_coo_to_dense_grad - -- backward_api : coo_values_grad - forward : coo_values(Tensor x) -> Tensor(out@DenseTensor) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad@SparseCooTensor) - kernel : - func : coo_values_grad + func : sparse_coo_to_dense_grad{sparse_coo, dense-> sparse_coo} - backward_api : create_sparse_coo_tensor_grad - forward : create_sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out@SparseCooTensor) + forward : create_sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out) args : (Tensor indices, Tensor out_grad) - output : Tensor(values_grad@DenseTensor) + output : Tensor(values_grad) kernel : - func : sparse_coo_tensor_grad + func : sparse_coo_tensor_grad{dense, sparse_coo -> dense} - backward_api : dense_to_coo_grad - forward : dense_to_coo(Tensor x, int64_t sparse_dim) -> 
Tensor(out@SparseCooTensor) + forward : dense_to_coo(Tensor x, int64_t sparse_dim) -> Tensor(out) args : (Tensor out_grad) - output : Tensor(x_grad@DenseTensor) + output : Tensor(x_grad) invoke : to_dense_impl(out_grad) +- backward_api : relu_grad + forward : relu(Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : sparse_coo_relu_grad {sparse_coo, sparse_coo -> sparse_coo} + +- backward_api : sin_grad + forward : sin(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : sparse_coo_sin_grad {sparse_coo, sparse_coo -> sparse_coo} + - backward_api : sparse_maxpool_grad - forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook) args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) - output : Tensor(x_grad@SparseCooTensor) + output : Tensor(x_grad) + kernel : + func : sparse_maxpool_grad {sparse_coo, dense, sparse_coo, sparse_coo -> sparse_coo} + +- backward_api : sqrt_grad + forward : sqrt(Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : sparse_coo_sqrt_grad {sparse_coo, sparse_coo -> sparse_coo} + +- backward_api : tanh_grad + forward : tanh(Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) kernel : - func : sparse_maxpool_grad + func : sparse_coo_tanh_grad {sparse_coo, sparse_coo -> sparse_coo} -- backward_api : sparse_relu_grad - forward : sparse_relu(Tensor x) -> Tensor(out@SparseCooTensor) +- backward_api : values_grad + forward : coo_values(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad@SparseCooTensor) + output : Tensor(x_grad) kernel : - func : sparse_relu_grad + func : coo_values_grad{sparse_coo, dense-> sparse_coo} diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py index 4f209a7592161..cf59726bbb195 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -31,12 +31,21 @@ def get_api_func_name(self): def gene_kernel_backend_select(self): return BackwardAPI.gene_kernel_backend_select(self) - def get_return_type(self, out_type_list): - return BackwardAPI.get_return_type(self, out_type_list) + def get_return_type(self, inplace_flag=False): + return BackwardAPI.get_return_type(self) + + def gene_return_code(self): + return "return;" def gene_api_declaration(self): return SparseAPI.gene_api_declaration(self) + def get_declare_args(self, inplace_flag=False): + return BackwardAPI.get_declare_args(self) + + def get_define_args(self, inplace_flag=False): + return BackwardAPI.get_define_args(self) + def gene_output(self, output_type_list, set_out_func, @@ -45,6 +54,11 @@ def gene_output(self, kernel_output = "" output_names = [] output_create = "" + output_type_map = { + 'dense': 'TensorType::DENSE_TENSOR', + 'sparse_coo': 'TensorType::SPARSE_COO', + 'sparse_csr': 'TensorType::SPARSE_CSR' + } if len(output_type_list) == 1: kernel_output = 'kernel_out' @@ -53,36 +67,21 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - 
{self.outputs['return_type']} api_output{inplace_assign}; - auto kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + auto kernel_out = {set_out_func}({self.outputs['names'][0]}, {output_type_map[output_type_list[0]]});""" elif len(output_type_list) > 1: - output_create = f""" - {self.outputs['return_type']} api_output({len(output_type_list)});""" + output_create = "" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') - if out_type_item == 'Tensor': - get_out_code = f'&api_output[{i}][0]' - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" - api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" - - else: - output_create = output_create + f""" - api_output[{i}].emplace_back();""" - - else: - get_out_code = f'&api_output[{i}]' - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" - api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" + if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" - auto kernel_out_{i} = {set_out_func}({get_out_code}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + auto kernel_out_{i} = {set_out_func}({self.outputs['names'][i]}, {output_type_map[output_type_list[i]]});""" kernel_output = kernel_output[:-2] else: diff --git a/python/paddle/utils/code_gen/strings_api_gen.py b/python/paddle/utils/code_gen/strings_api_gen.py index 061ea6c3ceef9..d697ce3935708 100644 --- a/python/paddle/utils/code_gen/strings_api_gen.py +++ b/python/paddle/utils/code_gen/strings_api_gen.py @@ -32,7 +32,7 @@ def get_api_func_name(self): def gene_api_declaration(self): return f""" // {", ".join(self.outputs['names'])} -PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']}); +{super(StringsAPI, self).gene_api_declaration()} """ def get_kernel_tensor_out_type(self, output_name): @@ -56,6 +56,7 @@ def gene_output(self, kernel_output = "" output_names = [] output_create = "" + return_type = self.get_return_type(inplace_flag) if len(output_type_list) == 1: kernel_output = 'kernel_out' @@ -67,13 +68,12 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - {self.outputs['return_type']} api_output{inplace_assign}; - + {return_type} api_output{inplace_assign}; {tensor_type}* kernel_out = dynamic_cast<{tensor_type}*>({set_out_func}(kernel_backend, &api_output, {kernel_tensor_out_type}));""" elif len(output_type_list) > 1: output_create = f""" - {self.outputs['return_type']} api_output;""" + {return_type} api_output;""" for i in range(len(output_type_list)): kernel_output = kernel_output + f'kernel_out_{i}, ' @@ -194,7 +194,7 @@ def gen_string_tensor_kernel_code(self, inplace_flag=False, code_indent=""): {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gene_kernel_select(self) -> str: api = self.api @@ 
-264,7 +264,7 @@ def gene_kernel_select(self) -> str: def gene_base_api_code(self, inplace_flag=False): api_func_name = self.get_api_func_name() return f""" -PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ {self.gene_kernel_select()} {self.gen_string_tensor_kernel_code(inplace_flag)} }} diff --git a/python/paddle/utils/code_gen/templates/ks.c.j2 b/python/paddle/utils/code_gen/templates/ks.c.j2 new file mode 100644 index 0000000000000..1848513b878e5 --- /dev/null +++ b/python/paddle/utils/code_gen/templates/ks.c.j2 @@ -0,0 +1,27 @@ +{% from "operator_utils.c.j2" import name_map, register_name_map %} +// this file is generated by python/paddle/utils/code_gen/generate_op.py, do not edit. +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +using paddle::framework::GradVarName; + +{% for api in apis %} + {% if api is base_api %} +{{name_map(api)}} + {% endif %} +{% endfor %} +{% for api in backward_apis %} + {% if api is base_api %} +{{name_map(api)}} + {% endif %} +{% endfor %} +} // namespace phi + +{% for api in apis + backward_apis %} + {% if api is base_api %} +{{register_name_map(api)}} + {% endif %} +{% endfor %} diff --git a/python/paddle/utils/code_gen/templates/op.c.j2 b/python/paddle/utils/code_gen/templates/op.c.j2 new file mode 100644 index 0000000000000..d4fd293ae460a --- /dev/null +++ b/python/paddle/utils/code_gen/templates/op.c.j2 @@ -0,0 +1,45 @@ +{% from "operator_utils.c.j2" import op_maker, backward_op_maker, operator, register_op_with_components %} +// this file is generated by python/paddle/utils/code_gen/generate_op.py, do not edit. 
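An aside on how these .c.j2 templates are typically driven: the header comment points at generate_op.py, and rendering them amounts to a Jinja2 Environment plus a handful of custom filters and tests. The sketch below is an assumption about that driver, not its actual code; the Environment options and the toy to_pascal_case / base_api registrations only mirror names the templates reference, and the real driver must also register the other filters they use (to_opmaker_name, to_op_attr_type, and so on).

from jinja2 import Environment, FileSystemLoader

env = Environment(
    loader=FileSystemLoader("python/paddle/utils/code_gen/templates"),
    keep_trailing_newline=True,
    trim_blocks=True,
    lstrip_blocks=True)
# toy stand-ins for the filters/tests the templates expect
env.filters["to_pascal_case"] = lambda s: "".join(w.capitalize() for w in s.split("_"))
env.tests["base_api"] = lambda api: "kernel" in api and "infer_meta" in api

apis, backward_apis = [], []  # normally the canonical dicts produced by parse_utils
sig_cc = env.get_template("ks.c.j2").render(apis=apis, backward_apis=backward_apis)
op_cc = env.get_template("op.c.j2").render(
    apis=apis, backward_apis=backward_apis, api_dict={})
print(sig_cc)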
+#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/nullary.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/infermeta/backward.cc" + +namespace paddle { +namespace operators { + +using paddle::framework::GradVarName; + +{% for api in apis %} + {% if api is base_api %} + +{{op_maker(api)}} + +{{operator(api)}} + {% endif %} +{% endfor %} + +{% for api in backward_apis %} + {% if api is base_api %} + +{{backward_op_maker(api, api_dict[api["forward"]["name"]])}} + +{{operator(api)}} + {% endif %} +{% endfor %} +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +{% for api in apis + backward_apis %} +{% if api is base_api %} +{{register_op_with_components(api)}} +{% endif %} +{% endfor %} diff --git a/python/paddle/utils/code_gen/templates/operator_utils.c.j2 b/python/paddle/utils/code_gen/templates/operator_utils.c.j2 new file mode 100644 index 0000000000000..2771833d5a335 --- /dev/null +++ b/python/paddle/utils/code_gen/templates/operator_utils.c.j2 @@ -0,0 +1,292 @@ +{# ----------------------------- op maker ----------------------------------- #} +{% macro op_maker(api) %} + {% set api_name = api["name"] %} +class {{api_name | to_pascal_case}}OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + {% filter indent(4, True) %} + {% for input in api["inputs"] %} + {% if input["name"] in api["kernel"]["param"] %} +{{add_input(loop.index0, input, api_name)}}; + {% endif %} + {% endfor %} + {% for output in api["outputs"] %} +{{add_output(loop.index0, output, api_name)}}; + {% endfor %} + {% for attr in api["attrs"] %} + {% if attr["name"] in api["kernel"]["param"] %} +{{add_attr(loop.index0, attr, api_name)}}; + {% endif %} + {% endfor %} + {% endfilter %} + AddComment(R"DOC( +TODO: Documentation of {{api_name}} op. 
+)DOC"); + } +}; +{% endmacro %} + + +{# add input, it could be duplicable or dispensable #} +{% macro add_input(i, input, op_name) %}{# inline #} + {% set name = input["name"] %} + {% set typename = input["typename"] %} +AddInput({{name| to_opmaker_name}}, "({{typename}}), input {{i}} of {{op_name}} op.") + {%- if typename is vec +%} + .AsDuplicable() + {%- endif %} + {%- if input["optional"] +%} + .AsDispensable() + {%- endif %} +{%- endmacro %} + +{# add output, it could be duplicable or intermediate, however, optional output is not supported #} +{% macro add_output(i, output, op_name) %}{# inline #} + {% set name = output["name"] %} + {% set typename = output["typename"] %} + {% set is_intermediate = output["intermediate"] %} +AddOutput({{name | to_opmaker_name}}, "({{typename}}), output {{i}} of {{op_name}} op.") + {%- if typename is vec +%} + .AsDuplicable() + {%- endif %} + {%- if is_intermediate +%} + .AsIntermediate() + {%- endif %} +{%- endmacro %} + +{# add attribute, and process default value if needed #} +{% macro add_attr(i, attr, op_name) %}{# inline #} + {% set name = attr["name"] %} + {% set typename = attr["typename"] %} + {% if typename is scalar %} +AddInput("{{name | to_pascal_case}}Tensor", "attribute {{i}} for {{op_name}} op from 0D Tensor.") + .AsDispensable(); + {% elif typename == "IntArray" %}{# the type has been renamed #} +AddInput("{{name | to_pascal_case}}Tensor", "attribute {{i}} for {{op_name}} op from 1D integer Tensor.") + .AsDispensable(); +AddInput("{{name | to_pascal_case}}TensorList", "attribute {{i}} for {{op_name}} op from list fo 0D integer Tensors.") + .AsDuplicable() + .AsDispensable(); + {% endif %} +AddAttr<{{typename | to_op_attr_type}}>("{{name}}", "({{typename | to_op_attr_type}}), attribute {{i}} for {{op_name}} op.") + {%- if "default_value" in attr +%} + .SetDefault({{process_default_value(attr)}}) + {%- endif %} +{%- endmacro %} + +{# process default value for attributes, some attribute has different types and different default values in api & opmaker #} +{% macro process_default_value(attr) %}{# inline #} + {% set default_value = attr["default_value"] %} + {% set typename = attr["typename"] %} + {% if typename == "DataType" %}{# convert back to VarType #} +static_cast(framework::TransToProtoVarType(experimental::{{default_value}})) + {%- elif typename == "DataLayout" %} {# does DataLayout need any processing?#} +static_cast(experimental::{{default_value}}) + {%- elif typename == "Place" %}{# construct a Place to get the type #} +static_cast(phi::Place({{"phi::" if not default_value is initializer_list}}{{default_value}}).GetType()) + {%- else %}{# pass through as-is #} +{{default_value}} + {%- endif %} +{%- endmacro %} + + +{# --------------------------------------- name mapping ---------------------------------------------- #} +{% macro name_map(api) %} +KernelSignature {{api["name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { + {% set kernel_args = api["kernel"]["param"] %} + {{get_input_list(api["inputs"], kernel_args)}}; + paddle::small_vector attrs; + {% for attr in api["attrs"]%} + {% filter indent(2)%} + {{get_an_attr(attr)}}; + {% endfilter %} + {% endfor %} + {{get_output_list(api["outputs"], kernel_args)}}; + return KernelSignature("{{api["name"]}}", std::move(inputs), std::move(attrs), std::move(outputs)); +} +{% endmacro %} + + +{% macro register_name_map(api) %} +PD_REGISTER_ARG_MAPPING_FN({{api["name"]}}, phi::{{api["name"] | to_pascal_case}}OpArgumentMapping); +{%- endmacro %} + +{% macro 
get_input_list(inputs, kernel_args) %}{# inline #} +paddle::small_vector inputs { +{%- for input in inputs %} +{%- if input["name"] in kernel_args %} +{{input["name"] | to_opmaker_name_cstr}}{{", " if not loop.last}} +{%- endif %} +{%- endfor %} +} +{%- endmacro %} + +{% macro get_an_attr(attr) %}{# inline #} +{% set typename = attr["typename"] %} +{% set name = attr["name"] %} +{% if typename is scalar %}{# scalar correspond to a dispensable input and an attr in opmaker #} +attrs.emplace_back( + ctx.HasInput("{{name | to_pascal_case}}") + ? "{{name | to_pascal_case}}Tensor" + : "{{name}}" +) +{%- elif typename == "IntArray" %} +attrs.emplace_back( + ctx.HasInput("{{name | to_pascal_case}}Tensor") + ? "{{name | to_pascal_case}}Tensor" + : ctx.InputSize("{{name | to_pascal_case}}TensorList") > 0 + ? "{{name | to_pascal_case}}TensorList" + : "{{name}}" +) +{%- else %} +attrs.emplace_back("{{name}}") +{%- endif %} +{%- endmacro %} + +{% macro get_output_list(outputs, kernel_args) %}{# inline #} +paddle::small_vector outputs { +{%- for output in outputs %} +{{output["name"] | to_opmaker_name_cstr}}{{", " if not loop.last}} +{%- endfor %} +} +{%- endmacro %} + +{# --------------------------------------- operator ---------------------------------------------- #} +{% macro operator(api) %} +class {{api["name"] | to_pascal_case}}Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +}; + +{# infershape functor #} +DECLARE_INFER_SHAPE_FUNCTOR({{api["name"]}}, {{api["name"] | to_pascal_case}}InferShapeFunctor, + PD_INFER_META(phi::{{api["infer_meta"]["func"]}})); +{# inplace inferer #} +{% if api["inplace"] is not none %} + {% set inplace_map %} + {% for source, target in api["inplace"].items() %} +{{"{"}}{{source | to_opmaker_name}}, {{target | to_opmaker_name}}{{"}"}}{{", " if not loop.last}} + {%- endfor %} + {%- endset %} +DECLARE_INPLACE_OP_INFERER({{api["name"] | to_pascal_case}}InplaceInferer, + {{inplace_map}}); +{% endif %} + +{# no_need_buffer inferer #} +{% if api["no_need_buffer"] is not none %} +DECLARE_NO_NEED_BUFFER_VARS_INFERER({{api["name"] | to_pascal_case}}NoNeedBufferVarInferer, + {{api["no_need_buffer"] | map("to_opmaker_name") | join(", ")}}); +{% endif %} +{% endmacro%} + +{% macro register_op_with_components(api) %} +{% set name = api["name"] %} +REGISTER_OPERATOR({{name}}, ops::{{name | to_pascal_case}}Op, +{% if not "forward" in api %}{# it is a forward api #} + ops::{{name | to_pascal_case}}OpMaker, +{% endif %} +{% if "backward" in api and api["backward"] is not none %}{# backward #} + {% set backward_name = api["backward"] %} + ops::{{backward_name | to_pascal_case}}OpMaker, + ops::{{backward_name | to_pascal_case}}OpMaker, +{% endif %} +{% if api is supports_inplace %}{# inplace#} + ops::{{name | to_pascal_case}}InplaceInferer, +{% endif %} +{% if api is supports_no_need_buffer %}{# no_need_buffer #} + ops::{{name | to_pascal_case}}NoNeedBufferVarInferer, +{% endif %} + ops::{{name | to_pascal_case}}InferShapeFunctor); +{% endmacro %} + + +{# --------------------------------------- backward op maker ---------------------------------------------- #} +{% macro backward_op_maker(api, forward_api) %} + {% set name = api["name"] %} + {% set forward_input_names = api["forward"]["inputs"] | map(attribute="name") | list %} + {% set forward_output_names = api["forward"]["outputs"] | map(attribute="name") | list %} + {% set forward_attr_names = api["forward"]["attrs"] | map(attribute="name") | list %} + {% set 
forward_input_orig_names = forward_api["inputs"] | map(attribute="name") | list %} + {% set forward_output_orig_names = forward_api["outputs"] | map(attribute="name") | list %} + {% set forward_attr_orig_names = forward_api["attrs"] | map(attribute="name") | list %} +template +class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("{{name}}"); + + {% for input in api["inputs"] %} + grad_op->SetInput("{{input["name"] | to_pascal_case}}", this->{{extract_input_from_forward( + input["name"], + forward_input_names, + forward_output_names, + forward_input_orig_names, + forward_output_orig_names)}}); + {% endfor %} + + {% for output in api["outputs"] %} + grad_op->SetOutput("{{output["name"] | to_pascal_case}}", this->{{extract_output_from_forward( + output["name"], + forward_input_names, + forward_output_names, + forward_input_orig_names, + forward_output_orig_names)}}); + {% endfor %} + + {% for attr in api["attrs"] %} + {% set attr_name = attr["name"] %} + {% if attr_name in forward_attr_names %} + {% if attr["typename"] == "IntArray" %} + grad_op->SetInput("{{attr_name | to_pascal_case}}Tensor", this->Input("{{attr_name | to_pascal_case}}Tensor")); + grad_op->SetInput("{{attr_name | to_pascal_case}}TensorList", this->Input("{{attr_name | to_pascal_case}}TensorList")); + {% elif attr["typename"] == "Scalar" %} + grad_op->SetInput("{{attr_name | to_pascal_case}}Tensor", this->Input("{{attr_name | to_pascal_case}}Tensor")); + {% endif %} + grad_op->SetAttr("{{attr_name}}", this->GetAttr("{{forward_attr_orig_names[forward_attr_names.index(attr_name)]}}")); + {% else %}{# maybe something wrong: backward op has more attrs than the forward one#} + grad_op->AddAttr<{{attr["typename"] | to_op_attr_type}}>({{attr_name}}, "({{attr["typename"] | to_op_attr_type}}), exceptional attr {{attr_name}}"); + grad_op->SetAttr("{{attr_name}}", {{process_default_value(attr)}}); + {% endif %} + {% endfor %} + } +}; +{% endmacro %} + + +{% macro extract_input_from_forward(name, + input_names, output_names, + input_orig_names, output_orig_names) %}{# inline #} + {% if name in input_names %} + {% set name_in_forward_orig = input_orig_names[input_names.index(name)]%} +Input("{{name_in_forward_orig | to_pascal_case}}") + {%- elif name in output_names %} + {% set name_in_forward_orig = output_orig_names[output_names.index(name)]%} +Output("{{name | to_pascal_case}}") + {%- elif name.endswith("_grad") %}{# output grad#} + {% set name_in_forward = name.removesuffix("_grad") %} + {% if name_in_forward in output_names %} + {% set name_in_forward_orig = output_orig_names[output_names.index(name_in_forward)] %} +OutputGrad("{{name_in_forward_orig | to_pascal_case}}") + {%- endif %} + {%- endif %} +{%- endmacro %} + +{% macro extract_output_from_forward(name, input_names, output_names, + input_orig_names, output_orig_names) %}{# inline #} + {% if name.removesuffix("_grad") in input_names %} + {% set name_in_forward = name.removesuffix("_grad") %} + {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} +InputGrad("{{name.removesuffix("_grad") | to_pascal_case}}") + {%- elif (name | to_input_name) in input_names %} + {% set name_in_forward = name | to_input_name %} + {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} +InputGrad("{{name | to_input_name | to_pascal_case}}") + {%- endif %} +{%- 
endmacro %} + +{% macro extract_attr_from_forward(name, attr_names, attr_origin_names) %} +this->GetAttr("{{name}}") +{%- endmacro %} diff --git a/python/paddle/utils/code_gen/tests.py b/python/paddle/utils/code_gen/tests.py new file mode 100644 index 0000000000000..453578b5cbd8e --- /dev/null +++ b/python/paddle/utils/code_gen/tests.py @@ -0,0 +1,60 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from type_mapping import input_types_map, attr_types_map, output_type_map + + +# tests for typename +def is_input(s): + return s in input_types_map + + +def is_attr(s): + return s in attr_types_map + + +def is_output(s): + return s in output_type_map + + +def is_vec(s): + return s.endswith("[]") + + +def is_scalar(s): + return re.match(r"Scalar(\(\w+\))*", s) is not None + + +def is_initializer_list(s): + return s == "{}" + + +def is_base_api(api): + return "kernel" in api and "infer_meta" in api + + +def supports_selected_rows_kernel(api): + return is_base_api(api) and len(api["kernel"]["func"]) == 2 + + +def supports_inplace(api): + return "inplace_map" in api + + +def supports_no_need_buffer(api): + for input in api["inputs"]: + if input["no_need_buffer"]: + return True + return False diff --git a/python/paddle/utils/code_gen/type_mapping.py b/python/paddle/utils/code_gen/type_mapping.py new file mode 100644 index 0000000000000..ecbd1f494c2ee --- /dev/null +++ b/python/paddle/utils/code_gen/type_mapping.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
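Aside, not part of the diff: the predicates in tests.py above are thin lookups over the type tables defined next, so a quick sanity check of how they classify yaml typenames looks like this (run from python/paddle/utils/code_gen/ so that tests.py is importable):

from tests import is_attr, is_input, is_output, is_scalar, is_vec

assert is_input("Tensor") and is_input("Tensor[]")
assert is_output("Tensor") and not is_input("IntArray")
assert is_attr("Scalar(float)") and is_attr("int64_t[]")
assert is_vec("Tensor[]") and not is_vec("Tensor")
assert is_scalar("Scalar(int64_t)")  # the Scalar(...) family is matched by regex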
+
+# type mapping: types in yaml -> types in c++ API
+input_types_map = {
+    'Tensor': 'const Tensor&',
+    'Tensor[]': 'const std::vector<Tensor>&'
+}
+
+optional_input_types_map = {
+    'Tensor': 'const paddle::optional<Tensor>&',
+    'Tensor[]': 'const paddle::optional<std::vector<Tensor>>&',
+}
+
+attr_types_map = {
+    # special types
+    'IntArray': 'const IntArray&',
+    'Scalar': 'const Scalar&',
+    'Scalar(bool)': 'const Scalar&',
+    'Scalar(int)': 'const Scalar&',
+    'Scalar(int64_t)': 'const Scalar&',
+    'Scalar(float)': 'const Scalar&',
+    'Place': 'Place',
+    'DataLayout': 'DataLayout',
+    'DataType': 'DataType',
+    # scalar types
+    'bool': 'bool',
+    'int': 'int',
+    'int64_t': 'int64_t',
+    'float': 'float',
+    'double': 'double',
+    'str': 'const std::string&',
+    # vector types
+    'bool[]': 'const std::vector<bool>&',
+    'int[]': 'const std::vector<int>&',
+    'int64_t[]': 'const std::vector<int64_t>&',
+    'float[]': 'const std::vector<float>&',
+    'double[]': 'const std::vector<double>&',
+    'str[]': 'const std::vector<std::string>&',
+}
+
+opmaker_attr_types_map = {
+    # special types
+    'IntArray': 'std::vector<int64_t>',
+    'Scalar': 'float',
+    'Scalar(bool)': 'bool',
+    'Scalar(int)': 'int',
+    'Scalar(int64_t)': 'int64_t',
+    'Scalar(float)': 'float',
+    'Place': 'int',
+    'DataLayout': 'int',
+    'DataType': 'int',
+    # scalar types
+    'bool': 'bool',
+    'int': 'int',
+    'int64_t': 'int64_t',
+    'float': 'float',
+    'double': 'double',
+    'str': 'std::string',
+    # vector types
+    'bool[]': 'std::vector<bool>',
+    'int[]': 'std::vector<int>',
+    'int64_t[]': 'std::vector<int64_t>',
+    'float[]': 'std::vector<float>',
+    'double[]': 'std::vector<double>',
+    'str[]': 'std::vector<std::string>',
+}
+
+output_type_map = {'Tensor': 'Tensor', 'Tensor[]': 'std::vector<Tensor>'}
+
+#------------------------------ phi attr ------------------------------
+phi_attr_types_map = attr_types_map.copy()
+phi_attr_types_map.update({
+    'IntArray': 'const phi::IntArray&',
+    'Scalar': 'const phi::Scalar&'
+})
+
+#--------------------------- phi dense tensor ---------------------------
+# type mapping to phi, used in implementation
+dense_input_types_map = {
+    'Tensor': 'const phi::DenseTensor&',
+    'Tensor[]': 'const std::vector<phi::DenseTensor>&',
+}
+
+dense_optional_input_types_map = {
+    'Tensor': 'paddle::optional<const phi::DenseTensor&>',
+    'Tensor[]': 'paddle::optional<const std::vector<phi::DenseTensor>&>'
+}
+
+dense_output_types_map = {
+    'Tensor': 'phi::DenseTensor*',
+    'Tensor[]': 'std::vector<phi::DenseTensor*>'
+}
+
+#---------------------- phi selected rows------------------------------
+# type mapping to phi, used in implementation
+sr_input_types_map = {'Tensor': 'const phi::SelectedRows&', }
+
+sr_optional_input_types_map = {
+    'Tensor': 'paddle::optional<const phi::SelectedRows&>',
+}
+
+sr_output_types_map = {'Tensor': 'phi::SelectedRows*', }
diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
index dd077552b7962..c14d39e9842be 100644
--- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
+++ b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py
@@ -117,9 +117,13 @@ def api_namespace():
 def generate_wrapped_infermeta_and_register(api_yaml_path, header_file_path,
                                             source_file_path):
+    apis = []
+    for each_api_yaml in api_yaml_path:
+        with open(each_api_yaml, 'r') as f:
+            api_list = yaml.load(f, Loader=yaml.FullLoader)
+            if api_list:
+                apis.extend(api_list)
-    with open(api_yaml_path, 'r') as f:
-        apis = yaml.load(f, Loader=yaml.FullLoader)
 
     header_file = open(header_file_path, 'w')
     source_file = open(source_file_path, 'w')
@@ -159,6 +163,7 @@ def main():
    parser.add_argument(
        '--api_yaml_path',
        help='path to api yaml file',
+        nargs='+',
        default='python/paddle/utils/code_gen/api.yaml')
    parser.add_argument(
'--wrapped_infermeta_header_path', diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py index da7ae010c58f6..70aa1b833d648 100644 --- a/python/paddle/vision/models/mobilenetv3.py +++ b/python/paddle/vision/models/mobilenetv3.py @@ -39,7 +39,7 @@ class SqueezeExcitation(nn.Layer): """ This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). - Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3. + Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3. This code is based on the torchvision code with modifications. You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L127 Args: diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index e4dd4c797fef6..d45c652885b69 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -895,7 +895,10 @@ def decode_jpeg(x, mode='unchanged', name=None): Examples: .. code-block:: python + + # required: gpu import cv2 + import numpy as np import paddle fake_img = (np.random.random( diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py index b255e663e6876..5992a4f977411 100644 --- a/python/paddle/vision/transforms/__init__.py +++ b/python/paddle/vision/transforms/__init__.py @@ -28,7 +28,9 @@ from .transforms import ColorJitter # noqa: F401 from .transforms import RandomCrop # noqa: F401 from .transforms import Pad # noqa: F401 +from .transforms import RandomAffine # noqa: F401 from .transforms import RandomRotation # noqa: F401 +from .transforms import RandomPerspective # noqa: F401 from .transforms import Grayscale # noqa: F401 from .transforms import ToTensor # noqa: F401 from .transforms import RandomErasing # noqa: F401 @@ -37,7 +39,9 @@ from .functional import vflip # noqa: F401 from .functional import resize # noqa: F401 from .functional import pad # noqa: F401 +from .functional import affine # noqa: F401 from .functional import rotate # noqa: F401 +from .functional import perspective # noqa: F401 from .functional import to_grayscale # noqa: F401 from .functional import crop # noqa: F401 from .functional import center_crop # noqa: F401 @@ -64,7 +68,9 @@ 'ColorJitter', 'RandomCrop', 'Pad', + 'RandomAffine', 'RandomRotation', + 'RandomPerspective', 'Grayscale', 'ToTensor', 'RandomErasing', @@ -73,7 +79,9 @@ 'vflip', 'resize', 'pad', + 'affine', 'rotate', + 'perspective', 'to_grayscale', 'crop', 'center_crop', diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 5a8c2cc09f884..90fba1c4130e5 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -537,6 +537,166 @@ def adjust_hue(img, hue_factor): return F_t.adjust_hue(img, hue_factor) +def _get_affine_matrix(center, angle, translate, scale, shear): + # Affine matrix is : M = T * C * RotateScaleShear * C^-1 + # Ihe inverse one is : M^-1 = C * RotateScaleShear^-1 * C^-1 * T^-1 + rot = math.radians(angle) + sx = math.radians(shear[0]) + sy = math.radians(shear[1]) + + # Rotate and Shear without scaling + a = math.cos(rot - sy) / math.cos(sy) + b = -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot) + c = math.sin(rot - sy) / math.cos(sy) + d = -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot) + + # Center Translation + cx, cy = center + tx, ty = translate + 
+ # Inverted rotation matrix with scale and shear + # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1 + matrix = [d, -b, 0.0, -c, a, 0.0] + matrix = [x / scale for x in matrix] + # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1 + matrix[2] += matrix[0] * (-cx - tx) + matrix[1] * (-cy - ty) + matrix[5] += matrix[3] * (-cx - tx) + matrix[4] * (-cy - ty) + # Apply center translation: C * RSS^-1 * C^-1 * T^-1 + matrix[2] += cx + matrix[5] += cy + + return matrix + + +def affine(img, + angle, + translate, + scale, + shear, + interpolation="nearest", + fill=0, + center=None): + """Apply affine transformation on the image. + + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be affined. + angle (int|float): The angle of the random rotation in clockwise order. + translate (list[float]): Maximum absolute fraction for horizontal and vertical translations. + scale (float): Scale factor for the image, scale should be positive. + shear (list[float]): Shear angle values which are parallel to the x-axis and y-axis in clockwise order. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + center (2-tuple, optional): Optional center of rotation, (x, y). + Origin is the upper left corner. + Default is the center of the image. + + Returns: + PIL.Image|np.array|paddle.Tensor: Affine Transformed image. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.transforms import functional as F + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + affined_img = F.affine(fake_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + print(affined_img.shape) + """ + + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): + raise TypeError( + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if not isinstance(angle, (int, float)): + raise TypeError("Argument angle should be int or float") + + if not isinstance(translate, (list, tuple)): + raise TypeError("Argument translate should be a sequence") + + if len(translate) != 2: + raise ValueError("Argument translate should be a sequence of length 2") + + if scale <= 0.0: + raise ValueError("Argument scale should be positive") + + if not isinstance(shear, (numbers.Number, (list, tuple))): + raise TypeError( + "Shear should be either a single value or a sequence of two values") + + if not isinstance(interpolation, str): + raise TypeError("Argument interpolation should be a string") + + if isinstance(angle, int): + angle = float(angle) + + if isinstance(translate, tuple): + translate = list(translate) + + if isinstance(shear, numbers.Number): + shear = [shear, 0.0] + + if isinstance(shear, tuple): + shear = list(shear) + + if len(shear) == 1: + shear = [shear[0], shear[0]] + + if len(shear) != 2: + raise ValueError( + f"Shear should be a sequence containing two values. 
Got {shear}") + + if center is not None and not isinstance(center, (list, tuple)): + raise TypeError("Argument center should be a sequence") + + if _is_pil_image(img): + width, height = img.size + # center = (width * 0.5 + 0.5, height * 0.5 + 0.5) + # it is visually better to estimate the center without 0.5 offset + # otherwise image rotated by 90 degrees is shifted vs output image of F_t.affine + if center is None: + center = [width * 0.5, height * 0.5] + matrix = _get_affine_matrix(center, angle, translate, scale, shear) + return F_pil.affine(img, matrix, interpolation, fill) + + if _is_numpy_image(img): + # get affine_matrix in F_cv2.affine() using cv2's functions + width, height = img.shape[0:2] + # center = (width * 0.5 + 0.5, height * 0.5 + 0.5) + # it is visually better to estimate the center without 0.5 offset + # otherwise image rotated by 90 degrees is shifted vs output image of F_t.affine + if center is None: + center = (width * 0.5, height * 0.5) + return F_cv2.affine(img, angle, translate, scale, shear, interpolation, + fill, center) + + if _is_tensor_image(img): + center_f = [0.0, 0.0] + if center is not None: + height, width = img.shape[-1], img.shape[-2] + # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. + center_f = [ + 1.0 * (c - s * 0.5) for c, s in zip(center, [width, height]) + ] + translate_f = [1.0 * t for t in translate] + matrix = _get_affine_matrix(center_f, angle, translate_f, scale, shear) + return F_t.affine(img, matrix, interpolation, fill) + + def rotate(img, angle, interpolation="nearest", @@ -607,6 +767,95 @@ def rotate(img, return F_cv2.rotate(img, angle, interpolation, expand, center, fill) +def _get_perspective_coeffs(startpoints, endpoints): + """ + get coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + + In Perspective Transform each pixel (x, y) in the original image gets transformed as, + (x, y) -> ( (ax + by + c) / (gx + hy + 1), (dx + ey + f) / (gx + hy + 1) ) + + Args: + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. + + Returns: + output (list): octuple (a, b, c, d, e, f, g, h) for transforming each pixel. + """ + a_matrix = np.zeros((2 * len(startpoints), 8)) + + for i, (p1, p2) in enumerate(zip(endpoints, startpoints)): + a_matrix[2 * i, :] = [ + p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1] + ] + a_matrix[2 * i + 1, :] = [ + 0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1] + ] + + b_matrix = np.array(startpoints).reshape([8]) + res = np.linalg.lstsq(a_matrix, b_matrix)[0] + + output = list(res) + return output + + +def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0): + """Perform perspective transform of the given image. + + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be transformed. + startpoints (list of list of ints): List containing four lists of two integers corresponding to four corners + ``[top-left, top-right, bottom-right, bottom-left]`` of the original image. + endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners + ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. 
+ When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + + Returns: + PIL.Image|np.array|paddle.Tensor: transformed Image. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.transforms import functional as F + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + startpoints = [[0, 0], [33, 0], [33, 25], [0, 25]] + endpoints = [[3, 2], [32, 3], [30, 24], [2, 25]] + + perspectived_img = F.perspective(fake_img, startpoints, endpoints) + print(perspectived_img.shape) + + """ + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): + raise TypeError( + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + coeffs = _get_perspective_coeffs(startpoints, endpoints) + return F_pil.perspective(img, coeffs, interpolation, fill) + elif _is_tensor_image(img): + coeffs = _get_perspective_coeffs(startpoints, endpoints) + return F_t.perspective(img, coeffs, interpolation, fill) + else: + return F_cv2.perspective(img, startpoints, endpoints, interpolation, + fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. @@ -714,9 +963,33 @@ def erase(img, i, j, h, w, v, inplace=False): import paddle - fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) + fake_img = paddle.randn((3, 2, 4)).astype(paddle.float32) + print(fake_img) + + #Tensor(shape=[3, 2, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[ 0.02169025, -0.97859967, -1.39175487, -1.07478464], + # [ 0.20654772, 1.74624777, 0.32268861, -0.13857445]], + # + # [[-0.14993843, 1.10793507, -0.40056887, -1.94395220], + # [ 0.41686651, 0.44551995, -0.09356714, -0.60898107]], + # + # [[-0.24998808, -1.47699273, -0.88838995, 0.42629015], + # [ 0.56948012, -0.96200180, 0.53355658, 3.20450878]]]) + values = paddle.zeros((1,1,1), dtype=paddle.float32) - result = paddle.vision.transforms.erase(fake_img, 4, 4, 3, 3, values) + result = paddle.vision.transforms.erase(fake_img, 0, 1, 1, 2, values) + + print(result) + + #Tensor(shape=[3, 2, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[ 0.02169025, 0. , 0. , -1.07478464], + # [ 0.20654772, 1.74624777, 0.32268861, -0.13857445]], + # + # [[-0.14993843, 0. , 0. , -1.94395220], + # [ 0.41686651, 0.44551995, -0.09356714, -0.60898107]], + # + # [[-0.24998808, 0. , 0. , 0.42629015], + # [ 0.56948012, -0.96200180, 0.53355658, 3.20450878]]]) """ if _is_tensor_image(img): diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index 8343a8c340ffb..1b2485541c499 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -411,6 +411,86 @@ def adjust_hue(img, hue_factor): return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) +def affine(img, + angle, + translate, + scale, + shear, + interpolation='nearest', + fill=0, + center=None): + """Affine the image by matrix. + + Args: + img (PIL.Image): Image to be affined. 
+ translate (sequence or int): horizontal and vertical translations + scale (float): overall scale ratio + shear (sequence or float): shear angle value in degrees between -180 to 180, clockwise direction. + If a sequence is specified, the first value corresponds to a shear parallel to the x axis, while + the second value corresponds to a shear parallel to the y axis. + interpolation (int|str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to cv2.INTER_NEAREST. + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the affined image. + If int, it is used for all channels respectively. + center (sequence, optional): Optional center of rotation. Origin is the upper left corner. + Default is the center of the image. + + Returns: + np.array: Affined image. + + """ + cv2 = try_import('cv2') + _cv2_interp_from_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 + } + + h, w = img.shape[0:2] + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + if center is None: + center = (w / 2.0, h / 2.0) + + M = np.ones([2, 3]) + # Rotate and Scale + R = cv2.getRotationMatrix2D(angle=angle, center=center, scale=scale) + + # Shear + sx = math.tan(shear[0] * math.pi / 180) + sy = math.tan(shear[1] * math.pi / 180) + M[0] = R[0] + sy * R[1] + M[1] = R[1] + sx * R[0] + + # Translation + tx, ty = translate + M[0, 2] = tx + M[1, 2] = ty + + if len(img.shape) == 3 and img.shape[2] == 1: + return cv2.warpAffine( + img, + M, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill)[:, :, np.newaxis] + else: + return cv2.warpAffine( + img, + M, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill) + + def rotate(img, angle, interpolation='nearest', @@ -509,6 +589,56 @@ def transform(x, y, matrix): borderValue=fill) +def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0): + """Perspective the image. + + Args: + img (np.array): Image to be perspectived. + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. + interpolation (int|str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to cv2.INTER_NEAREST. + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + np.array: Perspectived image. 
+ + """ + cv2 = try_import('cv2') + _cv2_interp_from_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 + } + h, w = img.shape[0:2] + + startpoints = np.array(startpoints, dtype="float32") + endpoints = np.array(endpoints, dtype="float32") + matrix = cv2.getPerspectiveTransform(startpoints, endpoints) + + if len(img.shape) == 3 and img.shape[2] == 1: + return cv2.warpPerspective( + img, + matrix, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill)[:, :, np.newaxis] + else: + return cv2.warpPerspective( + img, + matrix, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index 71f7759f11b66..4b86e14039ebe 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -410,6 +410,32 @@ def adjust_hue(img, hue_factor): return img +def affine(img, matrix, interpolation="nearest", fill=0): + """Affine the image by matrix. + + Args: + img (PIL.Image): Image to be affined. + matrix (float or int): Affine matrix. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the affined image. + If int, it is used for all channels respectively. + + Returns: + PIL.Image: Affined image. + + """ + if isinstance(fill, int): + fill = tuple([fill] * 3) + + return img.transform(img.size, Image.AFFINE, matrix, + _pil_interp_from_str[interpolation], fill) + + def rotate(img, angle, interpolation="nearest", @@ -453,6 +479,33 @@ def rotate(img, fillcolor=fill) +def perspective(img, coeffs, interpolation="nearest", fill=0): + """Perspective the image. + + Args: + img (PIL.Image): Image to be perspectived. + coeffs (list[float]): coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + PIL.Image: Perspectived image. + + """ + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + return img.transform(img.size, Image.PERSPECTIVE, coeffs, + _pil_interp_from_str[interpolation], fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. 
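Note on the PIL backend above: `Image.transform` with `Image.AFFINE` maps every *output* pixel back into the input image using the six coefficients, which is why `_get_affine_matrix` composes the inverted transform C * RSS^-1 * C^-1 * T^-1 rather than the forward one. A minimal standalone sketch of that convention (Pillow only; the toy image and coefficients are illustrative and not part of this patch):

.. code-block:: python

    from PIL import Image

    # Toy 8x8 grayscale image with a single bright pixel at (2, 3).
    img = Image.new("L", (8, 8), color=0)
    img.putpixel((2, 3), 255)

    # Image.transform(size, Image.AFFINE, (a, b, c, d, e, f)) samples, for each
    # OUTPUT pixel (x, y), the INPUT location (a*x + b*y + c, d*x + e*y + f).
    # Shifting the content one pixel to the right therefore needs c = -1.
    coeffs = (1.0, 0.0, -1.0, 0.0, 1.0, 0.0)
    shifted = img.transform(img.size, Image.AFFINE, coeffs, Image.NEAREST)

    print(img.getpixel((2, 3)), shifted.getpixel((3, 3)))  # 255 255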
diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index 2e276883cd376..27f83029babaa 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -28,8 +28,9 @@ def _assert_image_tensor(img, data_format): if not isinstance( - img, paddle.Tensor) or img.ndim != 3 or not data_format.lower() in ( - 'chw', 'hwc'): + img, paddle.Tensor + ) or img.ndim < 3 or img.ndim > 4 or not data_format.lower() in ('chw', + 'hwc'): raise RuntimeError( 'not support [type={}, ndim={}, data_format={}] paddle image'. format(type(img), img.ndim, data_format)) @@ -226,8 +227,8 @@ def _affine_grid(theta, w, h, ow, oh): def _grid_transform(img, grid, mode, fill): if img.shape[0] > 1: - grid = grid.expand(img.shape[0], grid.shape[1], grid.shape[2], - grid.shape[3]) + grid = grid.expand( + shape=[img.shape[0], grid.shape[1], grid.shape[2], grid.shape[3]]) if fill is not None: dummy = paddle.ones( @@ -255,6 +256,51 @@ def _grid_transform(img, grid, mode, fill): return img +def affine(img, matrix, interpolation="nearest", fill=None, data_format='CHW'): + """Affine to the image by matrix. + + Args: + img (paddle.Tensor): Image to be rotated. + matrix (float or int): Affine matrix. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST . when use pil backend, + support method are as following: + - "nearest" + - "bilinear" + - "bicubic" + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + data_format (str, optional): Data format of img, should be 'HWC' or + 'CHW'. Default: 'CHW'. + + Returns: + paddle.Tensor: Affined image. + + """ + ndim = len(img.shape) + if ndim == 3: + img = img.unsqueeze(0) + + img = img if data_format.lower() == 'chw' else img.transpose((0, 3, 1, 2)) + + matrix = paddle.to_tensor(matrix, place=img.place) + matrix = matrix.reshape((1, 2, 3)) + shape = img.shape + + grid = _affine_grid( + matrix, w=shape[-1], h=shape[-2], ow=shape[-1], oh=shape[-2]) + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + out = _grid_transform(img, grid, mode=interpolation, fill=fill) + + out = out if data_format.lower() == 'chw' else out.transpose((0, 2, 3, 1)) + out = out.squeeze(0) if ndim == 3 else out + + return out + + def rotate(img, angle, interpolation='nearest', @@ -354,6 +400,72 @@ def rotate(img, return out.squeeze(0) +def _perspective_grid(img, coeffs, ow, oh, dtype): + theta1 = coeffs[:6].reshape([1, 2, 3]) + tmp = paddle.tile(coeffs[6:].reshape([1, 2]), repeat_times=[2, 1]) + dummy = paddle.ones((2, 1), dtype=dtype) + theta2 = paddle.concat((tmp, dummy), axis=1).unsqueeze(0) + + d = 0.5 + base_grid = paddle.ones((1, oh, ow, 3), dtype=dtype) + + x_grid = paddle.linspace(d, ow * 1.0 + d - 1.0, ow) + base_grid[..., 0] = x_grid + y_grid = paddle.linspace(d, oh * 1.0 + d - 1.0, oh).unsqueeze_(-1) + base_grid[..., 1] = y_grid + + scaled_theta1 = theta1.transpose( + (0, 2, 1)) / paddle.to_tensor([0.5 * ow, 0.5 * oh]) + output_grid1 = base_grid.reshape((1, oh * ow, 3)).bmm(scaled_theta1) + output_grid2 = base_grid.reshape( + (1, oh * ow, 3)).bmm(theta2.transpose((0, 2, 1))) + + output_grid = output_grid1 / output_grid2 - 1.0 + return output_grid.reshape((1, oh, ow, 2)) + + +def perspective(img, + coeffs, + interpolation="nearest", + fill=None, + data_format='CHW'): + """Perspective the image. 
+ + Args: + img (paddle.Tensor): Image to be rotated. + coeffs (list[float]): coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST. When use pil backend, + support method are as following: + - "nearest" + - "bilinear" + - "bicubic" + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + paddle.Tensor: Perspectived image. + + """ + + ndim = len(img.shape) + if ndim == 3: + img = img.unsqueeze(0) + + img = img if data_format.lower() == 'chw' else img.transpose((0, 3, 1, 2)) + ow, oh = img.shape[-1], img.shape[-2] + dtype = img.dtype if paddle.is_floating_point(img) else paddle.float32 + + coeffs = paddle.to_tensor(coeffs, place=img.place) + grid = _perspective_grid(img, coeffs, ow=ow, oh=oh, dtype=dtype) + out = _grid_transform(img, grid, mode=interpolation, fill=fill) + + out = out if data_format.lower() == 'chw' else out.transpose((0, 2, 3, 1)) + out = out.squeeze(0) if ndim == 3 else out + + return out + + def vflip(img, data_format='CHW'): """Vertically flips the given paddle tensor. diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 828a0d9b0936d..fea2efb1fb2b1 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -45,7 +45,14 @@ def _get_image_size(img): elif F._is_numpy_image(img): return img.shape[:2][::-1] elif F._is_tensor_image(img): - return img.shape[1:][::-1] # chw + if len(img.shape) == 3: + return img.shape[1:][::-1] # chw -> wh + elif len(img.shape) == 4: + return img.shape[2:][::-1] # nchw -> wh + else: + raise ValueError( + "The dim for input Tensor should be 3-D or 4-D, but received {}". + format(len(img.shape))) else: raise TypeError("Unexpected type {}".format(type(img))) @@ -1205,6 +1212,189 @@ def _apply_image(self, img): return F.pad(img, self.padding, self.fill, self.padding_mode) +def _check_sequence_input(x, name, req_sizes): + msg = req_sizes[0] if len(req_sizes) < 2 else " or ".join( + [str(s) for s in req_sizes]) + if not isinstance(x, Sequence): + raise TypeError(f"{name} should be a sequence of length {msg}.") + if len(x) not in req_sizes: + raise ValueError(f"{name} should be sequence of length {msg}.") + + +def _setup_angle(x, name, req_sizes=(2, )): + if isinstance(x, numbers.Number): + if x < 0: + raise ValueError( + f"If {name} is a single number, it must be positive.") + x = [-x, x] + else: + _check_sequence_input(x, name, req_sizes) + + return [float(d) for d in x] + + +class RandomAffine(BaseTransform): + """Random affine transformation of the image. + + Args: + degrees (int|float|tuple): The angle interval of the random rotation. + If set as a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees) in clockwise order. If set 0, will not rotate. + translate (tuple, optional): Maximum absolute fraction for horizontal and vertical translations. + For example translate=(a, b), then horizontal shift is randomly sampled in the range -img_width * a < dx < img_width * a + and vertical shift is randomly sampled in the range -img_height * b < dy < img_height * b. + Default is None, will not translate. + scale (tuple, optional): Scaling factor interval, e.g (a, b), then scale is randomly sampled from the range a <= scale <= b. 
+ Default is None, will keep original scale and not scale. + shear (sequence or number, optional): Range of degrees to shear, ranges from -180 to 180 in clockwise order. + If set as a number, a shear parallel to the x axis in the range (-shear, +shear) will be applied. + Else if set as a sequence of 2 values a shear parallel to the x axis in the range (shear[0], shear[1]) will be applied. + Else if set as a sequence of 4 values, a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Default is None, will not apply shear. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + center (2-tuple, optional): Optional center of rotation, (x, y). + Origin is the upper left corner. + Default is the center of the image. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + + Shape: + - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). + - output(PIL.Image|np.ndarray|Paddle.Tensor): An affined image. + + Returns: + A callable object of RandomAffine. + + Examples: + + .. code-block:: python + + import paddle + from paddle.vision.transforms import RandomAffine + + transform = RandomAffine([-90, 90], translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[-10, 10]) + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, + degrees, + translate=None, + scale=None, + shear=None, + interpolation='nearest', + fill=0, + center=None, + keys=None): + self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2, )) + + super(RandomAffine, self).__init__(keys) + assert interpolation in ['nearest', 'bilinear', 'bicubic'] + self.interpolation = interpolation + + if translate is not None: + _check_sequence_input(translate, "translate", req_sizes=(2, )) + for t in translate: + if not (0.0 <= t <= 1.0): + raise ValueError( + "translation values should be between 0 and 1") + self.translate = translate + + if scale is not None: + _check_sequence_input(scale, "scale", req_sizes=(2, )) + for s in scale: + if s <= 0: + raise ValueError("scale values should be positive") + self.scale = scale + + if shear is not None: + self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4)) + else: + self.shear = shear + + if fill is None: + fill = 0 + elif not isinstance(fill, (Sequence, numbers.Number)): + raise TypeError("Fill should be either a sequence or a number.") + self.fill = fill + + if center is not None: + _check_sequence_input(center, "center", req_sizes=(2, )) + self.center = center + + def _get_param(self, + img_size, + degrees, + translate=None, + scale_ranges=None, + shears=None): + """Get parameters for affine transformation + + Returns: + params to be passed to the affine transformation + """ + angle = random.uniform(degrees[0], degrees[1]) + + if translate is not None: + max_dx = float(translate[0] * img_size[0]) + max_dy = 
float(translate[1] * img_size[1]) + tx = int(random.uniform(-max_dx, max_dx)) + ty = int(random.uniform(-max_dy, max_dy)) + translations = (tx, ty) + else: + translations = (0, 0) + + if scale_ranges is not None: + scale = random.uniform(scale_ranges[0], scale_ranges[1]) + else: + scale = 1.0 + + shear_x, shear_y = 0.0, 0.0 + if shears is not None: + shear_x = random.uniform(shears[0], shears[1]) + if len(shears) == 4: + shear_y = random.uniform(shears[2], shears[3]) + shear = (shear_x, shear_y) + + return angle, translations, scale, shear + + def _apply_image(self, img): + """ + Args: + img (PIL.Image|np.array): Image to be affine transformed. + + Returns: + PIL.Image or np.array: Affine transformed image. + """ + + w, h = _get_image_size(img) + img_size = [w, h] + + ret = self._get_param(img_size, self.degrees, self.translate, + self.scale, self.shear) + + return F.affine( + img, + *ret, + interpolation=self.interpolation, + fill=self.fill, + center=self.center) + + class RandomRotation(BaseTransform): """Rotates the image by angle. @@ -1298,6 +1488,125 @@ def _apply_image(self, img): self.center, self.fill) +class RandomPerspective(BaseTransform): + """Random perspective transformation with a given probability. + + Args: + prob (float, optional): Probability of using transformation, ranges from + 0 to 1, default is 0.5. + distortion_scale (float, optional): Degree of distortion, ranges from + 0 to 1, default is 0.5. + interpolation (str, optional): Interpolation method. If omitted, or if + the image has only one channel, it is set to PIL.Image.NEAREST or + cv2.INTER_NEAREST. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + + Shape: + - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). + - output(PIL.Image|np.ndarray|Paddle.Tensor): A perspectived image. + + Returns: + A callable object of RandomPerspective. + + Examples: + + .. code-block:: python + + import paddle + from paddle.vision.transforms import RandomPerspective + + transform = RandomPerspective(prob=1.0, distortion_scale=0.9) + + fake_img = paddle.randn((3, 200, 150)).astype(paddle.float32) + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, + prob=0.5, + distortion_scale=0.5, + interpolation='nearest', + fill=0, + keys=None): + super(RandomPerspective, self).__init__(keys) + assert 0 <= prob <= 1, "probability must be between 0 and 1" + assert 0 <= distortion_scale <= 1, "distortion_scale must be between 0 and 1" + assert interpolation in ['nearest', 'bilinear', 'bicubic'] + assert isinstance(fill, (numbers.Number, str, list, tuple)) + + self.prob = prob + self.distortion_scale = distortion_scale + self.interpolation = interpolation + self.fill = fill + + def get_params(self, width, height, distortion_scale): + """ + Returns: + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. 
+ """ + half_height = height // 2 + half_width = width // 2 + topleft = [ + int(random.uniform(0, int(distortion_scale * half_width) + 1)), + int(random.uniform(0, int(distortion_scale * half_height) + 1)), + ] + topright = [ + int( + random.uniform(width - int(distortion_scale * half_width) - 1, + width)), + int(random.uniform(0, int(distortion_scale * half_height) + 1)), + ] + botright = [ + int( + random.uniform(width - int(distortion_scale * half_width) - 1, + width)), + int( + random.uniform(height - int(distortion_scale * half_height) - 1, + height)), + ] + botleft = [ + int(random.uniform(0, int(distortion_scale * half_width) + 1)), + int( + random.uniform(height - int(distortion_scale * half_height) - 1, + height)), + ] + startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], + [0, height - 1]] + endpoints = [topleft, topright, botright, botleft] + + return startpoints, endpoints + + def _apply_image(self, img): + """ + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be Perspectively transformed. + + Returns: + PIL.Image|np.array|paddle.Tensor: Perspectively transformed image. + """ + + width, height = _get_image_size(img) + + if random.random() < self.prob: + startpoints, endpoints = self.get_params(width, height, + self.distortion_scale) + return F.perspective(img, startpoints, endpoints, + self.interpolation, self.fill) + return img + + class Grayscale(BaseTransform): """Converts image to grayscale. @@ -1377,7 +1686,9 @@ class RandomErasing(BaseTransform): fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) transform = paddle.vision.transforms.RandomErasing() - result = transform(fake_img) + result = transform(fake_img) + + print(result) """ def __init__(self, diff --git a/python/requirements.txt b/python/requirements.txt index e7fc6cd651cb0..74f2c2b9401aa 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,9 +1,9 @@ requests>=2.20.0 numpy>=1.13 -protobuf>=3.1.0 +protobuf>=3.1.0, <=3.20.0 Pillow six decorator astor -paddle_bfloat==0.1.2 +paddle_bfloat==0.1.7 opt_einsum==3.3.0 diff --git a/python/setup.py.in b/python/setup.py.in index 4cf8bc3fc6a2e..2a0d745729aab 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -281,6 +281,7 @@ packages=['paddle', 'paddle.incubate.tensor', 'paddle.incubate.multiprocessing', 'paddle.incubate.nn', + 'paddle.incubate.asp', 'paddle.incubate.passes', 'paddle.distribution', 'paddle.distributed.sharding', @@ -368,6 +369,7 @@ packages=['paddle', 'paddle.incubate.nn.functional', 'paddle.incubate.nn.layer', 'paddle.incubate.optimizer.functional', + 'paddle.incubate.autograd', 'paddle.incubate.distributed', 'paddle.incubate.distributed.models', 'paddle.incubate.distributed.models.moe', diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index ffc4fde7c27d1..ea82c46b95c5e 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -1,3 +1,4 @@ +pynacl==1.4.0 ; python_version == "3.6" PyGithub coverage==5.5 pycrypto ; platform_system != "Windows" @@ -7,8 +8,9 @@ pygame==2.1.0 hypothesis opencv-python<=4.2.0.32 visualdl -paddle2onnx>=0.8.2 -scipy>=1.6 +paddle2onnx>=0.9.6 +scipy>=1.6; python_version >= "3.7" +scipy>=1.5; python_version == "3.6" prettytable distro -numpy>=1.20,<1.22 +numpy>=1.20,<1.22; python_version >= "3.7" diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 45d4731ba1dba..dd864d9ed0ddc 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -103,11 +103,11 @@ fi if [ 
-n "${echo_list}" ];then - echo "****************" + echo "**************************************************************" echo "Please find RD for approval first, and then find TPM for approval." echo -e "${echo_list[@]}" echo "There are ${failed_num} approved errors." - echo "****************" + echo "**************************************************************" # L40 L48 L62 has fetch the result out, but there are splitted. if [ "${api_spec_diff}" != "" -o "${api_doc_spec_diff}" != "" ] ; then diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index b0800a9cd845e..6e086d9d7ca58 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -360,6 +360,23 @@ if [ "${OP_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then fi fi +CMAKE_FILE_CHANGED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -E "\.cmake|CMakeLists\.txt" || true` +if [ "${CMAKE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + ERROR_LINES="" + for CMAKE_FILE in ${CMAKE_FILE_CHANGED}; + do + CHECK_OBJECT_FLAGS=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKE_FILE} |grep "+" |grep -E "\-Wno\-error" || true` + if [ "${CHECK_OBJECT_FLAGS}" != "" ]; then + ERROR_LINES="${ERROR_LINES}\n${CMAKE_FILE}${CHECK_OBJECT_FLAGS}\n" + fi + done + if [ "${ERROR_LINES}" != "" ]; then + ERROR_LINES=${ERROR_LINES//+/'\n+\t'} + echo_line="Change compilation flag of warnings is not recommended. You must have one RD's (zhiqiu (Recommend), luotao1 or phlrain) approval to use these methods. " + check_approval 1 6888866 47554610 2002279 + fi +fi + NEW_OP_TEST_ADDED=`git diff --name-only --diff-filter=AMR upstream/$BRANCH |grep -oE "test_.*.\.py" || true` if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then CHECK_OUTPUT=`git diff -U5 --diff-filter=AMR upstream/$BRANCH |grep "self\.check_output(a*t*o*l*=*[0-9]"|grep "+" || true` @@ -391,6 +408,22 @@ if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then fi fi +if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + ERROR_LINES="" + for TEST_FILE in ${UNITTEST_FILE_CHANGED}; + do + ENABLE_LEGACY_DYGRAPH_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "_enable_legacy_dygraph" || true` + if [ "${ENABLE_LEGACY_DYGRAPH_CI}" != "" ]; then + ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${ENABLE_LEGACY_DYGRAPH_CI}\n" + fi + done + if [ "${ERROR_LINES}" != "" ]; then + ERROR_LINES=${ERROR_LINES//+/'\n+\t'} + echo_line="_enable_legacy_dygraph forces the mode to old dynamic graph. You must have one RD (pangyoki (Recommend), Aurelius84 or JiabinYang) approval for the usage (either add or delete) of _enable_legacy_dygraph. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Enable-Eager-Mode-in-Paddle-CI. 
The corresponding lines are as follows:\n${ERROR_LINES}\n" + check_approval 1 26408901 9301846 22361972 + fi +fi + RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|grep -E "CMakeLists.txt"||true` if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED}; diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu index 08536ae401fe1..8f1948de8a4dc 100644 --- a/tools/dockerfile/Dockerfile.ipu +++ b/tools/dockerfile/Dockerfile.ipu @@ -6,7 +6,7 @@ # run a container # docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash -FROM graphcore/poplar:2.3.0 +FROM graphcore/poplar:2.5.1 MAINTAINER PaddlePaddle Authors # ENV variables @@ -25,6 +25,7 @@ RUN apt-get update && apt-get install -y curl wget vim git unzip unrar tar xz-ut bison graphviz libjpeg-dev zlib1g zlib1g-dev automake locales swig net-tools libtool module-init-tools numactl libnuma-dev \ openssl libffi-dev pciutils libblas-dev gfortran libblas3 liblapack-dev liblapack3 default-jre screen tmux gdb lldb gcc g++ RUN apt-get update && apt-get install -y rdma-core librdmacm1 +RUN apt-get update && apt-get install libspdlog-dev # Downgrade gcc&&g++ WORKDIR /usr/bin diff --git a/tools/get_build_time.sh b/tools/get_build_time.sh index a89c024f97ea2..496c8c12d6ca3 100755 --- a/tools/get_build_time.sh +++ b/tools/get_build_time.sh @@ -14,5 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -CUR_ROOT=$(dirname "$0")/.. -/usr/bin/time -f '%C, %E elapsed, %U user, %S sys' "$@" >> $CUR_ROOT/build/build-time 2>&1 +CMAKE_BINARY_DIR=$1 +shift +start=$(date +%s.%N) +duration=$("/usr/bin/time" -f "%C, %E elapsed, %U user, %S sys" "$@" 2>&1) +end=$(date +%s.%N) + +echo ${duration}, 'start', $start, 'end', $end, 'process', $$ >> $CMAKE_BINARY_DIR/build-time diff --git a/tools/get_ut_mem_map.py b/tools/get_ut_mem_map.py index daf80597d3ad0..745d7f9a90c24 100644 --- a/tools/get_ut_mem_map.py +++ b/tools/get_ut_mem_map.py @@ -34,8 +34,8 @@ def get_ut_mem(rootPath): if '[Memory Usage (Byte)] gpu' in line: mem_reserved = round( float( - line.split('[max memory reserved] gpu')[1].split( - ':')[1].split('\\n')[0].strip()), 2) + line.split(' : Reserved = ')[1].split( + ', Allocated = ')[0]), 2) if mem_reserved > mem_reserved1: mem_reserved1 = mem_reserved if 'MAX_GPU_MEMORY_USE=' in line: diff --git a/tools/nvcc_lazy b/tools/nvcc_lazy new file mode 100755 index 0000000000000..9cb49b04ffaff --- /dev/null +++ b/tools/nvcc_lazy @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY + +# check nvcc version, if nvcc >= 11.6, just run nvcc itself +CUDA_VERSION=$(nvcc --version | grep -oP '(?<=cuda_)\d*\.\d*') +CUDA_VERSION_MAJOR=${CUDA_VERSION%.*} +CUDA_VERSION_MINOR=${CUDA_VERSION#*.} +if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 6) )); then + nvcc "$@" + exit +fi + +BUILDDIR=$(mktemp -d /tmp/nvcc-lazy-build.XXXXXXXX) +echo "$@" > ${BUILDDIR}/args +BUILDSH=${BUILDDIR}/build.sh +/usr/local/cuda/bin/nvcc --dryrun --keep --keep-dir=${BUILDDIR} "$@" 2>&1 | sed -e 's/#\$ //;/^rm/d' > $BUILDSH +sed -i -e '/^\s*--/d' $BUILDSH +sed -ne '1,/^cicc.*cudafe1.stub.c/p' ${BUILDSH} > ${BUILDSH}.pre +sed -e '1,/^cicc.*cudafe1.stub.c/d' ${BUILDSH} > ${BUILDSH}.post + +sed -i -e '/LIBRARIES=/{s/\s//g;s/""/ /g}' ${BUILDSH}.pre + +/usr/bin/env bash ${BUILDSH}.pre +STUBF=$(find $BUILDDIR -name *.cudafe1.stub.c) +CUFILE=$(basename -s '.cudafe1.stub.c' $STUBF) +sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' $STUBF +sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' $STUBF +# sed -i -e "/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\"CUDA_MODULE_LOADING\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\"===> ${CUFILE} lazy-load? %d\\\\n\", l); __do____cudaRegisterAll();}" $STUBF +sed -i -e "/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\"CUDA_MODULE_LOADING\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}" $STUBF +sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' $STUBF +sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' $STUBF +/usr/bin/env bash ${BUILDSH}.post +rm -rf $BUILDDIR diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 2d8692c5bc7e5..13005350d7bd5 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -339,8 +339,10 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): Please use '.. code-block:: python' to format the sample code.""") return [] else: - logger.warning("Error: No sample code!") - return [] + logger.error( + "Error: No sample code found! 
Please check whether the API comment contains the string 'Examples:'"
+        )
+        exit(1)
 
     sample_code_filenames = []
     for y, cb in enumerate(codeblocks):
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index aaa667595f94c..6067b40f0a7c1 100755
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -170,6 +170,7 @@
     'test_elementwise_div_op',
     'test_elementwise_floordiv_op',
     'test_elementwise_gradient_op',
+    'test_elementwise_heaviside_op',
     'test_elementwise_max_op',
     'test_elementwise_min_op',
     'test_elementwise_mod_op',
@@ -654,10 +655,12 @@
     'test_transpose_mkldnn_op',
     'test_mkldnn_conv_activation_fuse_pass',
     'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass',
+    'test_mkldnn_int8_scale_calculation_pass',
     'test_mkldnn_matmul_op_output_fuse_pass',
     'test_mkldnn_matmul_transpose_reshape_fuse_pass',
     'test_mkldnn_scale_matmul_fuse_pass',
     'test_mkldnn_inplace_fuse_pass',
+    'test_mkldnn_conv_affine_channel_fuse_pass',
     'test_batch_fc_op',
     'test_c_comm_init_all_op',
     'test_conv2d_fusion_op',
diff --git a/tools/test_runner.py b/tools/test_runner.py
index 7ceed18634a87..02d926914f904 100644
--- a/tools/test_runner.py
+++ b/tools/test_runner.py
@@ -32,6 +32,7 @@ def main():
     if core.is_compiled_with_cuda() or core.is_compiled_with_rocm():
         if (os.getenv('FLAGS_enable_gpu_memory_usage_log') == None):
             os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true'
+            os.environ['FLAGS_enable_gpu_memory_usage_log_mb'] = 'false'
     some_test_failed = False
     for module_name in sys.argv[1:]:
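The new `RandomAffine` and `RandomPerspective` transforms introduced above compose with the existing `paddle.vision.transforms` pipeline in the usual way. A short usage sketch based on the docstring examples in this diff (the tensor shape and parameter ranges are illustrative only):

.. code-block:: python

    import paddle
    from paddle.vision.transforms import Compose, RandomAffine, RandomPerspective

    # Parameters mirror the docstring examples in this diff; any valid ranges work.
    transform = Compose([
        RandomAffine(degrees=[-90, 90], translate=[0.2, 0.2],
                     scale=[0.5, 0.5], shear=[-10, 10]),
        RandomPerspective(prob=1.0, distortion_scale=0.9),
    ])

    fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32)
    out = transform(fake_img)
    print(out.shape)  # [3, 256, 300]; spatial size is preserved

Because both transforms accept PIL images, numpy arrays, and paddle tensors, the same pipeline also works on `PIL.Image` inputs loaded from disk.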